From 235cdcab222e49f203e4cf4d6c5377b6f2a7155a Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Oct 2024 12:09:42 +0200 Subject: [PATCH 01/93] Add cg_streaming enum class case. --- include/plssvm/solver_types.hpp | 2 ++ src/plssvm/solver_types.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/include/plssvm/solver_types.hpp b/include/plssvm/solver_types.hpp index 0dca4dad9..474db009e 100644 --- a/include/plssvm/solver_types.hpp +++ b/include/plssvm/solver_types.hpp @@ -32,6 +32,8 @@ enum class solver_type { automatic, /** Use the CG algorithm explicitly calculating the kernel matrix and fully storing it on the device. */ cg_explicit, + /** Use the CG algorithm explicitly calculating the kernel matrix and fully storing it on the host. Realized using unified shared memory. */ + cg_streaming, /** Use the CG algorithm implicitly recomputing the kernel matrix each CG iteration (smallest memory footprint). */ cg_implicit }; diff --git a/src/plssvm/solver_types.cpp b/src/plssvm/solver_types.cpp index c830728ec..82a70f589 100644 --- a/src/plssvm/solver_types.cpp +++ b/src/plssvm/solver_types.cpp @@ -23,6 +23,8 @@ std::ostream &operator<<(std::ostream &out, const solver_type solving) { return out << "automatic"; case solver_type::cg_explicit: return out << "cg_explicit"; + case solver_type::cg_streaming: + return out << "cg_streaming"; case solver_type::cg_implicit: return out << "cg_implicit"; } @@ -38,6 +40,8 @@ std::istream &operator>>(std::istream &in, solver_type &solving) { solving = solver_type::automatic; } else if (str == "cg_explicit") { solving = solver_type::cg_explicit; + } else if (str == "cg_streaming") { + solving = solver_type::cg_streaming; } else if (str == "cg_implicit") { solving = solver_type::cg_implicit; } else { From 0006b9ac8d3c9cc19041f860d2e31db8b5a6a1c9 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Oct 2024 12:15:14 +0200 Subject: [PATCH 02/93] Add device_ptr flag to enable shared/managed memory allocations. --- .../backends/CUDA/detail/device_ptr.cuh | 7 +++-- include/plssvm/backends/gpu_device_ptr.hpp | 31 ++++++++++++------- src/plssvm/backends/CUDA/detail/device_ptr.cu | 18 ++++++----- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/include/plssvm/backends/CUDA/detail/device_ptr.cuh b/include/plssvm/backends/CUDA/detail/device_ptr.cuh index de2d8546d..bb99e1ffe 100644 --- a/include/plssvm/backends/CUDA/detail/device_ptr.cuh +++ b/include/plssvm/backends/CUDA/detail/device_ptr.cuh @@ -32,6 +32,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr, - "Illegal real type provided! See the 'real_type_list' in the type_list.hpp header for a list of the allowed types."); + "Illegal real type provided! See the 'real_type_list' in the type_list.hpp header for a list of the allowed types."); public: /// The type of the values used in the device_ptr. @@ -57,14 +57,14 @@ class gpu_device_ptr { * @param[in] size the size of the managed memory * @param[in] queue the queue (or similar) to manage the device_ptr */ - gpu_device_ptr(size_type size, const queue_type queue); + gpu_device_ptr(size_type size, const queue_type queue, bool use_usm_allocations); /** * @brief Construct a device_ptr for the device managed by @p queue with the provided @p shape. * @details The managed memory size is: extents[0] * extents[1]. 
* @param[in] shape the 2D size of the managed memory; size = shape.x * shape.y * @param[in] queue the queue (or similar) to manage the device_ptr */ - gpu_device_ptr(plssvm::shape shape, const queue_type queue); + gpu_device_ptr(plssvm::shape shape, const queue_type queue, bool use_usm_allocations); /** * @brief Construct a device_ptr for the device managed by @p queue with the provided @p shape including @p padding. * @details The managed memory size is: (shape.x + padding.x) * (shape.y + padding.y). @@ -72,7 +72,7 @@ class gpu_device_ptr { * @param[in] padding the padding applied to the extents * @param[in] queue the queue (or similar) to manage the device_ptr */ - gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue); + gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue, bool use_usm_allocations); /** * @brief Delete copy-constructor to make device_ptr a move only type. @@ -368,31 +368,36 @@ class gpu_device_ptr { plssvm::shape padding_{}; /// The device pointer pointing to the managed memory. device_pointer_type data_{}; + /// If true, use USM allocations automatically migrating the data between host and device. + bool use_usm_allocations_{}; }; - template -gpu_device_ptr::gpu_device_ptr(const size_type size, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const size_type size, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, - shape_{ plssvm::shape{ size, 1 } } { } + shape_{ plssvm::shape{ size, 1 } }, + use_usm_allocations_{ use_usm_allocations } { } template -gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, - shape_{ shape } { } + shape_{ shape }, + use_usm_allocations_{ use_usm_allocations } { } template -gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, shape_{ shape }, - padding_{ padding } { } + padding_{ padding }, + use_usm_allocations_{ use_usm_allocations } { } template gpu_device_ptr::gpu_device_ptr(gpu_device_ptr &&other) noexcept : queue_{ std::exchange(other.queue_, queue_type{}) }, shape_{ std::exchange(other.shape_, plssvm::shape{}) }, padding_{ std::exchange(other.padding_, plssvm::shape{}) }, - data_{ std::exchange(other.data_, device_pointer_type{}) } { } + data_{ std::exchange(other.data_, device_pointer_type{}) }, + use_usm_allocations_{ std::exchange(other.use_usm_allocations_, false) } { } template auto gpu_device_ptr::operator=(gpu_device_ptr &&other) noexcept -> gpu_device_ptr & { @@ -402,6 +407,7 @@ auto gpu_device_ptr::opera shape_ = std::exchange(other.shape_, plssvm::shape{}); padding_ = std::exchange(other.padding_, plssvm::shape{}); data_ = std::exchange(other.data_, device_pointer_type{}); + use_usm_allocations_ = std::exchange(other.use_usm_allocations_, false); } return *this; } @@ -412,6 +418,7 @@ void gpu_device_ptr::swap( std::swap(shape_, other.shape_); std::swap(padding_, other.padding_); std::swap(data_, other.data_); + std::swap(use_usm_allocations_, other.use_usm_allocations_); } template diff --git a/src/plssvm/backends/CUDA/detail/device_ptr.cu b/src/plssvm/backends/CUDA/detail/device_ptr.cu index 5d7ba74bb..87d069409 100644 --- 
a/src/plssvm/backends/CUDA/detail/device_ptr.cu +++ b/src/plssvm/backends/CUDA/detail/device_ptr.cu @@ -25,21 +25,25 @@ namespace plssvm::cuda::detail { template -device_ptr::device_ptr(const size_type size, const queue_type device) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const size_type size, const queue_type device, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue_type device) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device) : - base_type{ shape, padding, device } { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device, const bool use_usm_allocations) : + base_type{ shape, padding, device, use_usm_allocations } { if (queue_ < 0 || queue_ >= static_cast(get_device_count())) { throw backend_exception{ fmt::format("Illegal device ID! Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); - PLSSVM_CUDA_ERROR_CHECK(cudaMalloc(&data_, this->size_padded() * sizeof(value_type))) + if (use_usm_allocations_) { + PLSSVM_CUDA_ERROR_CHECK(cudaMallocManaged(&data_, this->size_padded() * sizeof(value_type))) + } else { + PLSSVM_CUDA_ERROR_CHECK(cudaMalloc(&data_, this->size_padded() * sizeof(value_type))) + } this->memset(0); } From 3508e154af2c51a83884ee7b52fab5b43e1ce045 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Oct 2024 12:15:30 +0200 Subject: [PATCH 03/93] Allocate kernel matrix using shared memory for cg_streaming. 
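Together with the device_ptr flag from the previous patch, the streaming path only swaps the allocation routine used for the kernel matrix: cudaMallocManaged instead of cudaMalloc. A minimal standalone sketch of that choice (a hypothetical helper outside of PLSSVM, error checking via PLSSVM_CUDA_ERROR_CHECK omitted):

    #include <cuda_runtime.h>
    #include <cstddef>

    double *allocate_kernel_matrix(const std::size_t num_entries, const bool use_usm_allocations) {
        double *ptr = nullptr;
        if (use_usm_allocations) {
            // cg_streaming: managed (USM) memory that may exceed the device memory
            // and is migrated between host and device on demand
            cudaMallocManaged(&ptr, num_entries * sizeof(double));
        } else {
            // cg_explicit: plain device memory, must fit on the GPU
            cudaMalloc(&ptr, num_entries * sizeof(double));
        }
        cudaMemset(ptr, 0, num_entries * sizeof(double));
        return ptr;  // cudaFree releases both kinds of allocation
    }

Both branches return a raw device-usable pointer, which is why the remaining CUDA backend code can stay unchanged.
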
--- include/plssvm/backends/CUDA/csvm.hpp | 3 ++- include/plssvm/backends/gpu_csvm.hpp | 6 ++++-- include/plssvm/csvm.hpp | 2 ++ src/plssvm/backends/CUDA/csvm.cu | 7 +++++-- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/plssvm/backends/CUDA/csvm.hpp b/include/plssvm/backends/CUDA/csvm.hpp index 5e0eed30d..d18e3395a 100644 --- a/include/plssvm/backends/CUDA/csvm.hpp +++ b/include/plssvm/backends/CUDA/csvm.hpp @@ -22,6 +22,7 @@ #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES #include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/solver_types.hpp" // plssvm::solver_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::size_t @@ -152,7 +153,7 @@ class csvm : public ::plssvm::detail::gpu_csvm gpu_csvmrun_assemble_kernel_matrix_explicit(device_id, exec, params, data_d[device_id], q_red_d[device_id], QA_cost); + device_ptr_type kernel_matrix = this->run_assemble_kernel_matrix_explicit(device_id, exec, params, solver, data_d[device_id], q_red_d[device_id], QA_cost); kernel_matrices_parts[device_id] = ::plssvm::detail::move_only_any{ std::move(kernel_matrix) }; } break; @@ -385,6 +386,7 @@ void gpu_csvm::blas_level_3(const solver // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const auto &A_d = detail::move_only_any_cast(A[device_id]); PLSSVM_ASSERT(!A_d.empty(), "The A matrix must not be empty!"); diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp index 3e0ea2472..723c4bfd9 100644 --- a/include/plssvm/csvm.hpp +++ b/include/plssvm/csvm.hpp @@ -836,6 +836,8 @@ std::tuple, std::vector, std::vector failed_cg_explicit_constraints = check_sizes(total_memory_needed_explicit_per_device, usable_device_memory_per_device); failed_cg_explicit_constraints.empty()) { diff --git a/src/plssvm/backends/CUDA/csvm.cu b/src/plssvm/backends/CUDA/csvm.cu index 9eebc97e3..1e00f7edf 100644 --- a/src/plssvm/backends/CUDA/csvm.cu +++ b/src/plssvm/backends/CUDA/csvm.cu @@ -150,7 +150,7 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const { // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const solver_type solver, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -165,7 +165,10 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // only store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == 
solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, solver == solver_type::cg_streaming }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to CUDA's native dim3 From bf19526e30b095aa9705b75634fa0a8d964bd950 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 10:32:08 +0200 Subject: [PATCH 04/93] Use USM allocations in BLAS kernel and slightly change API. --- include/plssvm/backends/CUDA/csvm.hpp | 2 +- include/plssvm/backends/gpu_csvm.hpp | 11 ++++++----- src/plssvm/backends/CUDA/csvm.cu | 4 ++-- src/plssvm/backends/CUDA/detail/device_ptr.cu | 18 +++++++++++------- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/plssvm/backends/CUDA/csvm.hpp b/include/plssvm/backends/CUDA/csvm.hpp index d18e3395a..1cb1b8268 100644 --- a/include/plssvm/backends/CUDA/csvm.hpp +++ b/include/plssvm/backends/CUDA/csvm.hpp @@ -153,7 +153,7 @@ class csvm : public ::plssvm::detail::gpu_csvm gpu_csvmrun_assemble_kernel_matrix_explicit(device_id, exec, params, solver, data_d[device_id], q_red_d[device_id], QA_cost); + device_ptr_type kernel_matrix = this->run_assemble_kernel_matrix_explicit(device_id, exec, params, solver == solver_type::cg_streaming, data_d[device_id], q_red_d[device_id], QA_cost); kernel_matrices_parts[device_id] = ::plssvm::detail::move_only_any{ std::move(kernel_matrix) }; } break; @@ -331,7 +332,7 @@ void gpu_csvm::blas_level_3(const solver // the partial C result from a specific device later stored on device 0 to perform the C reduction (inplace matrix addition) device_ptr_type partial_C_d{}; if (num_devices > 1) { - partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0] }; + partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0], solver == solver_type::cg_streaming }; } // split memory allocation and memory copy! 
@@ -344,8 +345,8 @@ void gpu_csvm::blas_level_3(const solver const queue_type &device = devices_[device_id]; // allocate memory on the device - B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device }; - C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device }; + B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device, solver == solver_type::cg_streaming }; + C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device, solver == solver_type::cg_streaming }; } #pragma omp parallel for ordered if (num_devices > 1) diff --git a/src/plssvm/backends/CUDA/csvm.cu b/src/plssvm/backends/CUDA/csvm.cu index 1e00f7edf..4d93723a7 100644 --- a/src/plssvm/backends/CUDA/csvm.cu +++ b/src/plssvm/backends/CUDA/csvm.cu @@ -150,7 +150,7 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const { // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const solver_type solver, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -168,7 +168,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons // only store the upper triangular matrix // if solver == solver_type::cg_explicit: store it explicitly // if solver == solver_type::cg_streaming: store it using USM - device_ptr_type kernel_matrix_d{ num_entries_padded, device, solver == solver_type::cg_streaming }; + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to CUDA's native dim3 diff --git a/src/plssvm/backends/CUDA/detail/device_ptr.cu b/src/plssvm/backends/CUDA/detail/device_ptr.cu index 87d069409..a8aece30f 100644 --- a/src/plssvm/backends/CUDA/detail/device_ptr.cu +++ b/src/plssvm/backends/CUDA/detail/device_ptr.cu @@ -35,7 +35,7 @@ device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, co template device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device, const bool use_usm_allocations) : base_type{ shape, padding, device, use_usm_allocations } { - if (queue_ < 0 || queue_ >= static_cast(get_device_count())) { + if (queue_ < 0 || queue_ >= get_device_count()) { throw backend_exception{ fmt::format("Illegal device ID! Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); @@ -97,9 +97,11 @@ void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const s PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); - detail::set_device(queue_); - const size_type rcount = std::min(count, this->size_padded() - pos); - PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(data_ + pos, data_to_copy, rcount * sizeof(value_type), cudaMemcpyHostToDevice)) + if (!use_usm_allocations_) { + detail::set_device(queue_); + const size_type rcount = std::min(count, this->size_padded() - pos); + PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(data_ + pos, data_to_copy, rcount * sizeof(value_type), cudaMemcpyHostToDevice)) + } } template @@ -120,9 +122,11 @@ void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); - detail::set_device(queue_); - const size_type rcount = std::min(count, this->size_padded() - pos); - PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(buffer, data_ + pos, rcount * sizeof(value_type), cudaMemcpyDeviceToHost)) + if (!use_usm_allocations_) { + detail::set_device(queue_); + const size_type rcount = std::min(count, this->size_padded() - pos); + PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(buffer, data_ + pos, rcount * sizeof(value_type), cudaMemcpyDeviceToHost)) + } } template From e403c62707289a393cd2edcb94651581935a6dd4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 10:44:54 +0200 Subject: [PATCH 05/93] Remove USM related if in copy functions. --- src/plssvm/backends/CUDA/detail/device_ptr.cu | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/plssvm/backends/CUDA/detail/device_ptr.cu b/src/plssvm/backends/CUDA/detail/device_ptr.cu index a8aece30f..00f20f66e 100644 --- a/src/plssvm/backends/CUDA/detail/device_ptr.cu +++ b/src/plssvm/backends/CUDA/detail/device_ptr.cu @@ -97,11 +97,9 @@ void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const s PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); - if (!use_usm_allocations_) { - detail::set_device(queue_); - const size_type rcount = std::min(count, this->size_padded() - pos); - PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(data_ + pos, data_to_copy, rcount * sizeof(value_type), cudaMemcpyHostToDevice)) - } + detail::set_device(queue_); + const size_type rcount = std::min(count, this->size_padded() - pos); + PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(data_ + pos, data_to_copy, rcount * sizeof(value_type), cudaMemcpyHostToDevice)) } template @@ -122,11 +120,9 @@ void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); - if (!use_usm_allocations_) { - detail::set_device(queue_); - const size_type rcount = std::min(count, this->size_padded() - pos); - PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(buffer, data_ + pos, rcount * sizeof(value_type), cudaMemcpyDeviceToHost)) - } + detail::set_device(queue_); + const size_type rcount = std::min(count, this->size_padded() - pos); + PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(buffer, data_ + pos, rcount * sizeof(value_type), cudaMemcpyDeviceToHost)) } template From 766386070b9f8cb59b456fb9fb17fab9012a3c07 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 11:11:13 +0200 Subject: [PATCH 06/93] Use variable to specify whether USM allocations should be used. --- include/plssvm/backends/gpu_csvm.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/gpu_csvm.hpp b/include/plssvm/backends/gpu_csvm.hpp index 9ea248dc1..089d8cbb0 100644 --- a/include/plssvm/backends/gpu_csvm.hpp +++ b/include/plssvm/backends/gpu_csvm.hpp @@ -230,6 +230,7 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvmnum_available_devices(); const std::size_t num_rows_reduced = A.shape().x - 1; @@ -253,8 +254,8 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvm gpu_csvmrun_assemble_kernel_matrix_explicit(device_id, exec, params, solver == solver_type::cg_streaming, data_d[device_id], q_red_d[device_id], QA_cost); + device_ptr_type kernel_matrix = this->run_assemble_kernel_matrix_explicit(device_id, exec, params, use_usm_allocations, data_d[device_id], q_red_d[device_id], QA_cost); kernel_matrices_parts[device_id] = ::plssvm::detail::move_only_any{ std::move(kernel_matrix) }; } break; @@ -323,6 +324,7 @@ void gpu_csvm::blas_level_3(const solver PLSSVM_ASSERT(B.shape() == C.shape(), "The B ({}) and C ({}) matrices must have the same shape!", B.shape(), C.shape()); PLSSVM_ASSERT(B.padding() == C.padding(), "The B ({}) and C ({}) matrices must have the same padding!", B.padding(), C.padding()); + const bool use_usm_allocations = solver == solver_type::cg_streaming; const std::size_t num_devices = this->num_available_devices(); // the C and B matrices; completely stored on each device @@ -332,7 +334,7 @@ void gpu_csvm::blas_level_3(const solver // the partial C result from a specific device later stored on device 0 to perform the C reduction (inplace matrix addition) device_ptr_type partial_C_d{}; if (num_devices > 1) { - partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0], solver == solver_type::cg_streaming }; + partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0], use_usm_allocations }; } // split memory allocation and memory copy! 
@@ -345,8 +347,8 @@ void gpu_csvm::blas_level_3(const solver const queue_type &device = devices_[device_id]; // allocate memory on the device - B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device, solver == solver_type::cg_streaming }; - C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device, solver == solver_type::cg_streaming }; + B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device, use_usm_allocations }; + C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device, use_usm_allocations }; } #pragma omp parallel for ordered if (num_devices > 1) From cd6deeadd8741f7c340687b69f507e5108e5f866 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 11:48:25 +0200 Subject: [PATCH 07/93] Add solver_type::automatic handling for cg_streaming. --- include/plssvm/csvm.hpp | 99 +++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 43 deletions(-) diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp index 723c4bfd9..1e23aec9f 100644 --- a/include/plssvm/csvm.hpp +++ b/include/plssvm/csvm.hpp @@ -46,6 +46,7 @@ #include // std::size_t #include // std::numeric_limits::lowest #include // std::unique_ptr +#include // std::accumulate #include // std::optional, std::make_optional, std::nullopt #include // std::milli #include // std::tie @@ -791,6 +792,7 @@ std::tuple, std::vector, std::vectornum_available_devices() }; const std::vector total_memory_needed_explicit_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); + const detail::memory_size total_memory_needed_streaming = std::accumulate(total_memory_needed_explicit_per_device.cbegin(), total_memory_needed_explicit_per_device.cend(), detail::memory_size{}); const std::vector total_memory_needed_implicit_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); // format a vector differentiating between it containing only a single entry or multiple @@ -809,8 +811,9 @@ std::tuple, std::vector, std::vector(percentual_safety_margin * 100.0L), minimal_safety_margin, detail::tracking::tracking_entry{ "solver", "system_memory", total_system_memory }, @@ -818,10 +821,12 @@ std::tuple, std::vector, std::vector, std::vector, std::vector failed_cg_explicit_constraints = check_sizes(total_memory_needed_explicit_per_device, usable_device_memory_per_device); failed_cg_explicit_constraints.empty()) { @@ -846,55 +849,65 @@ std::tuple, std::vector, std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { + if (total_memory_needed_streaming <= usable_system_memory) { // use the implicit solver type - used_solver = solver_type::cg_implicit; + used_solver = solver_type::cg_streaming; } else { - // not enough device memory available for the implicit case - throw kernel_launch_resources{ fmt::format("Not enough device memory available on device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + detail::log(verbosity_level::full, "Cannot use cg_streaming due to memory constraints on the system memory!\n"); + + // check whether there is enough memory available for cg_implicit + if (const std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { + // use the implicit solver type + used_solver = 
solver_type::cg_implicit; + } else { + // not enough device memory available for the implicit case + throw kernel_launch_resources{ fmt::format("Not enough device memory available on device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + } } } // enforce max mem alloc size if requested #if defined(PLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE) - // get the maximum possible memory allocation size per device - const std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); + // not applicable for the streaming CG implementation using USM! + if (used_solver != solver_type::cg_streaming) { + // get the maximum possible memory allocation size per device + const std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); - // get the maximum single allocation size per device - const std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); - const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + // get the maximum single allocation size per device + const std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); - // output the maximum memory allocation size per device - detail::log(verbosity_level::full, - " - maximum supported single memory allocation size: {}\n" - " - maximum needed single memory allocation size (cg_explicit): {}\n" - " - maximum needed single memory allocation size (cg_implicit): {}\n", - format_vector(max_mem_alloc_size_per_device), - format_vector(max_single_allocation_cg_explicit_size_per_device), - format_vector(max_single_allocation_cg_implicit_size_per_device)); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); - - // check whether the maximum single memory allocation sizes per device can be satisfied - // check whether the maximum single cg_explicit memory allocation size can be satisfied - if (const std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); - used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { - // max mem alloc size constraints not fulfilled + // output the maximum memory allocation size per device detail::log(verbosity_level::full, - "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! 
Falling back to cg_implicit.\n", - format_vector(failed_cg_explicit_constraints)); - // can't use cg_explicit - used_solver = solver_type::cg_implicit; - } - if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); - used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { - // can't fulfill maximum single memory allocation size even for cg_implicit - plssvm::detail::log(verbosity_level::full | verbosity_level::warning, - "WARNING: if you are sure that the guaranteed maximum memory allocation size can be safely ignored on your device, " - "this check can be disabled via \"-DPLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=OFF\" during the CMake configuration!\n"); - throw kernel_launch_resources{ fmt::format("Can't fulfill maximum single memory allocation constraint for device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + " - maximum supported single memory allocation size: {}\n" + " - maximum needed single memory allocation size (cg_explicit): {}\n" + " - maximum needed single memory allocation size (cg_implicit): {}\n", + format_vector(max_mem_alloc_size_per_device), + format_vector(max_single_allocation_cg_explicit_size_per_device), + format_vector(max_single_allocation_cg_implicit_size_per_device)); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); + + // check whether the maximum single memory allocation sizes per device can be satisfied + // check whether the maximum single cg_explicit memory allocation size can be satisfied + if (const std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { + // max mem alloc size constraints not fulfilled + detail::log(verbosity_level::full, + "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! 
Falling back to cg_implicit.\n", + format_vector(failed_cg_explicit_constraints)); + // can't use cg_explicit + used_solver = solver_type::cg_implicit; + } + if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { + // can't fulfill maximum single memory allocation size even for cg_implicit + plssvm::detail::log(verbosity_level::full | verbosity_level::warning, + "WARNING: if you are sure that the guaranteed maximum memory allocation size can be safely ignored on your device, " + "this check can be disabled via \"-DPLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=OFF\" during the CMake configuration!\n"); + throw kernel_launch_resources{ fmt::format("Can't fulfill maximum single memory allocation constraint for device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + } } #endif } From 2dc78811355a393d12017ffd21f57dd5661cfbfc Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:42:51 +0200 Subject: [PATCH 08/93] Only use USM for the kernel matrix. --- include/plssvm/backends/CUDA/detail/device_ptr.cuh | 3 +++ include/plssvm/backends/gpu_csvm.hpp | 11 +++++------ include/plssvm/backends/gpu_device_ptr.hpp | 3 +++ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/CUDA/detail/device_ptr.cuh b/include/plssvm/backends/CUDA/detail/device_ptr.cuh index bb99e1ffe..e361a8d1d 100644 --- a/include/plssvm/backends/CUDA/detail/device_ptr.cuh +++ b/include/plssvm/backends/CUDA/detail/device_ptr.cuh @@ -59,6 +59,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr gpu_csvm::blas_level_3(const solver PLSSVM_ASSERT(B.shape() == C.shape(), "The B ({}) and C ({}) matrices must have the same shape!", B.shape(), C.shape()); PLSSVM_ASSERT(B.padding() == C.padding(), "The B ({}) and C ({}) matrices must have the same padding!", B.padding(), C.padding()); - const bool use_usm_allocations = solver == solver_type::cg_streaming; const std::size_t num_devices = this->num_available_devices(); // the C and B matrices; completely stored on each device @@ -334,7 +333,7 @@ void gpu_csvm::blas_level_3(const solver // the partial C result from a specific device later stored on device 0 to perform the C reduction (inplace matrix addition) device_ptr_type partial_C_d{}; if (num_devices > 1) { - partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0], use_usm_allocations }; + partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0] }; } // split memory allocation and memory copy! 
@@ -347,8 +346,8 @@ void gpu_csvm::blas_level_3(const solver const queue_type &device = devices_[device_id]; // allocate memory on the device - B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device, use_usm_allocations }; - C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device, use_usm_allocations }; + B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device }; + C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device }; } #pragma omp parallel for ordered if (num_devices > 1) diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp index b5d396051..c4a277e06 100644 --- a/include/plssvm/backends/gpu_device_ptr.hpp +++ b/include/plssvm/backends/gpu_device_ptr.hpp @@ -56,6 +56,7 @@ class gpu_device_ptr { * @brief Construct a device_ptr for the device managed by @p queue with the extents { @p size, 1 }. * @param[in] size the size of the managed memory * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ gpu_device_ptr(size_type size, const queue_type queue, bool use_usm_allocations); /** @@ -63,6 +64,7 @@ class gpu_device_ptr { * @details The managed memory size is: extents[0] * extents[1]. * @param[in] shape the 2D size of the managed memory; size = shape.x * shape.y * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ gpu_device_ptr(plssvm::shape shape, const queue_type queue, bool use_usm_allocations); /** @@ -71,6 +73,7 @@ class gpu_device_ptr { * @param[in] shape the extents of the managed memory * @param[in] padding the padding applied to the extents * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue, bool use_usm_allocations); From 55ad7211f6abf8b73e2e0a92a355eecaf9f05b8c Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:44:58 +0200 Subject: [PATCH 09/93] Improve automatic solver_type handling. 
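With this patch the automatic mode chooses between three solvers instead of two. Stripped of the per-device data distribution, the safety margins, and the optional PLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE checks, the intended fallback order boils down to the following sketch (select_solver_sketch is a hypothetical helper for illustration only; the real code in csvm.hpp additionally checks the per-device memory required by the streaming solver):

    #include "plssvm/detail/memory_size.hpp"  // plssvm::detail::memory_size
    #include "plssvm/solver_types.hpp"        // plssvm::solver_type

    plssvm::solver_type select_solver_sketch(const plssvm::detail::memory_size needed_explicit,
                                             const plssvm::detail::memory_size usable_device_memory,
                                             const plssvm::detail::memory_size needed_streaming,
                                             const plssvm::detail::memory_size usable_system_memory) {
        if (needed_explicit <= usable_device_memory) {
            return plssvm::solver_type::cg_explicit;   // kernel matrix fits into device memory
        }
        if (needed_streaming <= usable_system_memory) {
            return plssvm::solver_type::cg_streaming;  // kernel matrix fits into system memory via USM
        }
        return plssvm::solver_type::cg_implicit;       // recompute the kernel matrix in every CG iteration
    }
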
--- include/plssvm/csvm.hpp | 112 ++++++++++++-------- include/plssvm/detail/data_distribution.hpp | 17 +++ src/plssvm/detail/data_distribution.cpp | 46 ++++++++ 3 files changed, 128 insertions(+), 47 deletions(-) diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp index 1e23aec9f..098111397 100644 --- a/include/plssvm/csvm.hpp +++ b/include/plssvm/csvm.hpp @@ -792,7 +792,7 @@ std::tuple, std::vector, std::vectornum_available_devices() }; const std::vector total_memory_needed_explicit_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); - const detail::memory_size total_memory_needed_streaming = std::accumulate(total_memory_needed_explicit_per_device.cbegin(), total_memory_needed_explicit_per_device.cend(), detail::memory_size{}); + const std::pair> total_memory_needed_streaming_per_device = data_distribution.calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(num_features, num_rhs); const std::vector total_memory_needed_implicit_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); // format a vector differentiating between it containing only a single entry or multiple @@ -811,9 +811,9 @@ std::tuple, std::vector, std::vector(percentual_safety_margin * 100.0L), minimal_safety_margin, detail::tracking::tracking_entry{ "solver", "system_memory", total_system_memory }, @@ -821,13 +821,15 @@ std::tuple, std::vector, std::vector std::vector { @@ -849,11 +851,17 @@ std::tuple, std::vector, std::vector failed_cg_streaming_constraints = check_sizes(total_memory_needed_streaming_per_device.second, usable_device_memory_per_device); + total_memory_needed_streaming_per_device.first <= usable_system_memory && failed_cg_streaming_constraints.empty()) { // use the implicit solver type used_solver = solver_type::cg_streaming; } else { - detail::log(verbosity_level::full, "Cannot use cg_streaming due to memory constraints on the system memory!\n"); + if (!failed_cg_streaming_constraints.empty()) { + detail::log(verbosity_level::full, "Cannot use cg_streaming due to memory constraints on device(s) {}!\n", format_vector(failed_cg_streaming_constraints)); + } + if (total_memory_needed_streaming_per_device.first > usable_system_memory) { + detail::log(verbosity_level::full, "Cannot use cg_streaming due to system memory constraints!\n"); + } // check whether there is enough memory available for cg_implicit if (const std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { @@ -868,46 +876,56 @@ std::tuple, std::vector, std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); + // get the maximum possible memory allocation size per device + const std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); - // get the maximum single allocation size per device - const std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); - const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + // get the maximum single allocation size per device + const std::vector max_single_allocation_cg_explicit_size_per_device = 
data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + const std::vector max_single_allocation_cg_streaming_size_per_device = data_distribution.calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); - // output the maximum memory allocation size per device + // output the maximum memory allocation size per device + detail::log(verbosity_level::full, + " - maximum supported single memory allocation size: {}\n" + " - maximum needed single memory allocation size (cg_explicit): {}\n" + " - maximum needed single memory allocation size (cg_streaming): {}\n" + " - maximum needed single memory allocation size (cg_implicit): {}\n", + format_vector(max_mem_alloc_size_per_device), + format_vector(max_single_allocation_cg_explicit_size_per_device), + format_vector(max_single_allocation_cg_streaming_size_per_device), + format_vector(max_single_allocation_cg_implicit_size_per_device)); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_streaming", max_single_allocation_cg_streaming_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); + + // check whether the maximum single memory allocation sizes per device can be satisfied + // check whether the maximum single cg_explicit memory allocation size can be satisfied + if (const std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { + // max mem alloc size constraints not fulfilled detail::log(verbosity_level::full, - " - maximum supported single memory allocation size: {}\n" - " - maximum needed single memory allocation size (cg_explicit): {}\n" - " - maximum needed single memory allocation size (cg_implicit): {}\n", - format_vector(max_mem_alloc_size_per_device), - format_vector(max_single_allocation_cg_explicit_size_per_device), - format_vector(max_single_allocation_cg_implicit_size_per_device)); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); - - // check whether the maximum single memory allocation sizes per device can be 
satisfied - // check whether the maximum single cg_explicit memory allocation size can be satisfied - if (const std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); - used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { - // max mem alloc size constraints not fulfilled - detail::log(verbosity_level::full, - "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_implicit.\n", - format_vector(failed_cg_explicit_constraints)); - // can't use cg_explicit - used_solver = solver_type::cg_implicit; - } - if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); - used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { - // can't fulfill maximum single memory allocation size even for cg_implicit - plssvm::detail::log(verbosity_level::full | verbosity_level::warning, - "WARNING: if you are sure that the guaranteed maximum memory allocation size can be safely ignored on your device, " - "this check can be disabled via \"-DPLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=OFF\" during the CMake configuration!\n"); - throw kernel_launch_resources{ fmt::format("Can't fulfill maximum single memory allocation constraint for device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; - } + "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_streaming.\n", + format_vector(failed_cg_explicit_constraints)); + // can't use cg_explicit + used_solver = solver_type::cg_streaming; + } + if (const std::vector failed_cg_streaming_constraints = check_sizes(max_single_allocation_cg_streaming_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_streaming && !failed_cg_streaming_constraints.empty()) { + // max mem alloc size constraints not fulfilled + detail::log(verbosity_level::full, + "Cannot use cg_streaming due to maximum single memory allocation constraints on device(s) {}! 
Falling back to cg_implicit.\n", + format_vector(failed_cg_streaming_constraints)); + // can't use cg_streaming + used_solver = solver_type::cg_implicit; + } + if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { + // can't fulfill maximum single memory allocation size even for cg_implicit + plssvm::detail::log(verbosity_level::full | verbosity_level::warning, + "WARNING: if you are sure that the guaranteed maximum memory allocation size can be safely ignored on your device, " + "this check can be disabled via \"-DPLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=OFF\" during the CMake configuration!\n"); + throw kernel_launch_resources{ fmt::format("Can't fulfill maximum single memory allocation constraint for device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; } #endif } diff --git a/include/plssvm/detail/data_distribution.hpp b/include/plssvm/detail/data_distribution.hpp index 0d4acd5ac..c7968108a 100644 --- a/include/plssvm/detail/data_distribution.hpp +++ b/include/plssvm/detail/data_distribution.hpp @@ -20,6 +20,7 @@ #include // std::size_t #include // std::ostream forward declaration +#include // std::pair #include // std::vector namespace plssvm::detail { @@ -164,6 +165,22 @@ class triangular_data_distribution : public data_distribution { */ [[nodiscard]] std::vector calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(std::size_t num_features, std::size_t num_classes) const; + /** + * @brief Calculate the theoretical total memory needed per place for assembling the kernel matrix using USM. + * @param[in] num_features the total number of features + * @param[in] num_classes the total number of classes + * @return the theoretical total memory needed per place for cg_streaming (`[[nodiscard]]`) + */ + [[nodiscard]] std::pair> calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(std::size_t num_features, std::size_t num_classes) const; + + /** + * @brief Calculate the theoretical maximum single memory allocation size per place for assembling the kernel matrix using USM. + * @param[in] num_features the total number of features + * @param[in] num_classes the total number of classes + * @return the theoretical maximum single memory allocation size per place for cg_streaming (`[[nodiscard]]`) + */ + [[nodiscard]] std::vector calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(std::size_t num_features, std::size_t num_classes) const; + /** * @brief Calculate the theoretical total memory needed per place for implicitly assembling the kernel matrix. 
* @param[in] num_features the total number of features diff --git a/src/plssvm/detail/data_distribution.cpp b/src/plssvm/detail/data_distribution.cpp index dc979761e..db326fa59 100644 --- a/src/plssvm/detail/data_distribution.cpp +++ b/src/plssvm/detail/data_distribution.cpp @@ -18,6 +18,7 @@ #include // std::max, std::fill #include // std::size_t #include // std::ostream +#include // std::pair, std::make_pair #include // std::vector [[nodiscard]] std::size_t calculate_data_set_num_entries(const std::size_t num_data_points, const std::size_t num_features) noexcept { @@ -170,6 +171,51 @@ std::vector triangular_data_distribution::calculate_maximum_explici return res; } +std::pair> triangular_data_distribution::calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(const std::size_t num_features, const std::size_t num_classes) const { + PLSSVM_ASSERT(num_features > 0, "At least one feature must be present!"); + PLSSVM_ASSERT(num_classes > 0, "At least two classes must be present!"); + + const std::size_t num_places = this->num_places(); + const std::size_t num_rows = this->num_rows() + 1; // account for dimensional reduction + // first: system memory + // second: device memory + std::pair> res = std::make_pair(0_B, std::vector(num_places, 0_B)); + + for (std::size_t device_id = 0; device_id < num_places; ++device_id) { + // check whether the current device is responsible for at least one data point! + if (this->place_specific_num_rows(device_id) == 0) { + continue; + } + + // data set including padding + const std::size_t data_set_size = ::calculate_data_set_num_entries(num_rows, num_features); + + // the size of q_red + const std::size_t q_red_size = ::calculate_q_red_num_entries(num_rows); + + // the size of the explicitly stored kernel matrix + const std::size_t kernel_matrix_size{ this->calculate_explicit_kernel_matrix_num_entries_padded(device_id) }; + + // the B and C matrices for the explicit SYMM kernel + std::size_t blas_matrices_size = 2 * ::calculate_blas_matrix_entries(num_rows, num_classes); + if (device_id == 0 && num_places > 1) { + // device 0 has to save an additional matrix used to accumulate the partial results from the other devices + blas_matrices_size += ::calculate_blas_matrix_entries(num_rows, num_classes); + } + + // add up the individual sizes and report the memory size in BYTES + // for streaming, the kernel matrix is on the host, while everything else is on the device + res.first += memory_size{ sizeof(real_type) * kernel_matrix_size }; + res.second[device_id] = memory_size{ sizeof(real_type) * (q_red_size + std::max(data_set_size, blas_matrices_size)) }; + } + + return res; +} + +std::vector triangular_data_distribution::calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(const std::size_t num_features, const std::size_t num_classes) const { + return this->calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_classes); +} + std::vector triangular_data_distribution::calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(const std::size_t num_features, const std::size_t num_classes) const { PLSSVM_ASSERT(num_features > 0, "At least one feature must be present!"); PLSSVM_ASSERT(num_classes > 0, "At least two classes must be present!"); From dad3561fa3eb10600a2ca36ecd979a56a9245ee1 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:50:51 +0200 Subject: [PATCH 10/93] Implement cg_streaming via USM allocations in SYCL. 
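The SYCL backends (AdaptiveCpp and DPC++) follow the same pattern as CUDA, only with sycl::malloc_shared/sycl::malloc_device taking the place of cudaMallocManaged/cudaMalloc. A minimal standalone sketch (a hypothetical helper outside of PLSSVM, no error handling):

    #include <sycl/sycl.hpp>
    #include <cstddef>

    double *allocate_kernel_matrix(sycl::queue &q, const std::size_t num_entries, const bool use_usm_allocations) {
        double *ptr = use_usm_allocations
                          ? sycl::malloc_shared<double>(num_entries, q)   // cg_streaming: host/device migratable USM
                          : sycl::malloc_device<double>(num_entries, q);  // cg_explicit: device-only USM
        q.memset(ptr, 0, num_entries * sizeof(double)).wait();
        return ptr;  // released via sycl::free(ptr, q) in both cases
    }
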
--- .../plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp | 2 +- .../SYCL/AdaptiveCpp/detail/device_ptr.hpp | 10 +++++++--- include/plssvm/backends/SYCL/DPCPP/csvm.hpp | 2 +- .../backends/SYCL/DPCPP/detail/device_ptr.hpp | 10 +++++++--- src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp | 6 ++++-- .../SYCL/AdaptiveCpp/detail/device_ptr.cpp | 18 +++++++++++------- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 6 ++++-- .../backends/SYCL/DPCPP/detail/device_ptr.cpp | 18 +++++++++++------- 8 files changed, 46 insertions(+), 26 deletions(-) diff --git a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp index 131116260..121e891db 100644 --- a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp +++ b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp @@ -168,7 +168,7 @@ class csvm : public ::plssvm::detail::gpu_csvm device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const std::size_t num_rows_reduced = data_d.shape().x - 1; const std::size_t num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -218,7 +218,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to SYCL's native range<2> diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp index 0338d10c9..2c571e591 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp @@ -26,17 +26,21 @@ namespace plssvm::adaptivecpp::detail { template -device_ptr::device_ptr(const size_type size, const queue &q) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, q } { } +device_ptr::device_ptr(const size_type size, const queue &q, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, q, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue &q) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, q } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue &q, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, q, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue &q) : - base_type{ shape, padding, q } { - data_ = ::sycl::malloc_device(this->size_padded(), queue_.impl->sycl_queue); +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue &q, const bool use_usm_allocations) : + base_type{ shape, padding, q, use_usm_allocations } { + if (use_usm_allocations_) { + 
data_ = ::sycl::malloc_shared(this->size_padded(), queue_.impl->sycl_queue); + } else { + data_ = ::sycl::malloc_device(this->size_padded(), queue_.impl->sycl_queue); + } this->memset(0); } diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 4d626174b..5687d42ce 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -191,7 +191,7 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id) // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const std::size_t num_rows_reduced = data_d.shape().x - 1; const std::size_t num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -206,7 +206,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to SYCL's native range<2> diff --git a/src/plssvm/backends/SYCL/DPCPP/detail/device_ptr.cpp b/src/plssvm/backends/SYCL/DPCPP/detail/device_ptr.cpp index c24b84407..456102d02 100644 --- a/src/plssvm/backends/SYCL/DPCPP/detail/device_ptr.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/detail/device_ptr.cpp @@ -23,17 +23,21 @@ namespace plssvm::dpcpp::detail { template -device_ptr::device_ptr(const size_type size, const queue &q) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, q } { } +device_ptr::device_ptr(const size_type size, const queue &q, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, q, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue &q) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, q } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue &q, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, q, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, plssvm::shape padding, const queue &q) : - base_type{ shape, padding, q } { - data_ = ::sycl::malloc_device(this->size_padded(), queue_.impl->sycl_queue); +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue &q, const bool use_usm_allocations) : + base_type{ shape, padding, q, use_usm_allocations } { + if (use_usm_allocations_) { + data_ = ::sycl::malloc_shared(this->size_padded(), 
queue_.impl->sycl_queue); + } else { + data_ = ::sycl::malloc_device(this->size_padded(), queue_.impl->sycl_queue); + } this->memset(0); } From f29c792ae8035a09149f420266107a7eb4549a3f Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:54:53 +0200 Subject: [PATCH 11/93] Implement cg_streaming via USM allocations in HIP. --- include/plssvm/backends/HIP/csvm.hpp | 2 +- .../backends/HIP/detail/device_ptr.hip.hpp | 12 ++++++++---- src/plssvm/backends/HIP/csvm.hip | 6 ++++-- src/plssvm/backends/HIP/detail/device_ptr.hip | 18 +++++++++++------- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/include/plssvm/backends/HIP/csvm.hpp b/include/plssvm/backends/HIP/csvm.hpp index 12aea214d..d50b948e0 100644 --- a/include/plssvm/backends/HIP/csvm.hpp +++ b/include/plssvm/backends/HIP/csvm.hpp @@ -156,7 +156,7 @@ class csvm : public ::plssvm::detail::gpu_csvm device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -180,7 +180,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to HIP's native dim3 diff --git a/src/plssvm/backends/HIP/detail/device_ptr.hip b/src/plssvm/backends/HIP/detail/device_ptr.hip index 560783097..c958c73fd 100644 --- a/src/plssvm/backends/HIP/detail/device_ptr.hip +++ b/src/plssvm/backends/HIP/detail/device_ptr.hip @@ -29,21 +29,25 @@ namespace plssvm::hip::detail { template -device_ptr::device_ptr(const size_type size, const queue_type device) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const size_type size, const queue_type device, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue_type device) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device) : - base_type{ shape, padding, device } { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device, const bool use_usm_allocations) : + base_type{ shape, padding, device, use_usm_allocations } { if (queue_ < 0 || queue_ >= 
static_cast(get_device_count())) { throw backend_exception{ fmt::format("Illegal device ID! Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); - PLSSVM_HIP_ERROR_CHECK(hipMalloc(&data_, this->size_padded() * sizeof(value_type))) + if (use_usm_allocations_) { + PLSSVM_HIP_ERROR_CHECK(hipMallocManaged(&data_, this->size_padded() * sizeof(value_type))) + } else { + PLSSVM_HIP_ERROR_CHECK(hipMalloc(&data_, this->size_padded() * sizeof(value_type))) + } this->memset(0); } From c53ea4252c848f9fc90f7964ce96917c42ada829 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:59:35 +0200 Subject: [PATCH 12/93] For OpenMP and stdpar, cg_streaming is equal to cg_explicit. --- src/plssvm/backends/OpenMP/csvm.cpp | 2 ++ src/plssvm/backends/stdpar/csvm.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index 938c4c843..526257278 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -100,6 +100,7 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const plssvm::detail::triangular_data_distribution dist{ A.num_rows() - 1, this->num_available_devices() }; std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix @@ -153,6 +154,7 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const std::size_t num_rhs = B.shape().x; const std::size_t num_rows = B.shape().y; diff --git a/src/plssvm/backends/stdpar/csvm.cpp b/src/plssvm/backends/stdpar/csvm.cpp index 1df113531..841fcaa34 100644 --- a/src/plssvm/backends/stdpar/csvm.cpp +++ b/src/plssvm/backends/stdpar/csvm.cpp @@ -68,6 +68,7 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const plssvm::detail::triangular_data_distribution dist{ A.num_rows() - 1, this->num_available_devices() }; std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix @@ -121,6 +122,7 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const std::size_t num_rhs = B.shape().x; const std::size_t num_rows = B.shape().y; From f41aa355f9c0f31d92ec74878b20e602849d5cd4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 16:20:51 +0200 Subject: [PATCH 13/93] Implement cg_streaming via USM allocations in OpenCL (using some ugly workarounds). 
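The SYCL and HIP patches above make the same allocation-time choice: for cg_streaming the device_ptr asks for unified shared memory that is accessible from both host and device, for cg_explicit it keeps a plain device-only allocation (sycl::malloc_shared vs. sycl::malloc_device, hipMallocManaged vs. hipMalloc). The self-contained sketch below is not part of the patch series and only illustrates that selection with plain SYCL 2020 USM calls; allocate_buffer and its parameters are invented for the example.

#include <sycl/sycl.hpp>

#include <cstddef>

template <typename T>
T *allocate_buffer(sycl::queue &q, const std::size_t count, const bool use_usm_allocations) {
    // shared USM is accessible on host and device and migrated by the runtime,
    // device USM lives in device memory only
    return use_usm_allocations ? sycl::malloc_shared<T>(count, q)
                               : sycl::malloc_device<T>(count, q);
}

int main() {
    sycl::queue q{};
    double *data = allocate_buffer<double>(q, 1024, /*use_usm_allocations=*/true);
    q.fill(data, 0.0, 1024).wait();  // zero the buffer, mirroring the memset(0) in the constructors above
    sycl::free(data, q);
    return 0;
}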
--- include/plssvm/backends/OpenCL/csvm.hpp | 2 +- .../backends/OpenCL/detail/device_ptr.hpp | 28 +++- .../plssvm/backends/OpenCL/detail/utility.hpp | 17 ++- include/plssvm/detail/type_traits.hpp | 20 +++ include/plssvm/detail/utility.hpp | 15 +++ src/plssvm/backends/OpenCL/csvm.cpp | 52 ++++---- .../backends/OpenCL/detail/device_ptr.cpp | 124 +++++++++++++----- 7 files changed, 195 insertions(+), 63 deletions(-) diff --git a/include/plssvm/backends/OpenCL/csvm.hpp b/include/plssvm/backends/OpenCL/csvm.hpp index 11d57c424..460f8d54e 100644 --- a/include/plssvm/backends/OpenCL/csvm.hpp +++ b/include/plssvm/backends/OpenCL/csvm.hpp @@ -155,7 +155,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::size_t +#include // std::variant namespace plssvm::opencl::detail { @@ -35,6 +36,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr get_variant(); + /** + * @brief Get a pointer to the device memory. + * @details If USM allocations are used, returns a `T*` otherwise returns a `cl_mem` object. + * @return a variant containing the device memory pointer (`[[nodiscard]]`) + */ + [[nodiscard]] std::variant get_variant() const; + /** * @copydoc plssvm::detail::gpu_device_ptr::memset(int, size_type, size_type) */ @@ -123,6 +141,10 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr; diff --git a/include/plssvm/backends/OpenCL/detail/utility.hpp b/include/plssvm/backends/OpenCL/detail/utility.hpp index 5e58435f3..780ce6ba1 100644 --- a/include/plssvm/backends/OpenCL/detail/utility.hpp +++ b/include/plssvm/backends/OpenCL/detail/utility.hpp @@ -20,10 +20,12 @@ #include "plssvm/backends/OpenCL/detail/kernel.hpp" // plssvm::opencl::detail::compute_kernel_name #include "plssvm/backends/OpenCL/exceptions.hpp" // plssvm::opencl::backend_exception #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/type_list.hpp" // plssvm::detail::{remove_cvref_t, is_variant_v} +#include "plssvm/detail/utility.hpp" // plssvm::detail::visit_overload #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "CL/cl.h" // cl_uint, cl_int, clSetKernelArg, clEnqueueNDRangeKernel, clFinish +#include "CL/cl.h" // cl_uint, cl_int, clSetKernelArg, clSetKernelArgSVMPointer, clEnqueueNDRangeKernel, clFinish #include "fmt/format.h" // fmt::format @@ -31,6 +33,7 @@ #include // std::string #include // std::string_view #include // std::forward, std::pair +#include // std::variant, std::visit #include // std::vector /** @@ -141,7 +144,17 @@ inline void set_kernel_args(cl_kernel kernel, Args... 
args) { cl_uint i = 0; // iterate over parameter pack and set OpenCL kernel ([&](auto &arg) { - const error_code ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); + error_code ec{}; + // check if we have to set a variant value + if constexpr (::plssvm::detail::is_variant_v<::plssvm::detail::remove_cvref_t>) { + std::visit(::plssvm::detail::visit_overload{ + [&](cl_mem &kernel_arg) { ec = clSetKernelArg(kernel, i++, sizeof(decltype(kernel_arg)), &kernel_arg); }, + [&](auto &kernel_arg) { ec = clSetKernelArgSVMPointer(kernel, i++, kernel_arg); } }, + arg); + } else { + // set kernel argument normally + ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); + } PLSSVM_OPENCL_ERROR_CHECK(ec, fmt::format("error setting OpenCL kernel argument {}", i - 1)) }(args), ...); diff --git a/include/plssvm/detail/type_traits.hpp b/include/plssvm/detail/type_traits.hpp index 0ad95542d..effa4f556 100644 --- a/include/plssvm/detail/type_traits.hpp +++ b/include/plssvm/detail/type_traits.hpp @@ -24,6 +24,7 @@ #include // std::enable_if_t, std::remove_cv_t, std::remove_reference_t, std::false_type, std::true_type #include // std::unordered_map, std::unordered_multimap #include // std::unordered_set, std::unordered_multiset +#include // std::variant #include // std::vector namespace plssvm::detail { @@ -342,6 +343,25 @@ constexpr bool is_unordered_associative_container_v = is_unordered_set_v || i template constexpr bool is_container_v = is_sequence_container_v || is_associative_container_v || is_unordered_associative_container_v; +/** + * @brief Type trait to check whether @p T is a `std::variant`. + * @tparam T the type to check + */ +template +struct is_variant : std::false_type { }; + +/** + * @copybrief plssvm::detail::is_variant + */ +template +struct is_variant> : std::true_type { }; + +/** + * @copybrief plssvm::detail::is_variant + */ +template +constexpr bool is_variant_v = is_variant::value; + } // namespace plssvm::detail #endif // PLSSVM_DETAIL_TYPE_TRAITS_HPP_ diff --git a/include/plssvm/detail/utility.hpp b/include/plssvm/detail/utility.hpp index e81d46ae1..613a571cc 100644 --- a/include/plssvm/detail/utility.hpp +++ b/include/plssvm/detail/utility.hpp @@ -50,6 +50,21 @@ namespace plssvm::detail { +/** + * @brief Shorthand for a more readable `std::visit` overload set. + * @tparam Ts the visited types + */ +template +struct visit_overload : Ts... { + using Ts::operator()...; +}; + +/** + * @brief plssvm::detail::visit_overload + */ +template +visit_overload(Ts...) -> visit_overload; + /** * @brief Invokes undefined behavior. Used to mark code paths that may never be reachable. 
* @details See: C++23 [`std::unreachable`](https://en.cppreference.com/w/cpp/utility/unreachable) diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index 562a63893..8fa57874e 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -262,7 +262,7 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::s // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const cl_ulong num_rows_reduced = data_d.shape().x - 1; const cl_ulong num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -277,7 +277,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to OpenCL's native std::vector @@ -295,22 +297,22 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, 
grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -342,7 +344,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), 
grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get_variant(), beta, C_d.get_variant(), grid_offset_x, grid_offset_y); } // convert execution range block to OpenCL's native std::vector @@ -359,7 +361,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get_variant(), beta, C_d.get_variant(), grid_offset_x, grid_offset_y); } } detail::device_synchronize(device); @@ -382,7 +384,7 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_add_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get(), rhs_d.get(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_add_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get_variant(), rhs_d.get_variant(), grid_offset_x, grid_offset_y); } detail::device_synchronize(device); } @@ -404,7 +406,7 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm: const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_scale_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get(), scale, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_scale_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get_variant(), scale, grid_offset_x, grid_offset_y); } detail::device_synchronize(device); } @@ -437,22 +439,22 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, 
device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), 
C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -488,7 +490,7 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::w_kernel), native_partial_grid, native_block, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::w_kernel), native_partial_grid, native_block, w_d.get_variant(), alpha_d.get_variant(), sv_d.get_variant(), num_classes, num_sv, device_specific_num_sv, sv_offset, grid_offset_x, grid_offset_y); } detail::device_synchronize(device); @@ -519,22 +521,22 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_linear), native_partial_grid, native_block, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_linear), native_partial_grid, native_block, out_d.get_variant(), sv_or_w_d.get_variant(), rho_d.get_variant(), predict_points_d.get_variant(), num_classes, num_predict_points, num_features, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_polynomial), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_polynomial), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_rbf), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_rbf), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case 
kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_sigmoid), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_sigmoid), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_laplacian), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_laplacian), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_chi_squared), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_chi_squared), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index 6aa67802a..c9439ae9e 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -21,30 +21,36 @@ #include "fmt/format.h" // fmt::format -#include // std::min +#include // std::min, std::fill #include // std::array #include // std::size_t +#include // std::memcpy #include // std::terminate #include // std::cerr, std::endl +#include // std::variant #include // std::vector namespace plssvm::opencl::detail { template -device_ptr::device_ptr(const size_type size, const command_queue &queue) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, queue } { } +device_ptr::device_ptr(const size_type size, const command_queue &queue, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, queue, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const command_queue &queue) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, queue } { } +device_ptr::device_ptr(const plssvm::shape shape, const command_queue &queue, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, queue, use_usm_allocations } { } 
template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const command_queue &queue) : - base_type{ shape, padding, &queue } { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const command_queue &queue, const bool use_usm_allocations) : + base_type{ shape, padding, &queue, use_usm_allocations } { error_code err{}; cl_context cont{}; PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") - data_ = clCreateBuffer(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), nullptr, &err); + if (use_usm_allocations_) { + usm_ptr_ = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); + } else { + data_ = clCreateBuffer(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), nullptr, &err); + } PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the buffer") this->memset(0); } @@ -56,12 +62,35 @@ device_ptr::~device_ptr() { if (data_ != nullptr) { PLSSVM_OPENCL_ERROR_CHECK(clReleaseMemObject(data_), "error releasing the buffer") } + if (use_usm_allocations_ && usm_ptr_ != nullptr) { + cl_context cont{}; + PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") + clSVMFree(cont, usm_ptr_); + } } catch (const plssvm::exception &e) { std::cout << e.what_with_loc() << std::endl; std::terminate(); } } +template +auto device_ptr::get_variant() -> std::variant { + if (use_usm_allocations_) { + return { usm_ptr_ }; + } else { + return { this->get() }; + } +} + +template +auto device_ptr::get_variant() const -> std::variant { + if (use_usm_allocations_) { + return { usm_ptr_ }; + } else { + return { this->get() }; + } +} + template void device_ptr::memset(const int pattern, const size_type pos, const size_type num_bytes) { PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); @@ -70,10 +99,14 @@ void device_ptr::memset(const int pattern, const size_type pos, const size_ty throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; } const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type)); - error_code err; - const auto correct_value = static_cast(pattern); - err = clEnqueueFillBuffer(queue_->queue, data_, &correct_value, sizeof(unsigned char), pos * sizeof(value_type), rnum_bytes, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via memset") + if (use_usm_allocations_) { + std::memset(usm_ptr_ + pos, pattern, rnum_bytes); + } else { + error_code err; + const auto correct_value = static_cast(pattern); + err = clEnqueueFillBuffer(queue_->queue, data_, &correct_value, sizeof(unsigned char), pos * sizeof(value_type), rnum_bytes, 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via memset") + } device_synchronize(*queue_); } @@ -87,9 +120,13 @@ void device_ptr::fill(const value_type value, const size_type pos, const size // run GPU kernel const size_type rcount = std::min(count, this->size_padded() - pos); - error_code err; - err = clEnqueueFillBuffer(queue_->queue, data_, &value, sizeof(value_type), pos * sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via fill") + if (use_usm_allocations_) { + std::fill(usm_ptr_ + pos, usm_ptr_ + pos + rcount, value); + } else { + error_code err; + err = clEnqueueFillBuffer(queue_->queue, data_, &value, sizeof(value_type), pos * sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via fill") + } device_synchronize(*queue_); } @@ -99,9 +136,13 @@ void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const s PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); - error_code err; - err = clEnqueueWriteBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), data_to_copy, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data to the device buffer") + if (use_usm_allocations_) { + std::memcpy(usm_ptr_ + pos, data_to_copy, rcount); + } else { + error_code err; + err = clEnqueueWriteBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), data_to_copy, 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data to the device buffer") + } device_synchronize(*queue_); } @@ -114,17 +155,32 @@ void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, throw backend_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) }; } - const std::array buffer_origin{ 0, 0, 0 }; - const std::array host_origin{ 0, 0, 0 }; - const std::array region{ width * sizeof(value_type), height, 1 }; - const std::size_t buffer_row_pitch = this->shape_padded().x * sizeof(value_type); - const std::size_t buffer_slice_pitch = 0; - const std::size_t host_row_pitch = spitch * sizeof(value_type); - const std::size_t host_slice_pitch = 0; - - error_code err; - err = clEnqueueWriteBufferRect(queue_->queue, data_, CL_TRUE, buffer_origin.data(), host_origin.data(), region.data(), buffer_row_pitch, buffer_slice_pitch, 
host_row_pitch, host_slice_pitch, data_to_copy, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the strided data to the device buffer") + if (use_usm_allocations_) { + if (spitch == width) { + // can use normal copy since we have no line strides + this->copy_to_device(data_to_copy, 0, width * height); + } else { + std::vector temp(this->shape_padded().x * height, value_type{ 0.0 }); + value_type *pos = temp.data(); + for (std::size_t row = 0; row < height; ++row) { + std::memcpy(pos, data_to_copy + row * spitch, width * sizeof(value_type)); + pos += this->shape_padded().x; + } + this->copy_to_device(temp); + } + } else { + const std::array buffer_origin{ 0, 0, 0 }; + const std::array host_origin{ 0, 0, 0 }; + const std::array region{ width * sizeof(value_type), height, 1 }; + const std::size_t buffer_row_pitch = this->shape_padded().x * sizeof(value_type); + const std::size_t buffer_slice_pitch = 0; + const std::size_t host_row_pitch = spitch * sizeof(value_type); + const std::size_t host_slice_pitch = 0; + + error_code err; + err = clEnqueueWriteBufferRect(queue_->queue, data_, CL_TRUE, buffer_origin.data(), host_origin.data(), region.data(), buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, data_to_copy, 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the strided data to the device buffer") + } device_synchronize(*queue_); } @@ -134,9 +190,13 @@ void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); - error_code err; - err = clEnqueueReadBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), buffer, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data from the device buffer") + if (use_usm_allocations_) { + std::memcpy(buffer, usm_ptr_ + pos, rcount); + } else { + error_code err; + err = clEnqueueReadBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), buffer, 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data from the device buffer") + } device_synchronize(*queue_); } From b5894e01422611fba501d48065697315affc54a7 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 16:42:17 +0200 Subject: [PATCH 14/93] Only call get_variant() where necessary. 
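Because a regular OpenCL buffer is a cl_mem handle while an SVM allocation is a raw pointer, the patch above routes kernel arguments through a std::variant and an overload set: clSetKernelArg handles the cl_mem case, clSetKernelArgSVMPointer the SVM case. The standalone sketch below distills that dispatch; overload, device_buffer, and set_arg are illustrative names rather than the PLSSVM API, and the SVM path requires an OpenCL 2.0 capable platform.

#define CL_TARGET_OPENCL_VERSION 300

#include <CL/cl.h>

#include <variant>

// minimal overload set for std::visit (C++17)
template <typename... Ts>
struct overload : Ts... {
    using Ts::operator()...;
};
template <typename... Ts>
overload(Ts...) -> overload<Ts...>;

// a device buffer is either a cl_mem object or a raw SVM pointer
using device_buffer = std::variant<cl_mem, float *>;

cl_int set_arg(cl_kernel kernel, const cl_uint index, const device_buffer &buf) {
    return std::visit(overload{
                          [&](cl_mem mem) { return clSetKernelArg(kernel, index, sizeof(cl_mem), &mem); },
                          [&](float *svm_ptr) { return clSetKernelArgSVMPointer(kernel, index, svm_ptr); } },
                      buf);
}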
--- src/plssvm/backends/OpenCL/csvm.cpp | 46 ++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index 8fa57874e..359bc0268 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -297,22 +297,22 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; 
case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -344,7 +344,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get_variant(), beta, C_d.get_variant(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); } // convert execution range block to OpenCL's native std::vector @@ -361,7 +361,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get_variant(), beta, C_d.get_variant(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); } } detail::device_synchronize(device); @@ -384,7 +384,7 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_add_kernel), native_partial_grid, native_block, num_rhs, 
lhs_d.get_variant(), rhs_d.get_variant(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_add_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get(), rhs_d.get(), grid_offset_x, grid_offset_y); } detail::device_synchronize(device); } @@ -406,7 +406,7 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm: const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_scale_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get_variant(), scale, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_scale_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get(), scale, grid_offset_x, grid_offset_y); } detail::device_synchronize(device); } @@ -439,22 +439,22 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - 
detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -490,7 +490,7 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::w_kernel), native_partial_grid, native_block, w_d.get_variant(), alpha_d.get_variant(), sv_d.get_variant(), num_classes, num_sv, device_specific_num_sv, sv_offset, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::w_kernel), native_partial_grid, native_block, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, grid_offset_x, grid_offset_y); } detail::device_synchronize(device); @@ -521,22 +521,22 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_linear), native_partial_grid, native_block, out_d.get_variant(), sv_or_w_d.get_variant(), rho_d.get_variant(), predict_points_d.get_variant(), 
num_classes, num_predict_points, num_features, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_linear), native_partial_grid, native_block, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_polynomial), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_polynomial), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_rbf), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_rbf), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_sigmoid), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_sigmoid), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_laplacian), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_laplacian), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_chi_squared), 
native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_chi_squared), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } From ed9b633eec394dd6850c145b31ab31dbf7e2976e Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 17:13:41 +0200 Subject: [PATCH 15/93] Add and improve error check. --- src/plssvm/backends/OpenCL/detail/device_ptr.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index c9439ae9e..88114d6e1 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -43,15 +43,16 @@ device_ptr::device_ptr(const plssvm::shape shape, const command_queue &queue, template device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const command_queue &queue, const bool use_usm_allocations) : base_type{ shape, padding, &queue, use_usm_allocations } { - error_code err{}; cl_context cont{}; PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") if (use_usm_allocations_) { usm_ptr_ = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); + PLSSVM_ASSERT(usm_ptr_ != nullptr, "error creating OpenCL SVM allocation"); } else { + error_code err{}; data_ = clCreateBuffer(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), nullptr, &err); + PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the buffer") } - PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the buffer") this->memset(0); } From d8502751aec9abd091bb63a690fac724e2d91671 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Tue, 8 Oct 2024 17:50:14 +0200 Subject: [PATCH 16/93] Use cg_explicit as maximum allocation size constraint. 
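Since cg_streaming still materializes the full kernel matrix (only in unified shared memory that migrates between host and device), its largest single allocation matches cg_explicit rather than cg_implicit, so the streaming constraint now forwards to the explicit variant. Below is a minimal, purely illustrative sketch (hypothetical helper and parameter names, plain byte counts instead of the library's memory size type) of the kind of per-device comparison such allocation sizes feed into:

    #include <cstddef>
    #include <vector>

    // Collect the indices of all devices whose required single allocation
    // exceeds the maximum allocation size reported for that device.
    std::vector<std::size_t> find_failed_devices(const std::vector<std::size_t> &required_bytes_per_device,
                                                 const std::vector<std::size_t> &max_alloc_bytes_per_device) {
        std::vector<std::size_t> failed{};
        for (std::size_t device = 0; device < required_bytes_per_device.size(); ++device) {
            if (required_bytes_per_device[device] > max_alloc_bytes_per_device[device]) {
                failed.push_back(device);  // this device cannot hold its kernel matrix part in a single allocation
            }
        }
        return failed;
    }

Devices flagged by such a check are the ones that trigger the fallback from cg_explicit to cg_streaming, and from cg_streaming to cg_implicit, during solver selection.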
--- src/plssvm/detail/data_distribution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plssvm/detail/data_distribution.cpp b/src/plssvm/detail/data_distribution.cpp index db326fa59..016260389 100644 --- a/src/plssvm/detail/data_distribution.cpp +++ b/src/plssvm/detail/data_distribution.cpp @@ -213,7 +213,7 @@ std::pair> triangular_data_distribution::c } std::vector triangular_data_distribution::calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(const std::size_t num_features, const std::size_t num_classes) const { - return this->calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_classes); + return this->calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_classes); } std::vector triangular_data_distribution::calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(const std::size_t num_features, const std::size_t num_classes) const { From ed2e2a86acb48d0d53ff60044092934baa4114d4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Tue, 8 Oct 2024 17:58:58 +0200 Subject: [PATCH 17/93] Improve output by mentioning the maximum guaranteed allocation size. --- include/plssvm/csvm.hpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp index 098111397..d9999c49a 100644 --- a/include/plssvm/csvm.hpp +++ b/include/plssvm/csvm.hpp @@ -879,6 +879,15 @@ std::tuple, std::vector, std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); + // utility function returning a vector of memory sizes that where the reasons for a failed check + const auto get_failed_memory_sizes = [&max_mem_alloc_size_per_device](const std::vector &failed_devices) { + std::vector failed_memory_sizes{}; + for (const std::size_t device : failed_devices) { + failed_memory_sizes.push_back(max_mem_alloc_size_per_device[device]); + } + return failed_memory_sizes; + }; + // get the maximum single allocation size per device const std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); const std::vector max_single_allocation_cg_streaming_size_per_device = data_distribution.calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); @@ -903,9 +912,11 @@ std::tuple, std::vector, std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { + // max mem alloc size constraints not fulfilled detail::log(verbosity_level::full, - "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_streaming.\n", + "Cannot use cg_explicit due to maximum single memory allocation constraints ({}) on device(s) {}! Falling back to cg_streaming.\n", + format_vector(get_failed_memory_sizes(failed_cg_explicit_constraints)), format_vector(failed_cg_explicit_constraints)); // can't use cg_explicit used_solver = solver_type::cg_streaming; @@ -914,7 +925,8 @@ std::tuple, std::vector, std::vector, std::vector, std::vector Date: Tue, 8 Oct 2024 18:02:51 +0200 Subject: [PATCH 18/93] Throw an exception if clSVMAlloc failed. 
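Unlike clCreateBuffer, clSVMAlloc does not report failures through an error code; it simply returns a null pointer, and the PLSSVM_ASSERT alone is typically compiled out of release builds. The allocation result is therefore checked explicitly and turned into a backend exception that states the requested size. A minimal sketch of this pattern against the plain OpenCL C API follows (standalone example with a hypothetical function name, not the patched device_ptr code; clSVMAlloc, clSVMFree, and the mentioned constants are the regular OpenCL 2.0 API):

    #include <CL/cl.h>

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    // Allocate a shared virtual memory buffer and translate the null-pointer
    // failure convention of clSVMAlloc into an exception.
    float *allocate_svm_buffer(const cl_context context, const std::size_t num_elements) {
        const std::size_t num_bytes = num_elements * sizeof(float);
        auto *ptr = static_cast<float *>(clSVMAlloc(context, CL_MEM_READ_WRITE, num_bytes, 0));
        if (ptr == nullptr) {
            // possible reasons: the size exceeds CL_DEVICE_MAX_MEM_ALLOC_SIZE or device memory is exhausted
            throw std::runtime_error{ "clSVMAlloc failed to allocate " + std::to_string(num_bytes) + " bytes" };
        }
        return ptr;  // must later be released via clSVMFree(context, ptr)
    }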
--- src/plssvm/backends/OpenCL/detail/device_ptr.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index 88114d6e1..086884dc6 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -14,6 +14,7 @@ #include "plssvm/backends/OpenCL/detail/utility.hpp" // PLSSVM_OPENCL_ERROR_CHECK #include "plssvm/backends/OpenCL/exceptions.hpp" // plssvm::opencl::backend_exception #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/exceptions/exceptions.hpp" // plssvm::exception #include "plssvm/shape.hpp" // plssvm::shape @@ -47,6 +48,9 @@ device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") if (use_usm_allocations_) { usm_ptr_ = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); + if (usm_ptr_ == nullptr) { + throw backend_exception{ fmt::format("Failed to allocate {} of memory using clSVMAlloc(...). Maybe that's larger than CL_DEVICE_MAX_MEM_ALLOC_SIZE?", ::plssvm::detail::memory_size{ this->size_padded() * sizeof(value_type) }) }; + } PLSSVM_ASSERT(usm_ptr_ != nullptr, "error creating OpenCL SVM allocation"); } else { error_code err{}; From 9fcdd7fab2915072d78b45e8f8b97f9ba9c6f0f9 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Tue, 8 Oct 2024 18:03:14 +0200 Subject: [PATCH 19/93] Rewrite OpenCL context logic to also support cg_streaming with multiple GPUs. --- .../plssvm/backends/OpenCL/detail/context.hpp | 11 +- src/plssvm/backends/OpenCL/csvm.cpp | 9 +- src/plssvm/backends/OpenCL/detail/context.cpp | 11 +- src/plssvm/backends/OpenCL/detail/utility.cpp | 195 ++++++++++-------- 4 files changed, 125 insertions(+), 101 deletions(-) diff --git a/include/plssvm/backends/OpenCL/detail/context.hpp b/include/plssvm/backends/OpenCL/detail/context.hpp index 2b26e6f93..c49f235dd 100644 --- a/include/plssvm/backends/OpenCL/detail/context.hpp +++ b/include/plssvm/backends/OpenCL/detail/context.hpp @@ -15,13 +15,12 @@ #include "CL/cl.h" // cl_context, cl_platform_id, cl_device_id -#include // std::vector - namespace plssvm::opencl::detail { /** * @brief RAII wrapper class around a cl_context. - * @details Also contains the associated platform and a list of all associated devices. + * @details Also contains the associated platform and device. + * @note Each context is guaranteed to only contain a single device, i.e., on multi-device system, one context for each device is created. */ class context { public: @@ -35,7 +34,7 @@ class context { * @param[in] platform the OpenCL platform associated with this OpenCL context * @param[in] devices the list of devices associated with this OpenCL cl_context */ - context(cl_context device_context, cl_platform_id platform, std::vector devices); + context(cl_context device_context, cl_platform_id platform, cl_device_id device); /** * @brief Delete copy-constructor to make context a move only type. @@ -78,8 +77,8 @@ class context { cl_context device_context{}; /// The OpenCL platform associated with this context. cl_platform_id platform{}; - /// All devices associated with this context. - std::vector devices{}; + /// The device associated with this context. 
+ cl_device_id device{}; }; } // namespace plssvm::opencl::detail diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index 359bc0268..cb032be85 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -106,16 +106,9 @@ void csvm::init(const target_platform target) { // get all available OpenCL contexts for the current target including devices with respect to the requested target platform std::tie(contexts_, target_) = detail::get_contexts(target); - // currently, only EXACTLY one OpenCL context is allowed + // at least one context must be created if (contexts_.empty()) { throw backend_exception{ fmt::format("No OpenCL context for the target {} could be found!", target_) }; - } else if (contexts_.size() > 1) { - throw backend_exception{ fmt::format("Currently only a single OpenCL context is allowed, but {} were found for the target {}!", contexts_.size(), target_) }; - } - - // throw exception if no devices for the requested target could be found - if (contexts_[0].devices.empty()) { - throw backend_exception{ fmt::format("OpenCL backend selected but no devices for the target {} were found!", target) }; } // print OpenCL info diff --git a/src/plssvm/backends/OpenCL/detail/context.cpp b/src/plssvm/backends/OpenCL/detail/context.cpp index e534e079d..e2184a0f4 100644 --- a/src/plssvm/backends/OpenCL/detail/context.cpp +++ b/src/plssvm/backends/OpenCL/detail/context.cpp @@ -12,25 +12,24 @@ #include // std::addressof #include // std::exchange, std::move -#include // std::vector namespace plssvm::opencl::detail { -context::context(cl_context p_device_context, cl_platform_id p_platform, std::vector p_devices) : +context::context(cl_context p_device_context, cl_platform_id p_platform, cl_device_id p_device) : device_context{ p_device_context }, platform{ p_platform }, - devices{ std::move(p_devices) } { } + device{ p_device } { } context::context(context &&other) noexcept : device_context{ std::exchange(other.device_context, nullptr) }, platform{ std::exchange(other.platform, nullptr) }, - devices{ std::move(other.devices) } { } + device{ other.device } { } -context &context::operator=(context &&other)noexcept { +context &context::operator=(context &&other) noexcept { if (this != std::addressof(other)) { other.device_context = std::exchange(other.device_context, nullptr); platform = std::exchange(other.platform, nullptr); - devices = std::move(other.devices); + device = std::move(other.device); } return *this; } diff --git a/src/plssvm/backends/OpenCL/detail/utility.cpp b/src/plssvm/backends/OpenCL/detail/utility.cpp index 9a62c77cf..41354b4e4 100644 --- a/src/plssvm/backends/OpenCL/detail/utility.cpp +++ b/src/plssvm/backends/OpenCL/detail/utility.cpp @@ -139,10 +139,12 @@ namespace plssvm::opencl::detail { for (auto &[platform, devices] : platform_devices) { // create context and associated OpenCL platform with it std::array context_properties = { CL_CONTEXT_PLATFORM, reinterpret_cast(platform.first), 0 }; - cl_context cont = clCreateContext(context_properties.data(), static_cast(devices.size()), devices.data(), nullptr, nullptr, &err); - PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the OpenCL context") - // add OpenCL context to vector of context wrappers - contexts.emplace_back(cont, platform.first, std::move(devices)); + for (auto &device : devices) { + cl_context cont = clCreateContext(context_properties.data(), cl_uint{ 1 }, &device, nullptr, nullptr, &err); + PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the OpenCL 
context") + // add OpenCL context to vector of context wrappers + contexts.emplace_back(cont, platform.first, device); + } } return std::make_pair(std::move(contexts), target); @@ -208,12 +210,7 @@ std::vector> kernel_type_to_function } std::vector create_command_queues(const std::vector &contexts, const kernel_function_type kernel_function, const std::vector> &kernel_names) { - std::vector queues; - for (std::vector::size_type device = 0; device < contexts[0].devices.size(); ++device) { - queues.emplace_back(contexts[0], contexts[0].devices[device]); - } - PLSSVM_ASSERT(!queues.empty(), "At least one command queue must be available!"); - + // a small helper function for better error messages const auto cl_build_program_error_message = [](cl_program prog, cl_device_id device, const std::size_t device_idx) { // determine the size of the log std::size_t log_size{}; @@ -228,28 +225,41 @@ std::vector create_command_queues(const std::vector &con } }; - // determine OpenCL compile options - std::string compile_options{ "-cl-mad-enable -cl-no-signed-zeros" }; + //**************************************************************************// + // determine per device compile options // + //**************************************************************************// + + // determine OpenCL compile options per device + std::string global_compile_options{ "-cl-mad-enable -cl-no-signed-zeros" }; #if defined(PLSSVM_ENABLE_FAST_MATH) - compile_options += " -cl-fast-relaxed-math"; + global_compile_options += " -cl-fast-relaxed-math"; #endif + std::vector compile_options(contexts.size(), global_compile_options); + // only use PTX inline assembly if enabled during CMake configuration #if defined(PLSSVM_OPENCL_BACKEND_USE_PTX_INLINE_ASSEMBLY) - std::size_t platform_vendor_size{ 0 }; - clGetPlatformInfo(contexts[0].platform, CL_PLATFORM_VENDOR, 0, nullptr, &platform_vendor_size); - std::string platform_vendor(platform_vendor_size, '\0'); - clGetPlatformInfo(contexts[0].platform, CL_PLATFORM_VENDOR, platform_vendor_size, platform_vendor.data(), nullptr); - const bool use_inline_assembly = ::plssvm::detail::contains(::plssvm::detail::as_lower_case(platform_vendor), "nvidia"); - if (use_inline_assembly) { - compile_options += " -DPLSSVM_USE_NVIDIA_PTX_INLINE_ASSEMBLY"; - plssvm::detail::log(verbosity_level::full, - "Enabling atomicAdd acceleration using PTX inline assembly.\n"); + for (std::size_t idx = 0; idx < contexts.size(); ++idx) { + auto &context = contexts[idx]; + + std::size_t platform_vendor_size{ 0 }; + clGetPlatformInfo(context.platform, CL_PLATFORM_VENDOR, 0, nullptr, &platform_vendor_size); + std::string platform_vendor(platform_vendor_size, '\0'); + clGetPlatformInfo(context.platform, CL_PLATFORM_VENDOR, platform_vendor_size, platform_vendor.data(), nullptr); + const bool use_inline_assembly = ::plssvm::detail::contains(::plssvm::detail::as_lower_case(platform_vendor), "nvidia"); + if (use_inline_assembly) { + compile_options[idx] += " -DPLSSVM_USE_NVIDIA_PTX_INLINE_ASSEMBLY"; + plssvm::detail::log(verbosity_level::full, + "Enabling atomicAdd acceleration using PTX inline assembly for device {}.\n", + idx); + } + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "opencl", fmt::format("use_inline_assembly_device_{}", idx), use_inline_assembly })); } - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "opencl", "use_inline_assembly", use_inline_assembly })); #endif - error_code err, err_bin; 
+ //**************************************************************************// + // assemble the OpenCL kernel // + //**************************************************************************// // note: unsigned long long may NOT be used in an OpenCL kernel (use ulong instead) // note: real_type temp{ 0.0 } may NOT be used in an OpenCL kernel (use real_type temp = 0.0 instead) @@ -347,23 +357,25 @@ std::vector create_command_queues(const std::vector &con ::plssvm::detail::replace_all(kernel_src_string, "PADDING_SIZE", fmt::format("{}", PADDING_SIZE)); // get all device names - std::vector device_names(contexts[0].devices.size()); - for (typename std::vector::size_type device_id = 0; device_id < device_names.size(); ++device_id) { + std::vector device_names{}; + for (auto &context : contexts) { // get device name std::size_t name_length{}; - PLSSVM_OPENCL_ERROR_CHECK(clGetDeviceInfo(contexts[0].devices[device_id], CL_DEVICE_NAME, 0, nullptr, &name_length), "error obtaining device name size") + PLSSVM_OPENCL_ERROR_CHECK(clGetDeviceInfo(context.device, CL_DEVICE_NAME, 0, nullptr, &name_length), "error obtaining device name size") std::string device_name(name_length - 1, '\0'); - PLSSVM_OPENCL_ERROR_CHECK(clGetDeviceInfo(contexts[0].devices[device_id], CL_DEVICE_NAME, name_length, device_name.data(), nullptr), "error obtaining device name") - device_names[device_id] = std::move(device_name); + PLSSVM_OPENCL_ERROR_CHECK(clGetDeviceInfo(context.device, CL_DEVICE_NAME, name_length, device_name.data(), nullptr), "error obtaining device name") + device_names.push_back(device_name); } - // append number of device to influence checksum calculation + // append other information to make the kernel string unique kernel_src_string.append(fmt::format("\n\n" - "// devices: [{}]\n" + "// num_devices: {}\n" + "// device_names: [{}]\n" "// OpenCL library: \"{}\"\n" "// OpenCL target version: {}\n" "// CMAKE_BUILD_TYPE: {}\n" "// compile_options: \"{}\"\n", + contexts.size(), fmt::join(device_names, ", "), PLSSVM_OPENCL_LIBRARY, CL_TARGET_OPENCL_VERSION, @@ -376,14 +388,13 @@ std::vector create_command_queues(const std::vector &con // convert string to const char* const char *kernel_src_ptr = kernel_src_string.c_str(); - // data to build the final OpenCL program - std::vector binary_sizes(contexts[0].devices.size()); - std::vector> binaries(contexts[0].devices.size()); - std::vector binaries_ptr(binaries.size()); - // create caching folder in the temporary directory and change the permissions such that everybody has read/write access const std::filesystem::path cache_dir_name = std::filesystem::temp_directory_path() / "plssvm_opencl_cache" / checksum; + //**************************************************************************// + // check whether a cached OpenCL kernel can be used // + //**************************************************************************// + // potential reasons why OpenCL caching could fail enum class caching_status { success, @@ -414,41 +425,59 @@ std::vector create_command_queues(const std::vector &con // get directory iterator auto dirIter = std::filesystem::directory_iterator(cache_dir_name); // get files in directory -> account for stored preprocessed source file - if (static_cast(std::count_if(std::filesystem::begin(dirIter), std::filesystem::end(dirIter), [](const auto &entry) { return entry.is_regular_file(); })) != contexts[0].devices.size() + 1) { + if (static_cast(std::count_if(std::filesystem::begin(dirIter), std::filesystem::end(dirIter), [](const auto &entry) { 
return entry.is_regular_file(); })) != contexts.size() + 1) { use_cached_binaries = caching_status::error_invalid_number_of_cached_files; } } + //**************************************************************************// + // fill the OpenCL binaries (either compile or use cached values) // + //**************************************************************************// + + // data to build the final OpenCL program + std::vector binary_sizes(contexts.size()); + std::vector> binaries(contexts.size()); + std::vector binaries_ptr(binaries.size()); + + error_code err, err_bin; + if (use_cached_binaries != caching_status::success) { plssvm::detail::log(verbosity_level::full, "Building OpenCL kernels from source (reason: {}).\n", caching_status_to_string(use_cached_binaries)); - // create and build program - cl_program program = clCreateProgramWithSource(contexts[0], 1, &kernel_src_ptr, nullptr, &err); - PLSSVM_OPENCL_ERROR_CHECK(err, "error creating program from source") + // build OpenCL kernels for each context, i.e., each device + for (std::size_t idx = 0; idx < contexts.size(); ++idx) { + auto &context = contexts[idx]; + auto &device = context.device; - err = clBuildProgram(program, static_cast(contexts[0].devices.size()), contexts[0].devices.data(), compile_options.c_str(), nullptr, nullptr); + // create and build program + cl_program program = clCreateProgramWithSource(context, 1, &kernel_src_ptr, nullptr, &err); + PLSSVM_OPENCL_ERROR_CHECK(err, "error creating program from source") - if (!err) { - // check all devices for errors - for (std::vector::size_type device = 0; device < contexts[0].devices.size(); ++device) { - cl_build_program_error_message(program, contexts[0].devices[device], device); + err = clBuildProgram(program, 1, &device, compile_options[idx].c_str(), nullptr, nullptr); + + if (!err) { + // check device for errors + cl_build_program_error_message(program, device, idx); + PLSSVM_OPENCL_ERROR_CHECK(err, "error building program") } - PLSSVM_OPENCL_ERROR_CHECK(err, "error building program") - } - // get sizes of binaries - err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, contexts[0].devices.size() * sizeof(std::size_t), binary_sizes.data(), nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error retrieving the kernel (binary) kernel sizes") - for (std::vector>::size_type i = 0; i < binaries.size(); ++i) { - binaries[i] = std::vector(binary_sizes[i]); - binaries_ptr[i] = binaries[i].data(); // only necessary for OpenCL's void ** calls! - } + // get sizes of binaries + err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(std::size_t), &binary_sizes[idx], nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error retrieving the kernel (binary) kernel sizes") + binaries[idx] = std::vector(binary_sizes[idx]); + binaries_ptr[idx] = binaries[idx].data(); // only necessary for OpenCL's void ** calls! 
+ + // get binaries + err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &binaries_ptr[idx], nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error retrieving the kernel binaries") - // get binaries - err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, contexts[0].devices.size() * sizeof(unsigned char *), binaries_ptr.data(), nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error retrieving the kernel binaries") + // release resource + if (program) { + PLSSVM_OPENCL_ERROR_CHECK(clReleaseProgram(program), "error releasing OpenCL program resources") + } + } // write binaries to file if (!std::filesystem::exists(cache_dir_name)) { @@ -474,11 +503,6 @@ std::vector create_command_queues(const std::vector &con plssvm::detail::log(verbosity_level::full, "Cached OpenCL kernel binaries in {}.\n", cache_dir_name); - - // release resource - if (program) { - PLSSVM_OPENCL_ERROR_CHECK(clReleaseProgram(program), "error releasing OpenCL program resources") - } } else { plssvm::detail::log(verbosity_level::full, "Using cached OpenCL kernel binaries from {}.\n", @@ -515,32 +539,41 @@ std::vector create_command_queues(const std::vector &con } } - // build from binaries - cl_program binary_program = clCreateProgramWithBinary(contexts[0], static_cast(contexts[0].devices.size()), contexts[0].devices.data(), binary_sizes.data(), const_cast(binaries_ptr.data()), &err_bin, &err); - PLSSVM_OPENCL_ERROR_CHECK(err_bin, "error loading binaries") - PLSSVM_OPENCL_ERROR_CHECK(err, "error creating binary program") - err = clBuildProgram(binary_program, static_cast(contexts[0].devices.size()), contexts[0].devices.data(), nullptr, nullptr, nullptr); - if (!err) { - // check all devices for errors - for (std::vector::size_type device = 0; device < contexts[0].devices.size(); ++device) { - cl_build_program_error_message(binary_program, contexts[0].devices[device], device); + std::vector queues{}; + // compile kernels for each context, i.e., each device + for (std::size_t idx = 0; idx < contexts.size(); ++idx) { + auto &context = contexts[idx]; + auto &device = context.device; + + // build from binaries + cl_program binary_program = clCreateProgramWithBinary(context, cl_uint{ 1 }, &device, &binary_sizes[idx], const_cast(&binaries_ptr[idx]), &err_bin, &err); + PLSSVM_OPENCL_ERROR_CHECK(err_bin, "error loading binaries") + PLSSVM_OPENCL_ERROR_CHECK(err, "error creating binary program") + err = clBuildProgram(binary_program, cl_uint{ 1 }, &device, nullptr, nullptr, nullptr); + if (!err) { + // check device for errors + cl_build_program_error_message(binary_program, device, idx); + PLSSVM_OPENCL_ERROR_CHECK(err, "error building program") } - PLSSVM_OPENCL_ERROR_CHECK(err, "error building program") - } - // build all kernels, one for each device - for (std::vector::size_type device = 0; device < contexts[0].devices.size(); ++device) { + // each context contains exactly one device + command_queue queue{ context, device }; + + // build all kernels, one for each device for (const std::pair &name : kernel_names) { // create kernel - queues[device].add_kernel(name.first, kernel{ clCreateKernel(binary_program, name.second.c_str(), &err) }); - PLSSVM_OPENCL_ERROR_CHECK(err, fmt::format("error creating OpenCL kernel {} for device {}", name.second, device)) + queue.add_kernel(name.first, kernel{ clCreateKernel(binary_program, name.second.c_str(), &err) }); + PLSSVM_OPENCL_ERROR_CHECK(err, fmt::format("error creating OpenCL kernel {} for device {}", name.second, idx)) } - } - // release resource - if (binary_program) { - 
PLSSVM_OPENCL_ERROR_CHECK(clReleaseProgram(binary_program), "error releasing OpenCL binary program resources") + // release resource + if (binary_program) { + PLSSVM_OPENCL_ERROR_CHECK(clReleaseProgram(binary_program), "error releasing OpenCL binary program resources") + } + + queues.push_back(std::move(queue)); } + PLSSVM_ASSERT(!queues.empty(), "At least one command queue must be available!"); return queues; } From 1dd509ceae04235ac122cbe7735bf189c9fddd13 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 9 Oct 2024 15:10:50 +0200 Subject: [PATCH 20/93] Use the correct OpenCL functions to perform SVM pointer operations and improve simplicity of implementation by using a std::variant as device_pointer_type. --- .../backends/OpenCL/detail/device_ptr.hpp | 21 +--- include/plssvm/backends/gpu_device_ptr.hpp | 36 ++++--- src/plssvm/backends/OpenCL/csvm.cpp | 16 +-- .../backends/OpenCL/detail/device_ptr.cpp | 98 +++++++++---------- 4 files changed, 74 insertions(+), 97 deletions(-) diff --git a/include/plssvm/backends/OpenCL/detail/device_ptr.hpp b/include/plssvm/backends/OpenCL/detail/device_ptr.hpp index a88f4ce0f..ab7ee5f4e 100644 --- a/include/plssvm/backends/OpenCL/detail/device_ptr.hpp +++ b/include/plssvm/backends/OpenCL/detail/device_ptr.hpp @@ -29,9 +29,9 @@ namespace plssvm::opencl::detail { * @tparam T the type of the kernel pointer to wrap */ template -class device_ptr : public ::plssvm::detail::gpu_device_ptr> { +class device_ptr : public ::plssvm::detail::gpu_device_ptr, device_ptr> { /// The template base type of the OpenCL device_ptr class. - using base_type = ::plssvm::detail::gpu_device_ptr>; + using base_type = ::plssvm::detail::gpu_device_ptr, device_ptr>; using base_type::data_; using base_type::queue_; @@ -104,19 +104,6 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr get_variant(); - /** - * @brief Get a pointer to the device memory. - * @details If USM allocations are used, returns a `T*` otherwise returns a `cl_mem` object. - * @return a variant containing the device memory pointer (`[[nodiscard]]`) - */ - [[nodiscard]] std::variant get_variant() const; - /** * @copydoc plssvm::detail::gpu_device_ptr::memset(int, size_type, size_type) */ @@ -141,10 +128,6 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr; diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp index c4a277e06..d6e5045f1 100644 --- a/include/plssvm/backends/gpu_device_ptr.hpp +++ b/include/plssvm/backends/gpu_device_ptr.hpp @@ -19,8 +19,12 @@ #include "plssvm/matrix.hpp" // plssvm::layout_type, plssvm::matrix #include "plssvm/shape.hpp" // plssvm::shape -#include // std::size_t -#include // std::vector +#include "fmt/format.h" // fmt::format + +#include // std::swap +#include // std::size_t +#include // std::exchange +#include // std::vector namespace plssvm::detail { @@ -231,7 +235,7 @@ class gpu_device_ptr { */ void fill(value_type value, size_type pos = 0); /** - * @brief Fill up-to @p count values to @p value starting at position @p pos. + * @brief Fill up-to @p count values of @p value starting at position @p pos. * @details Fill `[pos, rcount)` where `rcount` is the smaller value of @p count and `device_ptr::size() - pos`. * @param[in] value the fill value * @param[in] pos the position to start the fill @@ -426,14 +430,14 @@ void gpu_device_ptr::swap( template void gpu_device_ptr::memset(const int pattern, const size_type pos) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->memset(pattern, pos, this->size_padded() * sizeof(value_type)); } template void gpu_device_ptr::fill(const value_type value, const size_type pos) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->fill(value, pos, this->size_padded()); } @@ -441,7 +445,7 @@ void gpu_device_ptr::fill( template template void gpu_device_ptr::copy_to_device(const matrix &data_to_copy) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (data_to_copy.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Too few data to perform copy (needed: {}, provided: {})!", this->size_padded(), data_to_copy.size_padded()) }; @@ -451,14 +455,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_device(data_to_copy, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (data_to_copy.size() < rcount) { @@ -469,7 +473,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const_host_pointer_type data_to_copy) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_device(data_to_copy, 0, this->size_padded()); @@ -478,7 +482,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_device_strided(const matrix &data_to_copy, const std::size_t start_row, const std::size_t num_rows) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (start_row + num_rows > data_to_copy.num_rows()) { throw gpu_device_ptr_exception{ fmt::format("Tried to copy lines {}-{} (zero-based index) to the device, but the matrix has only {} lines!", start_row, start_row + num_rows - 1, data_to_copy.num_rows()) }; @@ -504,7 +508,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device_strided(const std::vector &data_to_copy, std::size_t spitch, std::size_t width, std::size_t height) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (width > spitch) { throw gpu_device_ptr_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) }; @@ -519,7 +523,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_host(matrix &buffer) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (buffer.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Buffer too small to perform copy (needed: {}, provided: {})!", this->size_padded(), buffer.size_padded()) }; @@ -529,14 +533,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(std::vector &buffer) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_host(buffer, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_host(std::vector &buffer, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (buffer.size() < rcount) { @@ -547,7 +551,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(host_pointer_type buffer) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_host(buffer, 0, this->size_padded()); @@ -555,7 +559,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_other_device(derived_gpu_device_ptr &target) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! 
Maybe target has been default constructed?"); this->copy_to_other_device(target, 0, this->size_padded()); diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index cb032be85..2daddbcaa 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -290,22 +290,22 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, 
device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -337,7 +337,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); } // convert execution range block to OpenCL's native std::vector @@ -354,7 +354,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); } } detail::device_synchronize(device); diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index 086884dc6..6e417524b 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -22,10 +22,9 @@ #include "fmt/format.h" // fmt::format -#include // std::min, std::fill +#include // std::min #include // std::array #include // std::size_t -#include // std::memcpy #include // std::terminate #include // std::cerr, std::endl 
#include // std::variant @@ -47,11 +46,11 @@ device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding cl_context cont{}; PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") if (use_usm_allocations_) { - usm_ptr_ = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); - if (usm_ptr_ == nullptr) { + T* usm_ptr = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); + if (usm_ptr == nullptr) { throw backend_exception{ fmt::format("Failed to allocate {} of memory using clSVMAlloc(...). Maybe that's larger than CL_DEVICE_MAX_MEM_ALLOC_SIZE?", ::plssvm::detail::memory_size{ this->size_padded() * sizeof(value_type) }) }; } - PLSSVM_ASSERT(usm_ptr_ != nullptr, "error creating OpenCL SVM allocation"); + data_ = usm_ptr; } else { error_code err{}; data_ = clCreateBuffer(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), nullptr, &err); @@ -64,13 +63,18 @@ template device_ptr::~device_ptr() { // avoid compiler warnings try { - if (data_ != nullptr) { - PLSSVM_OPENCL_ERROR_CHECK(clReleaseMemObject(data_), "error releasing the buffer") - } - if (use_usm_allocations_ && usm_ptr_ != nullptr) { - cl_context cont{}; - PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") - clSVMFree(cont, usm_ptr_); + if (use_usm_allocations_) { + T* usm_ptr = std::get(data_); + if (usm_ptr != nullptr) { + cl_context cont{}; + PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") + clSVMFree(cont, usm_ptr); + } + } else { + cl_mem mem = std::get(data_); + if (mem != nullptr) { + PLSSVM_OPENCL_ERROR_CHECK(clReleaseMemObject(mem), "error releasing the buffer") + } } } catch (const plssvm::exception &e) { std::cout << e.what_with_loc() << std::endl; @@ -78,46 +82,29 @@ device_ptr::~device_ptr() { } } -template -auto device_ptr::get_variant() -> std::variant { - if (use_usm_allocations_) { - return { usm_ptr_ }; - } else { - return { this->get() }; - } -} - -template -auto device_ptr::get_variant() const -> std::variant { - if (use_usm_allocations_) { - return { usm_ptr_ }; - } else { - return { this->get() }; - } -} - template void device_ptr::memset(const int pattern, const size_type pos, const size_type num_bytes) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (pos >= this->size_padded()) { throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; } const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type)); + + const auto correct_value = static_cast(pattern); + error_code err; if (use_usm_allocations_) { - std::memset(usm_ptr_ + pos, pattern, rnum_bytes); + err = clEnqueueSVMMemFill(queue_->queue, std::get(data_) + pos, &correct_value, sizeof(unsigned char), rnum_bytes, 0, nullptr, nullptr); } else { - error_code err; - const auto correct_value = static_cast(pattern); - err = clEnqueueFillBuffer(queue_->queue, data_, &correct_value, sizeof(unsigned char), pos * sizeof(value_type), rnum_bytes, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via memset") + err = clEnqueueFillBuffer(queue_->queue, std::get(data_), &correct_value, sizeof(unsigned char), pos * sizeof(value_type), rnum_bytes, 0, nullptr, nullptr); } + PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via memset") device_synchronize(*queue_); } template void device_ptr::fill(const value_type value, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (pos >= this->size_padded()) { throw backend_exception{ fmt::format("Illegal access in fill!: {} >= {}", pos, this->size_padded()) }; @@ -125,35 +112,37 @@ void device_ptr::fill(const value_type value, const size_type pos, const size // run GPU kernel const size_type rcount = std::min(count, this->size_padded() - pos); + + error_code err; if (use_usm_allocations_) { - std::fill(usm_ptr_ + pos, usm_ptr_ + pos + rcount, value); + err = clEnqueueSVMMemFill(queue_->queue, std::get(data_) + pos, &value, sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); } else { - error_code err; - err = clEnqueueFillBuffer(queue_->queue, data_, &value, sizeof(value_type), pos * sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via fill") + err = clEnqueueFillBuffer(queue_->queue, std::get(data_), &value, sizeof(value_type), pos * sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); } + PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via fill") device_synchronize(*queue_); } template void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); + + error_code err; if (use_usm_allocations_) { - std::memcpy(usm_ptr_ + pos, data_to_copy, rcount); + err = clEnqueueSVMMemcpy(queue_->queue, CL_TRUE, std::get(data_) + pos, data_to_copy, rcount * sizeof(value_type), 0, nullptr, nullptr); } else { - error_code err; - err = clEnqueueWriteBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), data_to_copy, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data to the device buffer") + err = clEnqueueWriteBuffer(queue_->queue, std::get(data_), CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), data_to_copy, 0, nullptr, nullptr); } + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data to the device buffer") device_synchronize(*queue_); } template void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t spitch, const std::size_t width, const std::size_t height) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); if (width > spitch) { @@ -183,7 +172,7 @@ void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t host_slice_pitch = 0; error_code err; - err = clEnqueueWriteBufferRect(queue_->queue, data_, CL_TRUE, buffer_origin.data(), host_origin.data(), region.data(), buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, data_to_copy, 0, nullptr, nullptr); + err = clEnqueueWriteBufferRect(queue_->queue, std::get(data_), CL_TRUE, buffer_origin.data(), host_origin.data(), region.data(), buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, data_to_copy, 0, nullptr, nullptr); PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the strided data to the device buffer") } device_synchronize(*queue_); @@ -191,23 +180,24 @@ void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, template void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); + + error_code err; if (use_usm_allocations_) { - std::memcpy(buffer, usm_ptr_ + pos, rcount); + err = clEnqueueSVMMemcpy(queue_->queue, CL_TRUE, buffer, std::get(data_) + pos, rcount * sizeof(value_type), 0, nullptr, nullptr); } else { - error_code err; - err = clEnqueueReadBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), buffer, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data from the device buffer") + err = clEnqueueReadBuffer(queue_->queue, std::get(data_), CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), buffer, 0, nullptr, nullptr); } + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data from the device buffer") device_synchronize(*queue_); } template void device_ptr::copy_to_other_device(device_ptr &target, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! Maybe target has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); From 570ba776d2a80dbf35d0288b86cca824de1e4f50 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 9 Oct 2024 15:34:02 +0200 Subject: [PATCH 21/93] Fix usage of undefined type alias in assertion message. --- include/plssvm/backends/gpu_device_ptr.hpp | 28 +++++++++---------- .../backends/OpenCL/detail/device_ptr.cpp | 14 +++++----- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp index d6e5045f1..64888a86e 100644 --- a/include/plssvm/backends/gpu_device_ptr.hpp +++ b/include/plssvm/backends/gpu_device_ptr.hpp @@ -430,14 +430,14 @@ void gpu_device_ptr::swap( template void gpu_device_ptr::memset(const int pattern, const size_type pos) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->memset(pattern, pos, this->size_padded() * sizeof(value_type)); } template void gpu_device_ptr::fill(const value_type value, const size_type pos) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->fill(value, pos, this->size_padded()); } @@ -445,7 +445,7 @@ void gpu_device_ptr::fill( template template void gpu_device_ptr::copy_to_device(const matrix &data_to_copy) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (data_to_copy.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Too few data to perform copy (needed: {}, provided: {})!", this->size_padded(), data_to_copy.size_padded()) }; @@ -455,14 +455,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_device(data_to_copy, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (data_to_copy.size() < rcount) { @@ -473,7 +473,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const_host_pointer_type data_to_copy) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_device(data_to_copy, 0, this->size_padded()); @@ -482,7 +482,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_device_strided(const matrix &data_to_copy, const std::size_t start_row, const std::size_t num_rows) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (start_row + num_rows > data_to_copy.num_rows()) { throw gpu_device_ptr_exception{ fmt::format("Tried to copy lines {}-{} (zero-based index) to the device, but the matrix has only {} lines!", start_row, start_row + num_rows - 1, data_to_copy.num_rows()) }; @@ -508,7 +508,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device_strided(const std::vector &data_to_copy, std::size_t spitch, std::size_t width, std::size_t height) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (width > spitch) { throw gpu_device_ptr_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) }; @@ -523,7 +523,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_host(matrix &buffer) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (buffer.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Buffer too small to perform copy (needed: {}, provided: {})!", this->size_padded(), buffer.size_padded()) }; @@ -533,14 +533,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(std::vector &buffer) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_host(buffer, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_host(std::vector &buffer, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (buffer.size() < rcount) { @@ -551,7 +551,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(host_pointer_type buffer) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_host(buffer, 0, this->size_padded()); @@ -559,8 +559,8 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_other_device(derived_gpu_device_ptr &target) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); - PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! Maybe target has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(target.get() != device_pointer_type{}, "Invalid target pointer! Maybe target has been default constructed?"); this->copy_to_other_device(target, 0, this->size_padded()); } diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index 6e417524b..d25e879e4 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -84,7 +84,7 @@ device_ptr::~device_ptr() { template void device_ptr::memset(const int pattern, const size_type pos, const size_type num_bytes) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (pos >= this->size_padded()) { throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; @@ -104,7 +104,7 @@ void device_ptr::memset(const int pattern, const size_type pos, const size_ty template void device_ptr::fill(const value_type value, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (pos >= this->size_padded()) { throw backend_exception{ fmt::format("Illegal access in fill!: {} >= {}", pos, this->size_padded()) }; @@ -125,7 +125,7 @@ void device_ptr::fill(const value_type value, const size_type pos, const size template void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); @@ -142,7 +142,7 @@ void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const s template void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t spitch, const std::size_t width, const std::size_t height) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); if (width > spitch) { @@ -180,7 +180,7 @@ void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, template void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); @@ -197,8 +197,8 @@ void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, template void device_ptr::copy_to_other_device(device_ptr &target, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); - PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! Maybe target has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(target.get() != device_pointer_type{}, "Invalid target pointer! Maybe target has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (target.size_padded() < rcount) { From 38c27fea12edac6ceae5070117a9eebeb73643e9 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 9 Oct 2024 16:02:08 +0200 Subject: [PATCH 22/93] Update tests to support USM device_ptr and the cg_streaming solver. 
--- tests/backends/CUDA/detail/device_ptr.cpp | 21 +- tests/backends/HIP/detail/device_ptr.hip | 21 +- tests/backends/OpenCL/detail/device_ptr.cpp | 24 +- tests/backends/OpenCL/detail/utility.cpp | 4 +- .../SYCL/AdaptiveCpp/detail/device_ptr.cpp | 21 +- .../backends/SYCL/DPCPP/detail/device_ptr.cpp | 21 +- tests/backends/generic_csvm_tests.hpp | 5 +- tests/backends/generic_device_ptr_tests.hpp | 271 +++++++++++------- tests/backends/generic_gpu_csvm_tests.hpp | 78 ++++- tests/detail/cmd/parser_train.cpp | 2 +- tests/solver_types.cpp | 5 +- tests/types_to_test.hpp | 2 +- 12 files changed, 357 insertions(+), 118 deletions(-) diff --git a/tests/backends/CUDA/detail/device_ptr.cpp b/tests/backends/CUDA/detail/device_ptr.cpp index f97d0d8ab..52ba58ed5 100644 --- a/tests/backends/CUDA/detail/device_ptr.cpp +++ b/tests/backends/CUDA/detail/device_ptr.cpp @@ -18,10 +18,11 @@ #include // std::tuple -template +template struct cuda_device_ptr_test_type { using device_ptr_type = plssvm::cuda::detail::device_ptr; using queue_type = int; + static constexpr bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const queue_type queue = 0; @@ -29,7 +30,7 @@ struct cuda_device_ptr_test_type { } }; -using cuda_device_ptr_tuple = std::tuple, cuda_device_ptr_test_type>; +using cuda_device_ptr_tuple = std::tuple, cuda_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using cuda_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -40,3 +41,19 @@ INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtr, DevicePtr, cuda_device_ptr_type_gt INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtr, DevicePtrLayout, cuda_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtrDeathTest, DevicePtrDeathTest, cuda_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using cuda_device_ptr_usm_tuple = std::tuple, cuda_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using cuda_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using cuda_device_ptr_usm_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtrUSM, DevicePtr, cuda_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtrUSM, DevicePtrLayout, cuda_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtrUSMDeathTest, DevicePtrDeathTest, cuda_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/HIP/detail/device_ptr.hip b/tests/backends/HIP/detail/device_ptr.hip index ecf8ce92a..09e9992de 100644 --- a/tests/backends/HIP/detail/device_ptr.hip +++ b/tests/backends/HIP/detail/device_ptr.hip @@ -18,10 +18,11 @@ #include // std::tuple -template +template struct hip_device_ptr_test_type { using device_ptr_type = plssvm::hip::detail::device_ptr; using queue_type = int; + constexpr static bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const queue_type queue = 0; @@ -29,7 +30,7 @@ struct hip_device_ptr_test_type { } }; -using hip_device_ptr_tuple = std::tuple, hip_device_ptr_test_type>; +using hip_device_ptr_tuple = std::tuple, hip_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using hip_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -40,3 +41,19 @@ 
INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtr, DevicePtr, hip_device_ptr_type_gtes INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtr, DevicePtrLayout, hip_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtrDeathTest, DevicePtrDeathTest, hip_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using hip_device_ptr_usm_tuple = std::tuple, hip_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using hip_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using hip_device_ptr_usm_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtrUSM, DevicePtr, hip_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtrUSM, DevicePtrLayout, hip_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtrUSMDeathTest, DevicePtrDeathTest, hip_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/OpenCL/detail/device_ptr.cpp b/tests/backends/OpenCL/detail/device_ptr.cpp index b9a638e04..0ea0f4e40 100644 --- a/tests/backends/OpenCL/detail/device_ptr.cpp +++ b/tests/backends/OpenCL/detail/device_ptr.cpp @@ -13,6 +13,7 @@ #include "plssvm/backends/OpenCL/detail/command_queue.hpp" // plssvm::opencl::detail::command_queue #include "plssvm/backends/OpenCL/detail/context.hpp" // plssvm::opencl::detail::context #include "plssvm/backends/OpenCL/detail/utility.hpp" // plssvm::opencl::detail::get_contexts +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "tests/backends/generic_device_ptr_tests.hpp" // generic device pointer tests to instantiate #include "tests/naming.hpp" // naming::test_parameter_to_name @@ -23,19 +24,20 @@ #include // std::tuple #include // std::vector -template +template struct opencl_device_ptr_test_type { using device_ptr_type = plssvm::opencl::detail::device_ptr; using queue_type = plssvm::opencl::detail::command_queue; + constexpr static bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const std::vector contexts{ plssvm::opencl::detail::get_contexts(plssvm::target_platform::automatic).first }; - static const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].devices[0] }; + static const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].device }; return queue; } }; -using opencl_device_ptr_tuple = std::tuple, opencl_device_ptr_test_type>; +using opencl_device_ptr_tuple = std::tuple, opencl_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using opencl_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -46,3 +48,19 @@ INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtr, DevicePtr, opencl_device_ptr_typ INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtr, DevicePtrLayout, opencl_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtrDeathTest, DevicePtrDeathTest, opencl_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using opencl_device_ptr_usm_tuple = std::tuple, opencl_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using opencl_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using opencl_device_ptr_usm_layout_type_gtest = 
util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtrUSM, DevicePtr, opencl_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtrUSM, DevicePtrLayout, opencl_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtrUSMDeathTest, DevicePtrDeathTest, opencl_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/OpenCL/detail/utility.cpp b/tests/backends/OpenCL/detail/utility.cpp index 3204d15a5..fe24e594a 100644 --- a/tests/backends/OpenCL/detail/utility.cpp +++ b/tests/backends/OpenCL/detail/utility.cpp @@ -91,7 +91,7 @@ TEST(OpenCLUtility, get_opencl_target_version) { TEST(OpenCLUtility, get_driver_version) { // create a valid command queue const std::vector contexts{ plssvm::opencl::detail::get_contexts(plssvm::target_platform::automatic).first }; - const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].devices[0] }; + const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].device }; // the device name should not be empty const std::string driver_version = plssvm::opencl::detail::get_driver_version(queue); EXPECT_FALSE(driver_version.empty()); @@ -100,7 +100,7 @@ TEST(OpenCLUtility, get_driver_version) { TEST(OpenCLUtility, get_device_name) { // create a valid command queue const std::vector contexts{ plssvm::opencl::detail::get_contexts(plssvm::target_platform::automatic).first }; - const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].devices[0] }; + const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].device }; // the device name should not be empty const std::string name = plssvm::opencl::detail::get_device_name(queue); EXPECT_FALSE(name.empty()); diff --git a/tests/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp b/tests/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp index 2b1f5f558..7bfb8cd43 100644 --- a/tests/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp +++ b/tests/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp @@ -20,10 +20,11 @@ #include // std::tuple -template +template struct adaptivecpp_device_ptr_test_type { using device_ptr_type = plssvm::adaptivecpp::detail::device_ptr; using queue_type = typename device_ptr_type::queue_type; + constexpr static bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const queue_type queue = plssvm::adaptivecpp::detail::get_default_queue(); @@ -31,7 +32,7 @@ struct adaptivecpp_device_ptr_test_type { } }; -using adaptivecpp_device_ptr_tuple = std::tuple, adaptivecpp_device_ptr_test_type>; +using adaptivecpp_device_ptr_tuple = std::tuple, adaptivecpp_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using adaptivecpp_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -42,3 +43,19 @@ INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtr, DevicePtr, adaptivecpp_devi INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtr, DevicePtrLayout, adaptivecpp_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtrDeathTest, DevicePtrDeathTest, adaptivecpp_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using adaptivecpp_device_ptr_usm_tuple = std::tuple, adaptivecpp_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using 
adaptivecpp_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using adaptivecpp_device_ptr_usm_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtrUSM, DevicePtr, adaptivecpp_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtrUSM, DevicePtrLayout, adaptivecpp_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtrUSMDeathTest, DevicePtrDeathTest, adaptivecpp_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/SYCL/DPCPP/detail/device_ptr.cpp b/tests/backends/SYCL/DPCPP/detail/device_ptr.cpp index 06722fc3f..afbc9cd1b 100644 --- a/tests/backends/SYCL/DPCPP/detail/device_ptr.cpp +++ b/tests/backends/SYCL/DPCPP/detail/device_ptr.cpp @@ -20,10 +20,11 @@ #include // std::tuple -template +template struct dpcpp_device_ptr_test_type { using device_ptr_type = plssvm::dpcpp::detail::device_ptr; using queue_type = typename device_ptr_type::queue_type; + constexpr static bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const queue_type queue = plssvm::dpcpp::detail::get_default_queue(); @@ -31,7 +32,7 @@ struct dpcpp_device_ptr_test_type { } }; -using dpcpp_device_ptr_tuple = std::tuple, dpcpp_device_ptr_test_type>; +using dpcpp_device_ptr_tuple = std::tuple, dpcpp_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using dpcpp_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -42,3 +43,19 @@ INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtr, DevicePtr, dpcpp_device_ptr_type_ INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtr, DevicePtrLayout, dpcpp_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtrDeathTest, DevicePtrDeathTest, dpcpp_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using dpcpp_device_ptr_usm_tuple = std::tuple, dpcpp_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using dpcpp_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using dpcpp_device_ptr_usm_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtrUSM, DevicePtr, dpcpp_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtrUSM, DevicePtrLayout, dpcpp_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtrUSMDeathTest, DevicePtrDeathTest, dpcpp_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/generic_csvm_tests.hpp b/tests/backends/generic_csvm_tests.hpp index 562785728..4cfac24bb 100644 --- a/tests/backends/generic_csvm_tests.hpp +++ b/tests/backends/generic_csvm_tests.hpp @@ -194,6 +194,7 @@ template VALUE NOT USED! 
} case plssvm::solver_type::cg_explicit: + case plssvm::solver_type::cg_streaming: // no additional arguments are used return init_explicit_matrices(std::move(matr), csvm); case plssvm::solver_type::cg_implicit: @@ -900,7 +901,7 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix_minimal) { #else SUCCEED() << "Solver type is automatic, but assertions are disabled!"; #endif - } else if constexpr (solver == plssvm::solver_type::cg_explicit) { + } else if constexpr (solver == plssvm::solver_type::cg_explicit || solver == plssvm::solver_type::cg_streaming) { // run the assemble the kernel matrix kernels const std::vector kernel_matrix_d = svm.assemble_kernel_matrix(solver, params, data, q_red, QA_cost); ASSERT_EQ(kernel_matrix_d.size(), num_devices); @@ -1010,7 +1011,7 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix) { #else SUCCEED() << "Solver type is automatic, but assertions are disabled!"; #endif - } else if constexpr (solver == plssvm::solver_type::cg_explicit) { + } else if constexpr (solver == plssvm::solver_type::cg_explicit || solver == plssvm::solver_type::cg_streaming) { // run the assemble the kernel matrix kernels const std::vector kernel_matrix_d = svm.assemble_kernel_matrix(solver, params, data, q_red, QA_cost); ASSERT_EQ(kernel_matrix_d.size(), num_devices); diff --git a/tests/backends/generic_device_ptr_tests.hpp b/tests/backends/generic_device_ptr_tests.hpp index 6a8713dc7..2f142d49d 100644 --- a/tests/backends/generic_device_ptr_tests.hpp +++ b/tests/backends/generic_device_ptr_tests.hpp @@ -40,13 +40,14 @@ TYPED_TEST_SUITE_P(DevicePtr); TYPED_TEST_P(DevicePtr, default_construct) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; // default construct device_ptr const device_ptr_type ptr{}; // empty data EXPECT_FALSE(static_cast(ptr)); - EXPECT_EQ(ptr.get(), nullptr); + EXPECT_EQ(ptr.get(), data_ptr_type{}); EXPECT_EQ(ptr.size(), 0); EXPECT_EQ(ptr.shape(), (plssvm::shape{ 0, 0 })); EXPECT_TRUE(ptr.empty()); @@ -55,15 +56,17 @@ TYPED_TEST_P(DevicePtr, default_construct) { TYPED_TEST_P(DevicePtr, construct_size) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ 42, queue }; + const device_ptr_type ptr{ 42, queue, use_usm_allocations }; // check data EXPECT_TRUE(static_cast(ptr)); - EXPECT_NE(ptr.get(), nullptr); + EXPECT_NE(ptr.get(), data_ptr_type{}); EXPECT_EQ(ptr.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(ptr.padding(), (plssvm::shape{ 0, 0 })); @@ -73,15 +76,17 @@ TYPED_TEST_P(DevicePtr, construct_size) { TYPED_TEST_P(DevicePtr, construct_shape) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, queue }; + const 
device_ptr_type ptr{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; // check data EXPECT_TRUE(static_cast(ptr)); - EXPECT_NE(ptr.get(), nullptr); + EXPECT_NE(ptr.get(), data_ptr_type{}); EXPECT_EQ(ptr.shape(), (plssvm::shape{ 42, 16 })); // check padding EXPECT_EQ(ptr.padding(), (plssvm::shape{ 0, 0 })); @@ -91,15 +96,17 @@ TYPED_TEST_P(DevicePtr, construct_shape) { TYPED_TEST_P(DevicePtr, construct_shape_and_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 4 }, queue }; + const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 4 }, queue, use_usm_allocations }; // check data EXPECT_TRUE(static_cast(ptr)); - EXPECT_NE(ptr.get(), nullptr); + EXPECT_NE(ptr.get(), data_ptr_type{}); EXPECT_EQ(ptr.shape(), (plssvm::shape{ 42, 16 })); // check padding EXPECT_EQ(ptr.padding(), (plssvm::shape{ 4, 4 })); @@ -109,17 +116,19 @@ TYPED_TEST_P(DevicePtr, construct_shape_and_padding) { TYPED_TEST_P(DevicePtr, move_construct) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type first{ 42, queue }; + device_ptr_type first{ 42, queue, use_usm_allocations }; const device_ptr_type second{ std::move(first) }; // check data EXPECT_TRUE(static_cast(second)); // EXPECT_EQ(second.queue(), queue); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 })); @@ -127,7 +136,7 @@ TYPED_TEST_P(DevicePtr, move_construct) { // check moved-from data EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -137,17 +146,19 @@ TYPED_TEST_P(DevicePtr, move_construct) { TYPED_TEST_P(DevicePtr, move_construct_with_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue }; + device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; const device_ptr_type second{ std::move(first) }; // check data EXPECT_TRUE(static_cast(second)); // EXPECT_EQ(second.queue(), queue); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 })); 
@@ -155,7 +166,7 @@ TYPED_TEST_P(DevicePtr, move_construct_with_padding) { // check moved-from data EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -165,11 +176,13 @@ TYPED_TEST_P(DevicePtr, move_construct_with_padding) { TYPED_TEST_P(DevicePtr, move_assign) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type first{ 42, queue }; + device_ptr_type first{ 42, queue, use_usm_allocations }; device_ptr_type second; // move assign @@ -177,7 +190,7 @@ TYPED_TEST_P(DevicePtr, move_assign) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 })); @@ -185,7 +198,7 @@ TYPED_TEST_P(DevicePtr, move_assign) { // check moved-from data EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -195,11 +208,13 @@ TYPED_TEST_P(DevicePtr, move_assign) { TYPED_TEST_P(DevicePtr, move_assign_with_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue }; + device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; device_ptr_type second; // move assign @@ -207,7 +222,7 @@ TYPED_TEST_P(DevicePtr, move_assign_with_padding) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 })); @@ -215,7 +230,7 @@ TYPED_TEST_P(DevicePtr, move_assign_with_padding) { // check moved-from data EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -225,11 +240,13 @@ TYPED_TEST_P(DevicePtr, move_assign_with_padding) { TYPED_TEST_P(DevicePtr, swap_member_function) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct two device_ptr - device_ptr_type first{ 42, queue }; + 
device_ptr_type first{ 42, queue, use_usm_allocations }; device_ptr_type second{}; // swap both device_ptr using the member function @@ -237,14 +254,14 @@ TYPED_TEST_P(DevicePtr, swap_member_function) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 })); EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 42, 1 })); EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -254,11 +271,13 @@ TYPED_TEST_P(DevicePtr, swap_member_function) { TYPED_TEST_P(DevicePtr, swap_member_function_with_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct two device_ptr - device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue }; + device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; device_ptr_type second{}; // swap both device_ptr using the member function @@ -266,14 +285,14 @@ TYPED_TEST_P(DevicePtr, swap_member_function_with_padding) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 })); EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 46, 15 })); EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -283,12 +302,14 @@ TYPED_TEST_P(DevicePtr, swap_member_function_with_padding) { TYPED_TEST_P(DevicePtr, swap_free_function) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct two device_ptr - device_ptr_type first{ 42, queue }; - device_ptr_type second; + device_ptr_type first{ 42, queue, use_usm_allocations }; + device_ptr_type second{}; // swap both device_ptr using the free function using std::swap; @@ -296,14 +317,14 @@ TYPED_TEST_P(DevicePtr, swap_free_function) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 })); EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 42, 1 })); EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -313,12 +334,14 @@ 
TYPED_TEST_P(DevicePtr, swap_free_function) { TYPED_TEST_P(DevicePtr, swap_free_function_with_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct two device_ptr - device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue }; - device_ptr_type second; + device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; + device_ptr_type second{}; // swap both device_ptr using the free function using std::swap; @@ -326,14 +349,14 @@ TYPED_TEST_P(DevicePtr, swap_free_function_with_padding) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 })); EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 46, 15 })); EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -345,9 +368,10 @@ TYPED_TEST_P(DevicePtr, operator_bool) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_TRUE(static_cast(ptr1)); // construct empty device_ptr @@ -360,17 +384,18 @@ TYPED_TEST_P(DevicePtr, size) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_EQ(ptr1.size(), 42); // construct device_ptr with shape - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; EXPECT_EQ(ptr2.size(), 42 * 16); // construct device_ptr with shape and padding - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_EQ(ptr3.size(), 42 * 16); // construct empty device_ptr @@ -383,17 +408,18 @@ TYPED_TEST_P(DevicePtr, shape) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_EQ(ptr1.shape(), (plssvm::shape{ 42, 1 })); // construct device_ptr with shape - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; 
EXPECT_EQ(ptr2.shape(), (plssvm::shape{ 42, 16 })); // construct device_ptr with shape and padding - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_EQ(ptr3.shape(), (plssvm::shape{ 42, 16 })); // construct empty device_ptr @@ -406,21 +432,22 @@ TYPED_TEST_P(DevicePtr, empty) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_FALSE(ptr1.empty()); // construct device_ptr - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; EXPECT_FALSE(ptr2.empty()); // construct device_ptr - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_FALSE(ptr3.empty()); // construct device_ptr - const device_ptr_type ptr4{ plssvm::shape{ 0, 0 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr4{ plssvm::shape{ 0, 0 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_TRUE(ptr4.empty()); // construct empty device_ptr @@ -433,9 +460,10 @@ TYPED_TEST_P(DevicePtr, padding) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 5 }, queue }; + const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; EXPECT_EQ(ptr.padding(), (plssvm::shape{ 4, 5 })); ; } @@ -445,9 +473,10 @@ TYPED_TEST_P(DevicePtr, size_padded) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 5 }, queue }; + const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; EXPECT_EQ(ptr.size_padded(), (42 + 4) * (16 + 5)); } @@ -456,17 +485,18 @@ TYPED_TEST_P(DevicePtr, shape_padded) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_EQ(ptr1.shape_padded(), (plssvm::shape{ 42, 1 })); // construct device_ptr with shape - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; EXPECT_EQ(ptr2.shape_padded(), (plssvm::shape{ 42, 16 })); // construct device_ptr with shape and padding - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const 
device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_EQ(ptr3.shape_padded(), (plssvm::shape{ 45, 19 })); // construct empty device_ptr @@ -479,25 +509,26 @@ TYPED_TEST_P(DevicePtr, is_padded) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_FALSE(ptr1.is_padded()); // construct device_ptr - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; EXPECT_FALSE(ptr2.is_padded()); // construct device_ptr - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_TRUE(ptr3.is_padded()); // construct device_ptr - const device_ptr_type ptr4{ plssvm::shape{ 0, 0 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr4{ plssvm::shape{ 0, 0 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_TRUE(ptr4.is_padded()); // construct device_ptr - const device_ptr_type ptr5{ plssvm::shape{ 42, 16 }, plssvm::shape{ 0, 0 }, queue }; + const device_ptr_type ptr5{ plssvm::shape{ 42, 16 }, plssvm::shape{ 0, 0 }, queue, use_usm_allocations }; EXPECT_FALSE(ptr5.is_padded()); // construct empty device_ptr @@ -511,9 +542,10 @@ TYPED_TEST_P(DevicePtr, memset) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // memset values to all ones ptr.memset(1, 2); @@ -534,9 +566,10 @@ TYPED_TEST_P(DevicePtr, memset_with_numbytes) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // memset values to all ones ptr.memset(1, 2, 4 * sizeof(value_type)); @@ -556,9 +589,10 @@ TYPED_TEST_P(DevicePtr, memset_invalid_pos) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // perform invalid memset EXPECT_THROW_WHAT(ptr.memset(0, 10, 1), @@ -572,9 +606,10 @@ TYPED_TEST_P(DevicePtr, fill) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // fill values with a specific value ptr.fill(value_type{ 42.0 }, 2); @@ -595,9 +630,10 @@ 
TYPED_TEST_P(DevicePtr, fill_with_count) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // fill values with a specific value ptr.fill(value_type{ 42.0 }, 2, 4); @@ -618,9 +654,10 @@ TYPED_TEST_P(DevicePtr, fill_invalid_pos) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // perform invalid fill EXPECT_THROW_WHAT(ptr.fill(value_type{ 42.0 }, 10, 1), @@ -634,9 +671,10 @@ TYPED_TEST_P(DevicePtr, copy_vector) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device std::vector data(14, 42); @@ -657,9 +695,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_copy_back_all) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -680,9 +719,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_copy_back_some) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -703,9 +743,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_copy_to_too_many) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -726,9 +767,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_too_few_host_elements) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data to the device with too few elements std::vector data(8, 42); @@ -741,9 +783,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_too_few_buffer_elements) { using value_type = typename 
device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data back to the host with a buffer with too few elements std::vector buffer(8); @@ -756,9 +799,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_too_few_host_elements) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data to the device with too few elements std::vector data(4, 42); @@ -771,9 +815,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_too_few_buffer_elements) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // try copying data back to the host with a buffer with too few elements std::vector buffer(4); @@ -786,9 +831,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_strided) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -811,9 +857,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_strided_invalid_spitch_width_combination) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -828,9 +875,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_strided_submatrix_too_big) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -845,9 +893,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device 
std::vector data(14, 42); @@ -868,9 +917,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_with_count_copy_back_all) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -891,9 +941,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_with_count_copy_back_some) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -914,9 +965,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_with_count_copy_to_too_many) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -937,9 +989,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_strided) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -962,9 +1015,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_strided_invalid_spitch_width_combination) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -979,9 +1033,10 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device std::vector data(14, 42); @@ -990,7 +1045,7 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device) { ptr.copy_to_device(data); // other device_ptr - device_ptr_type other_ptr{ 10, queue }; + device_ptr_type other_ptr{ 10, queue, !use_usm_allocations }; ptr.copy_to_other_device(other_ptr); // copy data back to the host @@ -1007,9 +1062,10 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device_too_few_device_elements) using value_type = typename device_ptr_type::value_type; using queue_type = 
typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device std::vector data(14, 42); @@ -1018,7 +1074,7 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device_too_few_device_elements) ptr.copy_to_device(data); // other device_ptr - device_ptr_type other_ptr{ 5, queue }; + device_ptr_type other_ptr{ 5, queue, !use_usm_allocations }; EXPECT_THROW_WHAT(ptr.copy_to_other_device(other_ptr), plssvm::exception, "Buffer too small to perform copy (needed: 10, provided: 5)!"); } @@ -1028,9 +1084,10 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device_with_count) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device std::vector data(14, 42); @@ -1039,7 +1096,7 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device_with_count) { ptr.copy_to_device(data); // other device_ptr - device_ptr_type other_ptr{ 5, queue }; + device_ptr_type other_ptr{ 5, queue, !use_usm_allocations }; ptr.copy_to_other_device(other_ptr, 1, 5); // copy data back to the host @@ -1113,9 +1170,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device const plssvm::matrix data{ plssvm::shape{ 5, 3 }, value_type{ 42 } }; @@ -1146,9 +1204,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_with_padding) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 5, 3 }, plssvm::shape{ 4, 4 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 5, 3 }, plssvm::shape{ 4, 4 }, queue, use_usm_allocations }; // create data to copy to the device const plssvm::matrix data{ plssvm::shape{ 5, 3 }, value_type{ 42 }, plssvm::shape{ 4, 4 } }; @@ -1171,9 +1230,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_different_layouts) { const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; constexpr plssvm::layout_type other_layout = layout == plssvm::layout_type::aos ? 
plssvm::layout_type::soa : plssvm::layout_type::aos; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device const plssvm::matrix data{ plssvm::shape{ 5, 3 }, value_type{ 42 } }; @@ -1204,9 +1264,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_too_few_host_elements) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data to the device with too few elements plssvm::matrix data{ plssvm::shape{ 2, 4 }, value_type{ 42 } }; @@ -1220,9 +1281,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_too_few_buffer_elements) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data back to the host with a buffer with too few elements plssvm::matrix buffer{ plssvm::shape{ 2, 4 } }; @@ -1236,9 +1298,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape(2, 3), queue }; + device_ptr_type ptr{ plssvm::shape(2, 3), queue, use_usm_allocations }; // create data to copy to the device const auto data = util::generate_specific_matrix>(plssvm::shape{ 5, 3 }); @@ -1261,9 +1324,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided_with_padding) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 3, 3 }, plssvm::shape{ 4, 4 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 3, 3 }, plssvm::shape{ 4, 4 }, queue, use_usm_allocations }; // create data to copy to the device const auto data = util::generate_specific_matrix>(plssvm::shape{ 5, 3 }, plssvm::shape{ 4, 4 }); @@ -1291,9 +1355,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided_different_layouts) { const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; constexpr plssvm::layout_type other_layout = layout == plssvm::layout_type::aos ? 
plssvm::layout_type::soa : plssvm::layout_type::aos; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 3 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 3 }, queue, use_usm_allocations }; // create data to copy to the device const auto data = util::generate_specific_matrix>(plssvm::shape{ 5, 3 }); @@ -1325,9 +1390,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_full_matrix_strided) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape(5, 3), queue }; + device_ptr_type ptr{ plssvm::shape(5, 3), queue, use_usm_allocations }; // create data to copy to the device const auto data = util::generate_specific_matrix>(plssvm::shape{ 5, 3 }); @@ -1349,9 +1415,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided_too_few_host_elements) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 5 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 5 }, queue, use_usm_allocations }; // try copying data to the device with too few elements plssvm::matrix data{ plssvm::shape{ 2, 4 }, value_type{ 42 } }; @@ -1365,9 +1432,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided_invalid_submatrix) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 5 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 5 }, queue, use_usm_allocations }; // try copying data to the device with too few elements plssvm::matrix data{ plssvm::shape{ 4, 5 }, value_type{ 42 } }; @@ -1423,9 +1491,10 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_invalid_host_ptr) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // copy with invalid data pointer EXPECT_DEATH(ptr.copy_to_device(nullptr), ::testing::HasSubstr("Invalid host pointer for the data to copy!")); @@ -1456,9 +1525,10 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_with_count_invalid_host_ptr) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // copy with invalid data pointer EXPECT_DEATH(ptr.copy_to_device(nullptr, 0, 10), ::testing::HasSubstr("Invalid host pointer for the data to copy!")); @@ -1486,9 +1556,10 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_strided_invalid_host_ptr) { using device_ptr_type = 
typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 1, queue }; + device_ptr_type ptr{ 1, queue, use_usm_allocations }; // copy with invalid data pointer EXPECT_DEATH(ptr.copy_to_device_strided(nullptr, 0, 0, 0), ::testing::HasSubstr("Invalid host pointer for the data to copy!")); @@ -1513,10 +1584,11 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_to_other_device_invalid_device_ptr) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct default device_ptr device_ptr_type def{}; - device_ptr_type ptr{ 2, queue }; + device_ptr_type ptr{ 2, queue, use_usm_allocations }; // copy with invalid device pointer EXPECT_DEATH(def.copy_to_other_device(ptr), ::testing::HasSubstr("Invalid data pointer! Maybe *this has been default constructed?")); @@ -1528,10 +1600,11 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_to_other_device_with_count_invalid_device_ using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct default device_ptr device_ptr_type def{}; - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // copy with invalid device pointer EXPECT_DEATH(def.copy_to_other_device(ptr, 0, 10), ::testing::HasSubstr("Invalid data pointer! Maybe *this has been default constructed?")); diff --git a/tests/backends/generic_gpu_csvm_tests.hpp b/tests/backends/generic_gpu_csvm_tests.hpp index dea31b85c..dfd0b2bb4 100644 --- a/tests/backends/generic_gpu_csvm_tests.hpp +++ b/tests/backends/generic_gpu_csvm_tests.hpp @@ -409,7 +409,82 @@ TYPED_TEST_P(GenericGPUCSVMKernelFunction, run_assemble_kernel_matrix_explicit) const plssvm::detail::execution_range exec{ block, svm.get_max_work_group_size(device_id), grid, svm.get_max_grid_size(device_id) }; // calculate the current part of the kernel matrix - const device_ptr_type kernel_matrix_d = svm.run_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost); + const device_ptr_type kernel_matrix_d = svm.run_assemble_kernel_matrix_explicit(device_id, exec, params, false, data_d, q_red_d, QA_cost); + + // copy the kernel matrix back to the host + std::vector kernel_matrix(kernel_matrix_d.size()); + kernel_matrix_d.copy_to_host(kernel_matrix); + + // calculate ground truth + const std::vector correct_kernel_matrix = ground_truth::assemble_device_specific_kernel_matrix(params, data_matr, q_red, QA_cost, *svm.data_distribution_, device_id); + + // check for correctness + ASSERT_EQ(kernel_matrix.size(), correct_kernel_matrix.size()); + EXPECT_FLOATING_POINT_VECTOR_NEAR_EPS(kernel_matrix, correct_kernel_matrix, 1e6); + } +} + +TYPED_TEST_P(GenericGPUCSVMKernelFunction, run_assemble_kernel_matrix_explicit_USM) { + using csvm_test_type = util::test_parameter_type_at_t<0, TypeParam>; + using mock_csvm_type = typename csvm_test_type::mock_csvm_type; + using device_ptr_type = typename csvm_test_type::device_ptr_type; + constexpr plssvm::kernel_function_type kernel = util::test_parameter_value_at_v<0, TypeParam>; + + plssvm::parameter params{ 
plssvm::kernel_type = kernel }; + if constexpr (kernel != plssvm::kernel_function_type::linear) { + params.gamma = plssvm::real_type{ 0.001 }; + } + const plssvm::data_set data{ PLSSVM_TEST_FILE }; + auto data_matr{ data.data() }; + if constexpr (kernel == plssvm::kernel_function_type::chi_squared) { + // chi-squared is well-defined for non-negative values only + data_matr = util::matrix_abs(data_matr); + } + + // create C-SVM: must be done using the mock class since the member function to test is private or protected + const mock_csvm_type svm = util::construct_from_tuple(params, csvm_test_type::additional_arguments); + const std::size_t num_devices = svm.num_available_devices(); + // be sure to use the correct data distribution + svm.data_distribution_ = std::make_unique(data.num_data_points() - 1, num_devices); + + // perform dimensional reduction + const auto [q_red, QA_cost] = ground_truth::perform_dimensional_reduction(params, data_matr); + + for (std::size_t device_id = 0; device_id < num_devices; ++device_id) { + SCOPED_TRACE(fmt::format("device_id {} ({}/{})", device_id, device_id + 1, num_devices)); + + // check whether the current device is responsible for at least one data point! + if (svm.data_distribution_->place_specific_num_rows(device_id) == 0) { + continue; + } + auto &device = svm.devices_[device_id]; + + // upload complete A and q_red to each device + device_ptr_type data_d{ data_matr.shape(), data_matr.padding(), device }; + data_d.copy_to_device(data_matr); + + device_ptr_type q_red_d{ q_red.size() + plssvm::PADDING_SIZE, device }; + q_red_d.copy_to_device(q_red, 0, q_red.size()); + + // kernel launch specific sizes + const unsigned long long num_rows_reduced = data_matr.shape().x; + const unsigned long long device_specific_num_rows = svm.data_distribution_->place_specific_num_rows(device_id); + const unsigned long long device_row_offset = svm.data_distribution_->place_row_offset(device_id); + + // the block dimension is THREAD_BLOCK_SIZE x THREAD_BLOCK_SIZE + const plssvm::detail::dim_type block{ std::size_t{ plssvm::THREAD_BLOCK_SIZE }, std::size_t{ plssvm::THREAD_BLOCK_SIZE } }; + + // define the full execution grid + const plssvm::detail::dim_type grid{ + static_cast(std::ceil(static_cast(num_rows_reduced - device_row_offset) / static_cast(block.x * plssvm::INTERNAL_BLOCK_SIZE))), + static_cast(std::ceil(static_cast(device_specific_num_rows) / static_cast(block.y * plssvm::INTERNAL_BLOCK_SIZE))) + }; + + // create the final execution range + const plssvm::detail::execution_range exec{ block, svm.get_max_work_group_size(device_id), grid, svm.get_max_grid_size(device_id) }; + + // calculate the current part of the kernel matrix + const device_ptr_type kernel_matrix_d = svm.run_assemble_kernel_matrix_explicit(device_id, exec, params, true, data_d, q_red_d, QA_cost); // copy the kernel matrix back to the host std::vector kernel_matrix(kernel_matrix_d.size()); @@ -606,6 +681,7 @@ TYPED_TEST_P(GenericGPUCSVMKernelFunction, run_predict_kernel) { REGISTER_TYPED_TEST_SUITE_P(GenericGPUCSVMKernelFunction, run_assemble_kernel_matrix_explicit, + run_assemble_kernel_matrix_explicit_USM, run_assemble_kernel_matrix_implicit_blas_level_3, run_predict_kernel); diff --git a/tests/detail/cmd/parser_train.cpp b/tests/detail/cmd/parser_train.cpp index ba1392d75..54ea0de43 100644 --- a/tests/detail/cmd/parser_train.cpp +++ b/tests/detail/cmd/parser_train.cpp @@ -383,7 +383,7 @@ TEST_P(ParserTrainSolver, parsing) { // clang-format off INSTANTIATE_TEST_SUITE_P(ParserTrain, 
ParserTrainSolver, ::testing::Combine( ::testing::Values("-l", "--solver"), - ::testing::Values(plssvm::solver_type::automatic, plssvm::solver_type::cg_explicit, plssvm::solver_type::cg_implicit)), + ::testing::Values(plssvm::solver_type::automatic, plssvm::solver_type::cg_explicit, plssvm::solver_type::cg_streaming, plssvm::solver_type::cg_implicit)), naming::pretty_print_parameter_flag_and_value); // clang-format on diff --git a/tests/solver_types.cpp b/tests/solver_types.cpp index acf5a5464..f0fc68e03 100644 --- a/tests/solver_types.cpp +++ b/tests/solver_types.cpp @@ -22,12 +22,13 @@ TEST(SolverType, to_string) { // check conversions to std::string EXPECT_CONVERSION_TO_STRING(plssvm::solver_type::automatic, "automatic"); EXPECT_CONVERSION_TO_STRING(plssvm::solver_type::cg_explicit, "cg_explicit"); + EXPECT_CONVERSION_TO_STRING(plssvm::solver_type::cg_streaming, "cg_streaming"); EXPECT_CONVERSION_TO_STRING(plssvm::solver_type::cg_implicit, "cg_implicit"); } TEST(SolverType, to_string_unknown) { // check conversions to std::string from unknown solver_type - EXPECT_CONVERSION_TO_STRING(static_cast(3), "unknown"); + EXPECT_CONVERSION_TO_STRING(static_cast(4), "unknown"); } // check whether the std::string -> plssvm::solver_type conversions are correct @@ -39,6 +40,8 @@ TEST(SolverType, from_string) { EXPECT_CONVERSION_FROM_STRING("AUTO", plssvm::solver_type::automatic); EXPECT_CONVERSION_FROM_STRING("cg_explicit", plssvm::solver_type::cg_explicit); EXPECT_CONVERSION_FROM_STRING("CG_Explicit", plssvm::solver_type::cg_explicit); + EXPECT_CONVERSION_FROM_STRING("cg_streaming", plssvm::solver_type::cg_streaming); + EXPECT_CONVERSION_FROM_STRING("CG_Streaming", plssvm::solver_type::cg_streaming); EXPECT_CONVERSION_FROM_STRING("cg_implicit", plssvm::solver_type::cg_implicit); EXPECT_CONVERSION_FROM_STRING("CG_Implicit", plssvm::solver_type::cg_implicit); } diff --git a/tests/types_to_test.hpp b/tests/types_to_test.hpp index 44db342b3..f8f5fc4de 100644 --- a/tests/types_to_test.hpp +++ b/tests/types_to_test.hpp @@ -475,7 +475,7 @@ constexpr std::array classification_types_to_tes }; /// A list of all available solver types. constexpr std::array solver_types_to_test = { - plssvm::solver_type::automatic, plssvm::solver_type::cg_explicit, plssvm::solver_type::cg_implicit + plssvm::solver_type::automatic, plssvm::solver_type::cg_explicit, plssvm::solver_type::cg_streaming, plssvm::solver_type::cg_implicit }; /// A list of all solver types. From 91b75b36568f6dbf1720fbc1e48d82da9c679010 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 14 Oct 2024 16:13:05 +0200 Subject: [PATCH 23/93] Add missing data set size contribution. 
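For the cg_streaming solver, the kernel matrix is kept in host memory while the remaining buffers stay on the devices. The host-side memory estimate so far only accounted for the kernel matrix itself; the data set, the reduced q vector, and the BLAS matrices also reside in system memory, so their sizes are now charged to the host as well, attributed once to device 0 so they are not counted per device.

Roughly, the added accounting looks like the following sketch (illustrative only; the names data_set_size, q_red_size, and blas_matrices_size follow data_distribution.cpp, and memory_size is assumed to wrap a byte count):

    // charge the shared host-resident buffers exactly once (attributed to device 0)
    if (device_id == 0) {
        res.first += memory_size{ sizeof(real_type) * (data_set_size + q_red_size + blas_matrices_size) };
    }
    // per-device memory: q vector plus the larger of the data set and the BLAS matrices
    res.second[device_id] = memory_size{ sizeof(real_type) * (q_red_size + std::max(data_set_size, blas_matrices_size)) };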
--- src/plssvm/detail/data_distribution.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/plssvm/detail/data_distribution.cpp b/src/plssvm/detail/data_distribution.cpp index 016260389..9992d113d 100644 --- a/src/plssvm/detail/data_distribution.cpp +++ b/src/plssvm/detail/data_distribution.cpp @@ -205,7 +205,11 @@ std::pair> triangular_data_distribution::c // add up the individual sizes and report the memory size in BYTES // for streaming, the kernel matrix is on the host, while everything else is on the device - res.first += memory_size{ sizeof(real_type) * kernel_matrix_size }; + res.first += memory_size{ sizeof(real_type) }; + if (device_id == 0) { + // we also store the data set, q vector and BLAS matrices on the system + res.first += memory_size{ sizeof(real_type) * (data_set_size + q_red_size + blas_matrices_size) }; + } res.second[device_id] = memory_size{ sizeof(real_type) * (q_red_size + std::max(data_set_size, blas_matrices_size)) }; } From 18691a50edf49837d76abe8b4197eb0f3d4363e5 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 24 May 2025 22:07:03 +0200 Subject: [PATCH 24/93] Improve performance (mainly on AMD GPUs) and change implementations slightly such that the backends are more similar. --- .../cg_explicit/kernel_matrix_assembly.cuh | 118 +++++++------- .../kernel_matrix_assembly.hip.hpp | 118 +++++++------- .../cg_explicit/kernel_matrix_assembly.hpp | 2 +- .../cg_explicit/kernel_matrix_assembly.hpp | 126 +++++++-------- .../cg_explicit/kernel_matrix_assembly.cl | 91 +++++------ .../basic/kernel_matrix_assembly.hpp | 55 +++---- .../hierarchical/kernel_matrix_assembly.hpp | 144 +++++++++--------- .../scoped/kernel_matrix_assembly.hpp | 135 ++++++++-------- .../work_group/kernel_matrix_assembly.hpp | 124 +++++++-------- .../cg_explicit/kernel_matrix_assembly.hpp | 2 +- include/plssvm/constants.hpp | 7 +- src/plssvm/backends/Kokkos/csvm.cpp | 10 +- src/plssvm/backends/OpenCL/detail/utility.cpp | 4 +- .../detail/tracking/performance_tracker.cpp | 4 +- 14 files changed, 482 insertions(+), 458 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index 8a766b7db..2a3eef5c4 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -14,20 +14,22 @@ #pragma once #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. 
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -37,80 +39,84 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const 
std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching data point features + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread paying attention to coalesced memory accesses 
+ const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = 
detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp index 75a3cd9a5..f0e01f813 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp @@ -14,23 +14,25 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -40,80 +42,84 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... 
kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching data point features + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread paying attention to coalesced memory accesses + const auto i_linear = blockIdx_x * blockDim_x * 
INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i] - q[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp index e575c6af2..af1d3c9e2 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,7 +15,7 @@ #pragma once #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp index 8e42e8b41..2a83b311f 100644 --- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents @@ -41,11 +41,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the Kokkos kernel function object. 
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -55,12 +55,12 @@ class device_kernel_assembly { * @param[in] grid_size_x the size of the execution grid in x-dimension * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(device_view_type kernel_matrix_d, device_view_type data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(device_view_type kernel_matrix, device_view_type data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -78,80 +78,84 @@ class device_kernel_assembly { KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching data point features + constexpr std::size_t shmem_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - Kokkos::mdspan> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_j{
data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_i_cache{ data_cache_ptr, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_j_cache{ data_cache_ptr + shmem_size, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + // only calculate the upper triangular matrix -> can't use team.team_rank() since all threads in a team must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - } - team.team_barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), - data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + { + // calculate the indices used in the current thread paying attention to coalesced memory accesses + const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in 
the shared memory + data_i_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) + // calculate the indices to access the global data points and wrt the current device const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -160,11 +164,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - device_view_type kernel_matrix_d_; - device_view_type data_d_; + device_view_type kernel_matrix_; + device_view_type data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const
std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; device_view_type q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl index 481945ca6..99bc02933 100644 --- a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl +++ b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl @@ -14,11 +14,11 @@ /** * @brief Create the explicit kernel matrix using the kernel function determined at runtime. * @details The `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST`, `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER`, `PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION`, and `PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION` placeholder will be replaced by the correct values upon kernel construction. - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -27,78 +27,83 @@ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST a placeholder that is used to string replace the correct kernel parameter (attention: no comma!; Args... 
only added for Doxygen) */ -__kernel void device_kernel_assembly(__global real_type *kernel_matrix_d, const __global real_type *data_d, const ulong num_rows, const ulong device_num_rows, const ulong row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { +__kernel void device_kernel_assembly(__global real_type *kernel_matrix, const __global real_type *data, const ulong num_rows, const ulong device_num_rows, const ulong device_row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large // create the local memory arrays used for caching data point features - __local real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_x >= blockIdx_y) { - // create a thread private array used for internal caching + // create a private memory array used for internal caching real_type 
temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + { + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; + const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_i[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_j[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - data_cache_j[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_ul) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const ulong global_i_linear = device_row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + const ulong global_j_linear = device_row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - // perform the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_cache_i[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_cache_j[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + // store the values in the local memory + data_i_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i_linear]; + data_j_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j_linear]; + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + + // perform the feature reduction calculation + for (uint block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + 
temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; + const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data points and wrt the current device const ulong device_global_i = i + (ulong) internal_i; - const ulong global_i = row_offset + i + (ulong) internal_i; + const ulong global_i = device_row_offset + device_global_i; const ulong device_global_j = j + (ulong) internal_j; - const ulong global_j = row_offset + j + (ulong) internal_j; + const ulong global_j = device_row_offset + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp_ij PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i] - q[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ul) - device_global_j * (device_global_j + (ulong) 1) / (ulong) 2 + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_ul) - device_global_j * (device_global_j + (ulong) 1) / (ulong) 2 + device_global_i] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp index 65587ddaa..22b24bae0 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -14,7 +14,7 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::item @@ -35,11 +35,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the SYCL kernel function object. 
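// Illustrative sketch: every assembly kernel in this patch stores its result through the same
// packed upper-triangular index, device_global_j * (num_rows - device_row_offset + PADDING_SIZE)
// - device_global_j * (device_global_j + 1) / 2 + device_global_i. The host-side helper below only
// demonstrates how that formula linearizes (i, j) pairs with i >= j; packed_index, n_device_rows,
// and padding are hypothetical names chosen for this sketch, where n_device_rows stands for
// num_rows - device_row_offset.
#include <cstddef>

constexpr std::size_t packed_index(const std::size_t device_global_i, const std::size_t device_global_j,
                                   const std::size_t n_device_rows, const std::size_t padding) {
    // column j owns (n_device_rows + padding - j) slots: the valid entries i = j .. n_device_rows - 1
    // plus `padding` trailing slots that are written but never read, avoiding bounds checks
    return device_global_j * (n_device_rows + padding)
           - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 }
           + device_global_i;
}

int main() {
    constexpr std::size_t n = 6;    // rows from this device's first row to the end (illustrative)
    constexpr std::size_t pad = 2;  // padding slots per column (illustrative)

    static_assert(packed_index(0, 0, n, pad) == 0);                                        // column 0 starts at index 0
    static_assert(packed_index(1, 1, n, pad) == n + pad);                                  // column 0 occupies n + pad slots
    static_assert(packed_index(2, 2, n, pad) - packed_index(1, 1, n, pad) == n + pad - 1); // each later column shrinks by one slot
    return 0;
}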
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,12 +48,12 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -69,25 +69,27 @@ class device_kernel_assembly { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; + // only calculate the upper triangular matrix if (i >= j) { - // create a work-item private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; for (std::size_t dim = 0; dim < num_features_; ++dim) { // perform the feature reduction calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + const auto global_i = device_row_offset_ + i + static_cast(internal_i); + const auto global_j = device_row_offset_ + j + static_cast(internal_j); + temp[internal_i][internal_j] += detail::feature_reduce(data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], + data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); } } } @@ -95,22 +97,23 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) + // calculate the indices to access the global data points and wrt the current device const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && 
global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -119,11 +122,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp index b09fef0f8..d3e37ca54 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -14,11 +14,12 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -35,11 +36,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the SYCL kernel function object. 
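// Illustrative sketch: after the feature reduction, every kernel above finishes an entry the same
// way: apply the kernel function, add the dimensional-reduction terms QA_cost - q[i] - q[j], and
// add `cost` on the main diagonal. The scalar helper below only mirrors that final step;
// assemble_entry and the toy values are hypothetical, and k_ij stands for the value returned by
// the already-applied kernel function.
#include <cstddef>
#include <vector>

using real_type = double;  // assumption for this sketch; the library configures real_type itself

real_type assemble_entry(const real_type k_ij, const std::vector<real_type> &q,
                         const std::size_t global_i, const std::size_t global_j,
                         const real_type QA_cost, const real_type cost) {
    real_type temp_ij = k_ij + QA_cost - q[global_i] - q[global_j];
    if (global_i == global_j) {
        temp_ij += cost;  // regularization term only on the main diagonal
    }
    return temp_ij;
}

int main() {
    const std::vector<real_type> q{ 0.5, 1.0, 1.5 };  // toy q vector from the dimensional reduction
    const real_type QA_cost = 2.0;
    const real_type cost = 10.0;
    const real_type off_diag = assemble_entry(3.0, q, 0, 1, QA_cost, cost);  // 3.0 + 2.0 - 0.5 - 1.0 = 3.5
    const real_type diag = assemble_entry(3.0, q, 1, 1, QA_cost, cost);      // 3.0 + 2.0 - 1.0 - 1.0 + 10.0 = 13.0
    return (off_diag == 3.5 && diag == 13.0) ? 0 : 1;
}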
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,12 +49,12 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -68,67 +69,47 @@ class device_kernel_assembly { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; - - ::sycl::private_memory temp{ group }; - - // initialize private and local variables - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; - } - } - }); + // create two local memory arrays used for caching data point features + real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - // implicit group barrier + // create a private memory array used for internal caching + ::sycl::private_memory, INTERNAL_BLOCK_SIZE>, 2> temp{ group }; - // exploit symmetry + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = 
static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; } }); @@ -136,14 +117,15 @@ class device_kernel_assembly { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent 
implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -154,26 +136,40 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = 
temp(idx)[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -183,11 +179,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp index 4ed3764ce..33c725a46 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -14,11 +14,12 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -35,11 +36,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the SYCL kernel function object. 
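// Illustrative sketch: the hierarchical and scoped kernels use the same two-level blocking as the
// other backends: the feature dimension is walked in tiles (now THREAD_BLOCK_SIZE wide instead of
// FEATURE_BLOCK_SIZE wide), each tile is staged in a small cache, and every work-item accumulates
// an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE block of partial results in private memory. The
// serial, host-side analogue below only demonstrates that loop structure; all names and sizes are
// hypothetical and the "feature reduction" is a plain dot product.
#include <array>
#include <cstddef>
#include <vector>

constexpr unsigned tile_width = 4;      // plays the role of THREAD_BLOCK_SIZE
constexpr unsigned internal_block = 2;  // plays the role of INTERNAL_BLOCK_SIZE

std::array<std::array<double, internal_block>, internal_block>
blocked_reduce(const std::vector<std::vector<double>> &points,
               const std::size_t i0, const std::size_t j0, const std::size_t num_features) {
    std::array<std::array<double, internal_block>, internal_block> temp{};  // private accumulator tile
    std::array<std::array<double, internal_block>, tile_width> cache_i{};   // stand-in for the data_i cache
    std::array<std::array<double, internal_block>, tile_width> cache_j{};   // stand-in for the data_j cache

    for (std::size_t dim = 0; dim < num_features; dim += tile_width) {
        // "load data into the cache" step: stage one feature tile for each blocked point
        for (unsigned f = 0; f < tile_width; ++f) {
            for (unsigned b = 0; b < internal_block; ++b) {
                const std::size_t feature = dim + f;
                cache_i[f][b] = feature < num_features ? points[i0 + b][feature] : 0.0;  // zero-pad the last tile
                cache_j[f][b] = feature < num_features ? points[j0 + b][feature] : 0.0;
            }
        }
        // "feature reduction" step: consume the cached tile for all internal_block x internal_block pairs
        for (unsigned f = 0; f < tile_width; ++f) {
            for (unsigned bi = 0; bi < internal_block; ++bi) {
                for (unsigned bj = 0; bj < internal_block; ++bj) {
                    temp[bi][bj] += cache_i[f][bi] * cache_j[f][bj];
                }
            }
        }
    }
    return temp;
}

int main() {
    // four toy data points with five features each
    const std::vector<std::vector<double>> points{
        { 1, 0, 2, 0, 1 }, { 0, 1, 1, 1, 0 }, { 2, 2, 0, 1, 1 }, { 1, 1, 1, 1, 1 }
    };
    const auto tile = blocked_reduce(points, 0, 2, 5);
    // tile[0][0] is the dot product of points[0] and points[2]: 1*2 + 0*2 + 2*0 + 0*1 + 1*1 = 3
    return tile[0][0] == 3.0 ? 0 : 1;
}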
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,12 +49,12 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -71,94 +72,100 @@ class device_kernel_assembly { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); - - // exploit symmetry + ::sycl::require_local_mem(), // data_i_cache + ::sycl::require_local_mem(), // data_j_cache + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), // temp + [&](auto &data_i_cache, auto &data_j_cache, auto &temp) { + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const std::size_t threadIdx_x = 
idx.get_local_id(group, 0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; } }); - // perform calculations + // perform the feature reduction calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += 
detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); } } } }); } + // apply the remaining part of the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp(idx)[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update 
the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -169,11 +176,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 96030fbe7..6e7fd2033 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -14,7 +14,7 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -36,11 +36,11 @@ class device_kernel_assembly { /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -49,14 +49,14 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - data_cache_i_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_j_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + data_i_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_j_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -76,74 +76,78 @@ class device_kernel_assembly { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = 
nd_idx.get_group(0) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { - // create a work-item private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + 
local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) + // calculate the indices to access the global data points and wrt the current device const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -152,16 +156,16 @@ class device_kernel_assembly { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor data_i_cache_; /// Local memory used for internal memory access optimizations. 
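The packed index used above to store temp_ij lays the padded upper-triangular kernel matrix out row by row. A minimal host-side sketch of that mapping, with num_rows, device_row_offset, and padding standing in for the corresponding kernel members (illustrative only, not library code):

    #include <cstddef>

    // Packed index of entry (device_global_i, device_global_j), with device_global_i >= device_global_j,
    // in the row-wise padded upper-triangular storage used for the explicit kernel matrix.
    std::size_t packed_upper_triangular_index(const std::size_t device_global_i,
                                              const std::size_t device_global_j,
                                              const std::size_t num_rows,
                                              const std::size_t device_row_offset,
                                              const std::size_t padding) {
        const std::size_t padded_row_length = num_rows - device_row_offset + padding;
        // packed row j starts at j * padded_row_length - j * (j - 1) / 2; adding the column
        // shift (device_global_i - device_global_j) folds into the j * (j + 1) / 2 term below
        return device_global_j * padded_row_length
               - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 }
               + device_global_i;
    }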
- ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor data_j_cache_; /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp index 93772aab3..51e11a282 100644 --- a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -14,7 +14,7 @@ #pragma once #include "plssvm/backends/stdpar/kernel/kernel_functions.hpp" // plssvm::stdpar::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix diff --git a/include/plssvm/constants.hpp b/include/plssvm/constants.hpp index e99dbeddd..81d992991 100644 --- a/include/plssvm/constants.hpp +++ b/include/plssvm/constants.hpp @@ -38,11 +38,8 @@ constexpr unsigned INTERNAL_BLOCK_SIZE = PLSSVM_INTERNAL_BLOCK_SIZE; constexpr unsigned INTERNAL_BLOCK_SIZE = 4; #endif -/// Global compile time constant used for internal feature caching. -constexpr unsigned FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE; - -/// Padding used for the device w_d matrix to prevent out-of-bounce accesses without ifs. -constexpr unsigned PADDING_SIZE = FEATURE_BLOCK_SIZE > (THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) ? FEATURE_BLOCK_SIZE : (THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE); +/// Padding used for the device arrays and matrices to prevent out-of-bounce accesses without ifs. +constexpr unsigned PADDING_SIZE = THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE; // perform sanity checks static_assert(detail::tuple_contains_v, "Illegal real type provided! 
See the 'real_type_list' in the type_list.hpp header for a list of the allowed types."); diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp index 2bf512433..e18c88328 100644 --- a/src/plssvm/backends/Kokkos/csvm.cpp +++ b/src/plssvm/backends/Kokkos/csvm.cpp @@ -20,7 +20,7 @@ #include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::kokkos::detail::device_kernel_assembly #include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::kokkos::detail::device_kernel_assembly_symm #include "plssvm/backends/Kokkos/kernel/predict_kernel.hpp" // plssvm::kokkos::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} -#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE +#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked @@ -414,7 +414,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons device_ptr_type kernel_matrix_d{ num_entries_padded, devices_[device_id] }; // only explicitly store the upper triangular matrix const real_type cost_factor = real_type{ 1.0 } / params.cost; - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); @@ -492,7 +492,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : // get the offset of the data points this device is responsible for const unsigned long long row_offset = data_distribution_->place_row_offset(device_id); // the necessary amount of scratch memory for the kernels - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); @@ -592,7 +592,7 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de const unsigned long long row_offset = data_distribution_->place_row_offset(device_id); const real_type cost_factor = real_type{ 1.0 } / params.cost; - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); @@ -702,7 +702,7 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, devices_[device_id] }; - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size 
= static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); diff --git a/src/plssvm/backends/OpenCL/detail/utility.cpp b/src/plssvm/backends/OpenCL/detail/utility.cpp index 6b3f686ae..e3202bb6b 100644 --- a/src/plssvm/backends/OpenCL/detail/utility.cpp +++ b/src/plssvm/backends/OpenCL/detail/utility.cpp @@ -13,7 +13,7 @@ #include "plssvm/backends/OpenCL/detail/error_code.hpp" // plssvm::opencl::detail::error_code #include "plssvm/backends/OpenCL/detail/jit_info.hpp" // plssvm::opencl::detail::jit_info #include "plssvm/backends/OpenCL/detail/kernel.hpp" // plssvm::opencl::detail::compute_kernel_name, plssvm::opencl::detail::kernel -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked @@ -359,12 +359,10 @@ std::pair, jit_info> create_command_queues(const mpi: // replace constants in kernel_src_string // replace the size_t variants -> BEFORE replacing the "normal" values ::plssvm::detail::replace_all(kernel_src_string, "THREAD_BLOCK_SIZE_ul", fmt::format("(ulong) {}", THREAD_BLOCK_SIZE)); - ::plssvm::detail::replace_all(kernel_src_string, "FEATURE_BLOCK_SIZE_ul", fmt::format("(ulong) {}", FEATURE_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "INTERNAL_BLOCK_SIZE_ul", fmt::format("(ulong) {}", INTERNAL_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "PADDING_SIZE_ul", fmt::format("(ulong) {}", PADDING_SIZE)); // replace the normal variants ::plssvm::detail::replace_all(kernel_src_string, "THREAD_BLOCK_SIZE", fmt::format("{}", THREAD_BLOCK_SIZE)); - ::plssvm::detail::replace_all(kernel_src_string, "FEATURE_BLOCK_SIZE", fmt::format("{}", FEATURE_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "INTERNAL_BLOCK_SIZE", fmt::format("{}", INTERNAL_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "PADDING_SIZE", fmt::format("{}", PADDING_SIZE)); diff --git a/src/plssvm/detail/tracking/performance_tracker.cpp b/src/plssvm/detail/tracking/performance_tracker.cpp index 58b4e975a..8598367dc 100644 --- a/src/plssvm/detail/tracking/performance_tracker.cpp +++ b/src/plssvm/detail/tracking/performance_tracker.cpp @@ -8,7 +8,7 @@ #include "plssvm/detail/tracking/performance_tracker.hpp" -#include "plssvm/constants.hpp" // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE, plssvm::PADDING_SIZE +#include "plssvm/constants.hpp" // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::PADDING_SIZE #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT, PLSSVM_ASSERT_ENABLED #include "plssvm/detail/cmd/parser_predict.hpp" // plssvm::detail::cmd::parser_predict @@ -280,7 +280,6 @@ void performance_tracker::save(std::ostream &out) { " asserts: {}\n" " enforce_max_mem_alloc_size: {}\n" " THREAD_BLOCK_SIZE: {}\n" - " FEATURE_BLOCK_SIZE: {}\n" " INTERNAL_BLOCK_SIZE: {}\n" " PADDING_SIZE: {}\n", plssvm::detail::current_date_time(), @@ -295,7 +294,6 @@ void 
performance_tracker::save(std::ostream &out) { assert_enabled, enforce_max_mem_alloc_size, THREAD_BLOCK_SIZE, - FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE); From 6cddbb6e98f46ebd21d07d7e347402d03ad093c0 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 26 May 2025 15:42:48 +0200 Subject: [PATCH 25/93] Additional performance improvement tests. --- .../work_group/kernel_matrix_assembly.hpp | 26 +++++++++++--- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 34 ++++++++++++++----- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 6e7fd2033..560d556ea 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -16,6 +16,7 @@ #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -27,10 +28,11 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. + * @details target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: /** @@ -111,12 +113,26 @@ class device_kernel_assembly { } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::gpu_amd) { + // perform the feature reduction calculation, the block_dim is the slowest moving index + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + } else { + // perform the feature reduction calculation, the block_dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + 
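For reference, the two loop orders selected by the if constexpr branch boil down to the following stripped-down forms, written over plain arrays with TB and IB standing in for THREAD_BLOCK_SIZE and INTERNAL_BLOCK_SIZE and a simple product standing in for feature_reduce (an illustrative sketch, not the kernel's actual code):

    // block_dim as the slowest moving (outermost) index, as used on AMD GPUs
    template <unsigned TB, unsigned IB>
    void reduce_block_dim_outermost(const double cache_i[TB][IB], const double cache_j[TB][IB], double temp[IB][IB]) {
        for (unsigned block_dim = 0; block_dim < TB; ++block_dim) {
            for (unsigned i = 0; i < IB; ++i) {
                for (unsigned j = 0; j < IB; ++j) {
                    temp[i][j] += cache_i[block_dim][i] * cache_j[block_dim][j];
                }
            }
        }
    }

    // block_dim as the fastest moving (innermost) index, accumulating into a register first
    template <unsigned TB, unsigned IB>
    void reduce_block_dim_innermost(const double cache_i[TB][IB], const double cache_j[TB][IB], double temp[IB][IB]) {
        for (unsigned i = 0; i < IB; ++i) {
            for (unsigned j = 0; j < IB; ++j) {
                double sum{ 0.0 };
                for (unsigned block_dim = 0; block_dim < TB; ++block_dim) {
                    sum += cache_i[block_dim][i] * cache_j[block_dim][j];
                }
                temp[i][j] += sum;
            }
        }
    }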
temp[internal_i][internal_j] += sum; } } } diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 7c56bcd91..12910a7ae 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -223,10 +223,12 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id) // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +// TODO: better! +template +auto dispatch_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost, sycl::kernel_invocation_type invocation_type_, Device& devices_, Distribution& data_distribution_) { const std::size_t num_rows_reduced = data_d.shape().x - 1; const std::size_t num_features = data_d.shape().y; - const queue_type &device = devices_[device_id]; + const auto &device = devices_[device_id]; // calculate the number of data points this device is responsible for const std::size_t device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id); @@ -260,7 +262,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), - sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); }); break; case sycl::kernel_invocation_type::hierarchical: @@ -293,7 +295,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); }); @@ -329,7 +331,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, 
kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); }); @@ -365,7 +367,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); }); @@ -401,7 +403,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); }); @@ -437,7 +439,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); }); @@ -467,6 +469,22 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons return kernel_matrix_d; } +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { + switch (target_) { + case target_platform::automatic: + // error + throw backend_exception{ "Can't determine the target platform!" 
}; + case target_platform::gpu_nvidia: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + case target_platform::gpu_amd: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + case target_platform::gpu_intel: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + case target_platform::cpu: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + } +} + void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const ::plssvm::detail::execution_range &mirror_exec, const real_type alpha, const device_ptr_type &A_d, const device_ptr_type &B_d, const real_type beta, device_ptr_type &C_d) const { const std::size_t num_rhs = B_d.shape().x; const std::size_t num_rows = B_d.shape().y; From a185caf542bc6fd1e65230783e431f158e0633c4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 26 May 2025 16:25:07 +0200 Subject: [PATCH 26/93] Preliminary changes. --- .../SYCL/kernel/cg_explicit/basic/blas.hpp | 2 +- .../kernel/cg_explicit/hierarchical/blas.hpp | 30 ++++------ .../SYCL/kernel/cg_explicit/scoped/blas.hpp | 30 ++++------ .../kernel/cg_explicit/work_group/blas.hpp | 32 ++++------ .../basic/kernel_matrix_assembly_blas.hpp | 2 +- .../kernel_matrix_assembly_blas.hpp | 60 +++++++++---------- .../scoped/kernel_matrix_assembly_blas.hpp | 56 +++++++---------- .../kernel_matrix_assembly_blas.hpp | 55 +++++++---------- .../kernel/predict/basic/predict_kernel.hpp | 2 +- .../predict/hierarchical/predict_kernel.hpp | 39 +++++------- .../kernel/predict/scoped/predict_kernel.hpp | 37 +++++------- .../predict/work_group/predict_kernel.hpp | 40 +++++-------- 12 files changed, 151 insertions(+), 234 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp index 2e528149c..b55b374fe 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::item diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp index de6358ec8..5e5803652 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -60,8 +60,8 @@ class device_kernel_symm { */ void 
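run_assemble_kernel_matrix_explicit above translates the run-time target_platform value into a compile-time template argument with a plain switch; reduced to a self-contained sketch, with a hypothetical launch_kernel stub in place of the real queue submission (not the library's API):

    #include <stdexcept>

    #include "plssvm/target_platforms.hpp"  // plssvm::target_platform

    // hypothetical stub standing in for the target-specialized kernel launch
    template <plssvm::target_platform target>
    void launch_kernel() { /* submit the functor templated on `target` here */ }

    // map the run-time enum value onto the compile-time template parameter
    void launch_for(const plssvm::target_platform target) {
        switch (target) {
            case plssvm::target_platform::automatic:
                throw std::runtime_error{ "Can't determine the target platform!" };
            case plssvm::target_platform::gpu_nvidia:
                return launch_kernel<plssvm::target_platform::gpu_nvidia>();
            case plssvm::target_platform::gpu_amd:
                return launch_kernel<plssvm::target_platform::gpu_amd>();
            case plssvm::target_platform::gpu_intel:
                return launch_kernel<plssvm::target_platform::gpu_intel>();
            case plssvm::target_platform::cpu:
                return launch_kernel<plssvm::target_platform::cpu>();
        }
    }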
operator()(::sycl::group<2> group) const { // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory i{ group }; @@ -98,7 +98,7 @@ class device_kernel_symm { }); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -119,15 +119,8 @@ class device_kernel_symm { } else { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -138,7 +131,7 @@ class device_kernel_symm { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; @@ -227,8 +220,8 @@ class device_kernel_symm_mirror { */ void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory i{ group }; @@ -264,7 +257,7 @@ class 
device_kernel_symm_mirror { }); // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into shared memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -279,12 +272,9 @@ class device_kernel_symm_mirror { const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -295,7 +285,7 @@ class device_kernel_symm_mirror { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp index 9e8500d73..2e6983255 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -62,8 +62,8 @@ class device_kernel_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + 
::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -88,7 +88,7 @@ class device_kernel_symm { j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into shared memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -109,15 +109,8 @@ class device_kernel_symm { } else { A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -126,7 +119,7 @@ class device_kernel_symm { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; @@ -215,8 +208,8 @@ class device_kernel_symm_mirror { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -241,7 +234,7 @@ class device_kernel_symm_mirror { j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into shared memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -256,12 +249,9 @@ class device_kernel_symm_mirror { const auto global_i = i_linear(idx) + 
static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -270,7 +260,7 @@ class device_kernel_symm_mirror { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index ae07f7ec6..965b043a3 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -43,8 +43,8 @@ class device_kernel_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + A_cache_{ ::sycl::range<2>{ 
static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, device_specific_num_rows_{ device_specific_num_rows }, @@ -75,7 +75,6 @@ class device_kernel_symm { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -88,7 +87,7 @@ class device_kernel_symm { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; @@ -100,20 +99,13 @@ class device_kernel_symm { } else { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; @@ -183,8 +175,8 @@ class device_kernel_symm_mirror { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ 
device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, @@ -216,7 +208,6 @@ class device_kernel_symm_mirror { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -229,23 +220,20 @@ class device_kernel_symm_mirror { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } nd_idx.barrier(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; 
++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp index 7b517a7b1..9c82ad31d 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::item diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp index 1a24024b6..342e8308b 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -80,6 +80,10 @@ class device_kernel_assembly_symm { ::sycl::private_memory j{ group }; ::sycl::private_memory j_linear{ group }; + // create the shared memory arrays used for caching data point features + real_type data_cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + ::sycl::private_memory temp{ group }; // initialize private and local variables @@ -112,12 +116,12 @@ class device_kernel_assembly_symm { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_cache_i = reinterpret_cast(data_cache_one); + auto data_cache_j = 
reinterpret_cast(data_cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -132,11 +136,9 @@ class device_kernel_assembly_symm { const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; } }); @@ -147,7 +149,7 @@ class device_kernel_assembly_symm { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], @@ -189,12 +191,12 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the UPPER triangular matrix { - // allocate shared memory - real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(data_cache_one); + auto C_out_cache = reinterpret_cast(data_cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -208,26 +210,24 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory 
B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; } }); // implicit group barrier // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); @@ -242,13 +242,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_y = idx.get_local_id(1); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j(idx) + static_cast(internal); detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1 + THREAD_BLOCK_SIZE]; } }); @@ -274,12 +272,12 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the LOWER triangular matrix { - // allocate shared memory - real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(data_cache_one); + auto C_out_cache = reinterpret_cast(data_cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -293,26 +291,24 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j_linear(idx) + 
static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); @@ -327,13 +323,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_x = idx.get_local_id(0); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i(idx) + static_cast(internal); detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp index 4391f2f19..c84db480f 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // 
sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -77,8 +77,8 @@ class device_kernel_assembly_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -106,7 +106,7 @@ class device_kernel_assembly_symm { // exploit symmetry if (group[1] >= group[0]) { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -121,11 +121,9 @@ class device_kernel_assembly_symm { const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; } }); @@ -134,7 +132,7 @@ class device_kernel_assembly_symm { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], @@ -172,11 +170,11 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the UPPER triangular matrix { // rename cached arrays - auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // 
iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -190,24 +188,22 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; } }); // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); @@ -220,13 +216,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + 
dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; } }); } @@ -249,11 +243,11 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the LOWER triangular matrix { // allocate shared memory - auto &B_cache = data_cache_i; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = data_cache_i; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -267,26 +261,24 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] 
+= + temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); @@ -301,13 +293,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i(idx) + static_cast(internal); detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp index 34b55fff4..2e6ea3f4f 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -54,8 +54,8 @@ class device_kernel_assembly_symm { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - data_cache_i_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - data_cache_j_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + data_cache_i_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + data_cache_j_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] alpha_{ alpha }, q_{ q }, data_d_{ data_d }, @@ -90,7 +90,6 @@ class device_kernel_assembly_symm { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -106,22 +105,20 @@ class device_kernel_assembly_symm { { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_i_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; data_cache_j_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { 
temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], @@ -158,29 +155,27 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the UPPER triangular matrix { // rename cached arrays - auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations @@ -189,8 +184,7 @@ class device_kernel_assembly_symm { // add intermediate cached results to C for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j 
= row_offset_ + j + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated C with their values } @@ -211,29 +205,27 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the LOWER triangular matrix { // rename cached arrays - auto &B_cache = data_cache_i_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = data_cache_i_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_i][internal_j] * 
B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations @@ -243,7 +235,6 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i + static_cast(internal); detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp index c16965cb1..631bf80a1 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::item diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp index 4098c4914..dedfe609e 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -202,8 +202,8 @@ class device_kernel_predict_linear { */ void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_w[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory pp_idx{ group }; @@ -241,7 +241,7 @@ class device_kernel_predict_linear { // implicit group barrier // iterate over all 
support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -256,11 +256,9 @@ class device_kernel_predict_linear { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; } }); @@ -271,7 +269,7 @@ class device_kernel_predict_linear { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; @@ -356,8 +354,8 @@ class device_kernel_predict { */ void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_sv[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory pp_idx{ group }; @@ -393,7 +391,7 @@ class device_kernel_predict { { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -408,11 +406,9 @@ class device_kernel_predict { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * 
THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; } }); @@ -423,7 +419,7 @@ class device_kernel_predict { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], @@ -454,7 +450,7 @@ class device_kernel_predict { // auto &out_cache = data_cache_sv; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -470,15 +466,12 @@ class device_kernel_predict { const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; } else { data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); @@ -486,15 +479,15 @@ class device_kernel_predict { // implicit group barrier // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for 
(unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - data_cache_sv[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + data_cache_sv[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); @@ -509,14 +502,12 @@ class device_kernel_predict { const std::size_t threadIdx_x = idx.get_local_id(0); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_pp_idx = pp_idx(idx) + static_cast(internal); detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp index 1a42161f5..e6d56ec56 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -191,8 +191,8 @@ class device_kernel_predict_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -218,7 +218,7 @@ class device_kernel_predict_linear { }); // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -233,11 +233,9 @@ class device_kernel_predict_linear { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; 
const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; } }); @@ -246,7 +244,7 @@ class device_kernel_predict_linear { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; @@ -332,8 +330,8 @@ class device_kernel_predict { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -356,7 +354,7 @@ class device_kernel_predict { }); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -371,11 +369,9 @@ class device_kernel_predict { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + 
local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; } }); @@ -384,7 +380,7 @@ class device_kernel_predict { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], @@ -410,7 +406,7 @@ class device_kernel_predict { auto &out_cache = data_cache_sv; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -426,29 +422,26 @@ class device_kernel_predict { const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); diff --git a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp 
b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index d451ac7d5..6612a10d8 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -159,8 +159,8 @@ class device_kernel_predict_linear { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_w_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_w_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, prediction_d_{ prediction_d }, w_d_{ w_d }, rho_d_{ rho_d }, @@ -189,7 +189,6 @@ class device_kernel_predict_linear { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -202,22 +201,20 @@ class device_kernel_predict_linear { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + 
global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_w_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { temp[internal_pd][internal_class] += data_cache_w_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; @@ -283,8 +280,8 @@ class device_kernel_predict { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ device_kernel_predict(::sycl::handler &cgh, real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_sv_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_sv_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, prediction_d_{ prediction_d }, alpha_d_{ alpha_d }, rho_d_{ rho_d }, @@ -316,7 +313,6 @@ class device_kernel_predict { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -329,22 +325,20 @@ class device_kernel_predict { { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // 
FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_sv_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], @@ -369,31 +363,28 @@ class device_kernel_predict { auto &out_cache = data_cache_sv_; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * 
INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations @@ -404,7 +395,6 @@ class device_kernel_predict { const auto global_pp_idx = pp_idx + static_cast(internal); detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated their part of the prediction } From c74aca83b21f21ed12fa6257ca347d35f41997f2 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 28 May 2025 13:48:21 +0200 Subject: [PATCH 27/93] Update CUDA implementation and update comments. --- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 295 +++++++------- .../cg_explicit/kernel_matrix_assembly.cuh | 44 +-- .../kernel_matrix_assembly_blas.cuh | 179 +++++---- .../backends/CUDA/kernel/kernel_functions.cuh | 34 +- .../backends/CUDA/kernel/predict_kernel.cuh | 374 +++++++++--------- 5 files changed, 471 insertions(+), 455 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index 2f7b37a0f..1a6be4ae8 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_CUDA_KERNEL_CG_EXPLICIT_BLAS_CUH_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} namespace plssvm::cuda::detail { @@ -22,8 +22,8 @@ namespace plssvm::cuda::detail { * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! 
 * @param[in] num_rows the number of rows in @p A and @p C
 * @param[in] num_rhs the number of columns in @p B and @p C
- * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices
- * @param[in] row_offset the first row this device is responsible for
+ * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices
+ * @param[in] device_row_offset the first row this device is responsible for
 * @param[in] alpha the scalar alpha value
 * @param[in] A the matrix @p A
 * @param[in] B the matrix @p B
@@ -32,78 +32,77 @@ namespace plssvm::cuda::detail {
 * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
 * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
 */
-__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) {
-    // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
-    const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension
-    const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension
-    const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension
-    const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension
-    const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large
-    const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large
-    const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE);
-    const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE);
-    const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE);
-    const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE);
-
-    // calculate the indices used in the current thread
-    const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs
-    const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x;
-    const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows
-    const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x;
-
-    // create the shared memory arrays used for caching data point features
-    __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE];
-    __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE];
+__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) {
+    // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+    const auto
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory 
accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * 
temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i];
+            // calculate the indices to access the global data and the data with respect to the current device
+            const auto global_i_idx = i_idx + static_cast(internal_i);
+            const auto device_global_j_idx = j_idx + static_cast(internal_j);
+            const auto global_j_idx = device_row_offset + device_global_j_idx;
+
+            // be sure to not perform out-of-bounds accesses
+            if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) {
+                C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA
            }
        }
    }
@@ -115,8 +114,8 @@ __global__ void device_kernel_symm(const unsi
 * @param[in] num_rows the number of rows in @p A and @p C
 * @param[in] num_rhs the number of columns in @p B and @p C
 * @param[in] num_mirror_rows the number of rows to mirror down
- * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices
- * @param[in] row_offset the first row this device is responsible for
+ * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices
+ * @param[in] device_row_offset the first row this device is responsible for
 * @param[in] alpha the scalar alpha value
 * @param[in] A the matrix @p A
 * @param[in] B the matrix @p B
@@ -125,68 +124,72 @@ __global__ void device_kernel_symm(const unsi
 * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
 * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
 */
-__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) {
-    // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
-    const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension
-    const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension
-    const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension
-    const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension
-    const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large
-    const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large
-    const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE);
-    const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE);
-    const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE);
-    const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE);
-
-    // calculate the indices used in the current thread
-    const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs
-    const auto i_linear = blockIdx_x
* blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - 
B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + device_specific_num_rows + j + 
static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -200,27 +203,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * 
INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -233,27 +238,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // 
calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index 2a3eef5c4..e4a3fa22d 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -52,7 +52,7 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large - // create two shared memory arrays used for caching data point features + // create two shared memory arrays used for caching __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; @@ -62,21 +62,21 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // calculate the indices used in the current thread paying attention to coalesced memory accesses - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to 
coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data @@ -94,29 +94,29 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type } // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } // update the upper triangular kernel matrix - kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 62f24d6bf..8e8dd03c2 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -15,7 +15,7 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type namespace plssvm::cuda::detail { @@ -26,10 +26,10 @@ namespace plssvm::cuda::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -41,56 +41,64 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... 
kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * 
THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j 
= 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -101,16 +109,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -120,42 +130,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * 
B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -164,51 +176,54 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the 
shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh index 8003a51a3..72a4499ae 100644 --- a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh +++ b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh @@ -57,36 +57,12 @@ template <> * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - 
const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 204d6bd97..5469b01d9 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -15,166 +15,178 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type namespace plssvm::cuda::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. - * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] sv the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in 
grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * 
(device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv + sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - 
w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type 
*predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the 
global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
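In scalar form, the kernel documented here computes, for every predict point p and class c, prediction(p, c) = sum_s alpha(c, s) * k(sv_s, point_p) - rho[c], where k is the chosen kernel function and the bias rho is applied exactly once per (point, class) pair. A minimal host-side reference sketch of that computation (illustrative names only, not part of this patch; the blocking, padding, SoA layouts, and atomic updates of the device kernel are omitted):

    #include <cstddef>
    #include <functional>
    #include <vector>

    void predict_reference(std::vector<double> &prediction,                                // num_predict_points x num_classes
                           const std::vector<double> &alpha,                               // num_classes x num_sv
                           const std::vector<double> &rho,                                 // num_classes
                           const std::function<double(std::size_t, std::size_t)> &kernel,  // k(sv index, predict point index)
                           const std::size_t num_predict_points, const std::size_t num_classes, const std::size_t num_sv) {
        for (std::size_t p = 0; p < num_predict_points; ++p) {
            for (std::size_t c = 0; c < num_classes; ++c) {
                double temp = -rho[c];  // apply the bias exactly once per (point, class) pair
                for (std::size_t s = 0; s < num_sv; ++s) {
                    temp += alpha[c * num_sv + s] * kernel(s, p);
                }
                prediction[p * num_classes + c] = temp;
            }
        }
    }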
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] sv the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -184,53 +196,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to 
access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -239,57 +253,61 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - - // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - 
out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - } else { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); + + { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const std::size_t global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[dim + threadIdx_y]; + } else { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; + } } - } - __syncthreads(); // wait until all threads loaded their part of the data - - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + __syncthreads(); // wait until all threads loaded their part of the data + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations - } - // add intermediate cached results to prediction_d - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - 
const auto global_pp_idx = pp_idx + static_cast(internal); + // atomically add the intermediate cached results to the prediction + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + } + __syncthreads(); // wait until all threads updated their part of the prediction } - __syncthreads(); // wait until all threads updated their part of the prediction } } } From dbc00aed81991c4ff140be152209954e458d4994 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 28 May 2025 22:23:13 +0200 Subject: [PATCH 28/93] Do not use std::vector directly for the kernel matrix since it sequentially initializes all values to zero. Instead, use a std::unique_ptr together with a C++17 conformant make_unique_for_overwrite implementation followed by an OpenMP parallel zero initialization of all values drastically reducing the overhead. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 6 +- .../cg_explicit/kernel_matrix_assembly.hpp | 3 +- .../detail/make_unique_for_overwrite.hpp | 101 ++++++++++++++++++ src/plssvm/backends/OpenMP/csvm.cpp | 35 ++++-- 4 files changed, 129 insertions(+), 16 deletions(-) create mode 100644 include/plssvm/detail/make_unique_for_overwrite.hpp diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index e1041024a..ff7fc6f36 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -37,9 +37,8 @@ namespace plssvm::openmp::detail { * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); @@ -119,9 +118,8 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num 
* @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 9403b12a1..9571513b9 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -40,9 +40,8 @@ namespace plssvm::openmp::detail { * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function */ template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +void device_kernel_assembly(real_type*kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp new file mode 100644 index 000000000..51b56e126 --- /dev/null +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -0,0 +1,101 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. 
+ * See the LICENSE.md file in the project root for full license information. + * + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + */ + +#ifndef PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ +#define PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ + +#include // std::size_t +#include // std::unique_ptr +#include // std::false_type, std::true_type, std::enable_if_t, std::is_array_v + +namespace plssvm::detail { + +/** + * @brief Helper struct to check whether @p T is an unbounded array. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref is_unbounded_array for unbounded arrays. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref is_unbounded_array::value. + * @tparam T the array type + */ +template +constexpr bool is_unbounded_array_v = is_unbounded_array::value; + +/** + * @brief Helper struct to check whether @p T is a bounded array. + * @tparam T the array type + */ +template +struct is_bounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref is_bounded_array for unbounded arrays. + * @tparam T the array type + * @tparam N the size of the array + */ +template +struct is_bounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref is_unbounded_array::value. + * @tparam T the array type + */ +template +constexpr bool is_bounded_array_v = is_bounded_array::value; + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +[[nodiscard]] std::unique_ptr make_unique_for_overwrite() { + return std::unique_ptr(new T); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the objects to create + * @param[in] n the size of the array to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +std::unique_ptr make_unique_for_overwrite(const std::size_t n) { + return std::unique_ptr(new std::remove_extent_t[n]); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. 
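The point of the _for_overwrite variant in this patch: unlike std::make_unique, it performs default-initialization (new T[n] instead of new T[n]()), so allocating the large kernel matrix buffer does not sequentially zero every element; the caller initializes the memory afterwards, e.g. in parallel with OpenMP as done in the OpenMP backend below. A minimal usage sketch (illustrative only, assuming a buffer of n doubles):

    #include "plssvm/detail/make_unique_for_overwrite.hpp"

    #include <cstddef>
    #include <memory>

    void zeroed_buffer_example(const std::size_t n) {
        // allocate without value-initialization
        std::unique_ptr<double[]> buffer = plssvm::detail::make_unique_for_overwrite<double[]>(n);

        // explicitly initialize the values afterwards, here in parallel using OpenMP
        #pragma omp parallel for
        for (std::size_t i = 0; i < n; ++i) {
            buffer[i] = 0.0;
        }
    }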
+ * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @tparam Args the types of the constructor arguments + * @param[in] args the arguments to pass to the constructor + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +auto make_unique_for_overwrite(Args &&...args) = delete; + +} // namespace plssvm::detail + +#endif // PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index 7a7c17ef2..656d966f3 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -19,6 +19,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::make_unique_for_overwrite #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -125,26 +126,40 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // get the offset of the data points this device is responsible for const std::size_t row_offset = dist.place_row_offset(0); - std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix + // get the number of kernel matrix entries + const std::size_t num_entries = dist.calculate_explicit_kernel_matrix_num_entries_padded(0); + + // only explicitly store the upper triangular matrix + auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); + // initialize kernel matrix to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset +#if defined(_OPENMP) + #pragma omp parallel for + for (std::size_t i = 0; i < num_entries; ++i) { + kernel_matrix[i] = real_type{ 0.0 }; + } +#else + std::memset(kernel_matrix.get(), 0, num_entries * sizeof(real_type)); +#endif + const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { case kernel_function_type::linear: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case kernel_function_type::polynomial: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - 
detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const auto end = std::chrono::steady_clock::now(); @@ -202,16 +217,16 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s break; case solver_type::cg_explicit: { - const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); + const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); PLSSVM_ASSERT(!explicit_A.empty(), "The A matrix must not be empty!"); const auto start = std::chrono::steady_clock::now(); - detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); const std::size_t num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > std::size_t{ 0 }) { - detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); } const auto end = std::chrono::steady_clock::now(); From 10d303e3b4fe9835aab75ba3870b5bfe7c276678 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 29 May 2025 17:20:30 +0200 Subject: [PATCH 29/93] Improve the performance of the OpenMP cg_explicit kernel matrix assembly and BLAS implementation. Align names more to the ones used in the other backends. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 111 ++++++++++-------- .../cg_explicit/kernel_matrix_assembly.hpp | 70 ++++++----- .../OpenMP/kernel/kernel_functions.hpp | 35 +----- 3 files changed, 104 insertions(+), 112 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index ff7fc6f36..ecd80ab1a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -21,7 +21,6 @@ #include // std::array #include // std::ceil #include // std::size_t -#include // std::vector namespace plssvm::openmp::detail { @@ -29,24 +28,24 @@ namespace plssvm::openmp::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. 
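Throughout this hunk, A is the explicitly assembled kernel matrix stored as a padded, row-major upper triangle (only entries with row <= col are kept), which is why the index calculations below carry the row * (row + 1) / 2 correction term. A small helper showing the same mapping in isolation (a sketch with illustrative names; ld stands for the padded leading dimension num_rows - device_row_offset + PADDING_SIZE):

    #include <cstddef>
    #include <utility>

    // index of entry (row, col) in the packed upper-triangular storage of a symmetric matrix;
    // accesses below the diagonal are redirected to the mirrored entry above it
    std::size_t packed_upper_index(std::size_t row, std::size_t col, const std::size_t ld) {
        if (row > col) {
            std::swap(row, col);  // the matrix is symmetric
        }
        return row * ld + col - row * (row + 1) / 2;
    }

With this layout, row r stores only its ld - r entries from the diagonal onward (padding included), which is what the offsets in device_kernel_symm and device_kernel_symm_mirror rely on.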
* @param[in] num_rows the number of rows and columns in @p A * @param[in] num_rhs the number of rows in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -60,28 +59,33 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 
0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values + for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim + block_dim < global_j_idx) { + A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim + block_dim) * (dim + block_dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + block_dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + sum += A_cache * B(global_i_idx, dim + block_dim + device_row_offset); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_i][internal_j] += sum; } } } @@ -89,13 +93,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); } } } @@ -110,21 +115,21 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr 
rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); @@ -142,36 +147,42 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values + for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for 
(unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + const real_type A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + block_dim - std::size_t{ 1 }) * (dim + block_dim) / std::size_t{ 2 } + device_num_rows - dim + block_dim + global_j_idx]; + sum += A_cache * B(global_i_idx, device_row_offset + dim + block_dim); + } + temp[internal_i][internal_j] += sum; } } } - // apply the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 9571513b9..b734a7c1a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -28,29 +28,29 @@ namespace plssvm::openmp::detail { /** * @brief Assemble the kernel matrix using the @p kernel function. 
- * @tparam kernel the compile-time kernel function to use + * @tparam kernel_function the compile-time kernel function to use * @tparam Args the types of the potential additional arguments for the @p kernel function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function */ -template -void device_kernel_assembly(real_type*kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -64,46 +64,52 @@ void device_kernel_assembly(real_type*kernel_matrix, const soa_matrix for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const 
std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_row] - q[global_col]; + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp index 59fd0f43c..359e2f8ff 100644 --- a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp @@ -27,42 +27,17 @@ namespace plssvm::openmp::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// From 2e64193492090b25529f4fe4e8c30755375f4461 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 11:40:23 +0200 Subject: [PATCH 30/93] Improve the performance of the OpenMP cg_implicit kernel matrix assembly + BLAS implementation. Align names more to the ones used in the other backends. 
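The change applies the same two-level blocking scheme to the implicit (cg_implicit) path that the explicit kernels already use: each OpenMP thread accumulates an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile of results in a private temp array and consumes the reduction dimension in chunks of THREAD_BLOCK_SIZE, keeping each chunk's partial result in a register-resident sum before it touches temp. The following standalone sketch only illustrates that loop structure; it is not part of the patch and substitutes plain std::vector, double, and arbitrary block sizes for plssvm's soa_matrix/aos_matrix, real_type, and compile-time constants, and it omits the device offset, padding, triangular, and bounds handling of the real kernels.

#include <array>
#include <cstddef>
#include <vector>

constexpr unsigned INTERNAL = 2;  // results computed per thread (register blocking)
constexpr unsigned BLOCK = 4;     // reduction values consumed per inner pass (cache blocking)

// Pairwise dot products (linear-kernel Gram matrix) of the rows of a row-major
// data set; num_rows and num_features are assumed to be multiples of the block
// sizes to keep the sketch short. K must be pre-sized to num_rows * num_rows.
void blocked_gram(const std::vector<double> &data, const std::size_t num_rows,
                  const std::size_t num_features, std::vector<double> &K) {
#pragma omp parallel for collapse(2)
    for (std::size_t i_block = 0; i_block < num_rows; i_block += INTERNAL) {
        for (std::size_t j_block = 0; j_block < num_rows; j_block += INTERNAL) {
            // thread-private accumulator for an INTERNAL x INTERNAL tile
            std::array<std::array<double, INTERNAL>, INTERNAL> temp{};
            // consume the feature dimension in chunks of BLOCK
            for (std::size_t dim = 0; dim < num_features; dim += BLOCK) {
                for (unsigned ii = 0; ii < INTERNAL; ++ii) {
                    for (unsigned jj = 0; jj < INTERNAL; ++jj) {
                        double sum = 0.0;  // register-resident partial dot product
                        for (unsigned b = 0; b < BLOCK; ++b) {
                            sum += data[(i_block + ii) * num_features + dim + b]
                                 * data[(j_block + jj) * num_features + dim + b];
                        }
                        temp[ii][jj] += sum;
                    }
                }
            }
            // write the finished tile back
            for (unsigned ii = 0; ii < INTERNAL; ++ii) {
                for (unsigned jj = 0; jj < INTERNAL; ++jj) {
                    K[(i_block + ii) * num_rows + (j_block + jj)] = temp[ii][jj];
                }
            }
        }
    }
}

In the actual device_kernel_assembly_symm below, the finished tile is never stored: the kernel function, QA_cost, q, and cost terms are applied in place and the tile is immediately multiplied with B and accumulated into C (using omp atomic for the symmetric update), which is what keeps the memory footprint of cg_implicit small.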
--- .../kernel_matrix_assembly_blas.hpp | 104 +++++++++++------- 1 file changed, 64 insertions(+), 40 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 771689209..60c10de07 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -26,25 +26,25 @@ namespace plssvm::openmp::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -53,8 +53,8 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -67,54 +67,78 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto 
global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_row); + C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_col); -// symmetry + C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_j_idx); + // symmetry #pragma omp atomic - C(class_idx, global_col) += alpha * temp_ij * B(class_idx, global_row); + C(dim + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); } } } From 8aa1c93bbaab8c98310cab60355b539c01f0da66 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 11:40:41 +0200 Subject: [PATCH 31/93] Improve the performance of the OpenMP predict implementation. Align names more to the ones used in the other backends. 
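The predict path now uses the same register/cache blocking: device_kernel_w_linear, device_kernel_predict_linear, and device_kernel_predict each compute an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile per thread with a private temp array and reduce over the support vectors or features in chunks of THREAD_BLOCK_SIZE. A minimal standalone sketch of the blocked w accumulation for the linear kernel follows; it is illustrative only and uses plain row-major std::vector, double, and fixed block sizes instead of plssvm's aos_matrix/soa_matrix, real_type, and compile-time constants, and it drops the device-specific range (device_num_sv, sv_offset) and the bounds checks of the real kernel.

#include <array>
#include <cstddef>
#include <vector>

constexpr unsigned INTERNAL = 2;  // entries of w computed per thread
constexpr unsigned BLOCK = 4;     // support vectors consumed per inner pass

// w(class, feature) = sum over sv of alpha(class, sv) * support_vectors(sv, feature);
// num_classes, num_features, and num_sv are assumed to be multiples of the block sizes.
void blocked_w_linear(const std::vector<double> &alpha,            // num_classes x num_sv
                      const std::vector<double> &support_vectors,  // num_sv x num_features
                      std::vector<double> &w,                      // num_classes x num_features
                      const std::size_t num_classes, const std::size_t num_features,
                      const std::size_t num_sv) {
#pragma omp parallel for collapse(2)
    for (std::size_t feature_block = 0; feature_block < num_features; feature_block += INTERNAL) {
        for (std::size_t class_block = 0; class_block < num_classes; class_block += INTERNAL) {
            // thread-private accumulator for an INTERNAL x INTERNAL tile of w
            std::array<std::array<double, INTERNAL>, INTERNAL> temp{};
            for (std::size_t sv = 0; sv < num_sv; sv += BLOCK) {
                for (unsigned c = 0; c < INTERNAL; ++c) {
                    for (unsigned f = 0; f < INTERNAL; ++f) {
                        double sum = 0.0;  // register-resident partial dot product
                        for (unsigned b = 0; b < BLOCK; ++b) {
                            sum += alpha[(class_block + c) * num_sv + sv + b]
                                 * support_vectors[(sv + b) * num_features + feature_block + f];
                        }
                        temp[c][f] += sum;
                    }
                }
            }
            // write the finished tile back to w
            for (unsigned c = 0; c < INTERNAL; ++c) {
                for (unsigned f = 0; f < INTERNAL; ++f) {
                    w[(class_block + c) * num_features + feature_block + f] = temp[c][f];
                }
            }
        }
    }
}

The prediction kernels below follow the same pattern, with the reduction running over the features (linear case) or over the support vectors plus a final kernel-function application (general case).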
--- .../backends/OpenMP/kernel/predict_kernel.hpp | 199 +++++++++++++----- 1 file changed, 147 insertions(+), 52 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 407096055..1540397bc 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -31,28 +31,71 @@ namespace plssvm::openmp::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_features = support_vectors.num_cols(); -#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(num_classes, num_features, device_specific_num_sv, sv_offset) - for (std::size_t a = 0; a < num_classes; ++a) { - for (std::size_t dim = 0; dim < num_features; ++dim) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t idx = 0; idx < device_specific_num_sv; ++idx) { - temp = std::fma(alpha(a, sv_offset + idx), support_vectors(sv_offset + idx, dim), temp); + // calculate constants + const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, sv_offset) + for (std::size_t dim = 0; dim < blocked_num_features; dim 
+= THREAD_BLOCK_SIZE_uz) { + for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t dim_block = 0; dim_block < THREAD_BLOCK_SIZE_uz; ++dim_block) { + for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + // calculate the indices used in the current thread + const std::size_t feature_idx = (dim + dim_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (unsigned block_sv = 0; block_sv < THREAD_BLOCK_SIZE; ++block_sv) { + sum += alpha(global_class_idx, sv_offset + sv + block_sv) * support_vectors(sv_offset + sv + block_sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; + } + } + } + + // store the result back to the w vector + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + if (global_class_idx < num_classes && global_feature_idx < num_features) { + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; + } + } + } + } } - w(a, dim) = temp; } } } @@ -63,29 +106,73 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific 
predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = prediction.num_cols(); const std::size_t num_features = predict_points.num_cols(); -#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(num_classes, num_features, device_specific_num_predict_points, row_offset) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t dim = 0; dim < num_features; ++dim) { - temp = std::fma(w(a, dim), predict_points(row_offset + point_index, dim), temp); + // calculate constants + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(blocked_device_num_predict_points, blocked_num_classes, device_num_predict_points, num_classes, num_features, device_row_offset) + for (std::size_t point = 0; point < blocked_device_num_predict_points; point += THREAD_BLOCK_SIZE_uz) { + for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t point_block = 0; point_block < THREAD_BLOCK_SIZE_uz; ++point_block) { + for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + // calculate the indices used in the current thread + const std::size_t pp_idx = (point + point_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += w(global_class_idx, dim + block_dim) * predict_points(global_pp_idx, dim + block_dim); + } + temp[internal_class][internal_pp] += sum; + } + } + } + + // store the result back to the w vector + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto device_global_pp_idx = pp_idx + 
static_cast(internal_pp); + const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_class_idx = class_idx + static_cast(internal_class); + + if (global_class_idx < num_classes && global_pp_idx < device_num_predict_points) { + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; + } + } + } + } } - prediction(row_offset + point_index, a) = temp - rho[a]; } } } @@ -99,24 +186,24 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows @@ -124,34 +211,39 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - prediction(row_offset + point_index, a) -= rho[a]; + for (std::size_t pp_idx = 0; pp_idx < device_num_predict_points; ++pp_idx) { + for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + prediction(device_row_offset + pp_idx, class_idx) -= rho[class_idx]; } } #pragma omp parallel for collapse(2) - for (std::size_t pp = 0; pp < blocked_device_specific_num_predict_points; pp += THREAD_BLOCK_SIZE_uz) { - for (std::size_t sv = 0; sv < blocked_num_support_vectors; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t x_block = 0; x_block < blocked_device_specific_num_predict_points; x_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t y_block = 0; y_block < blocked_num_support_vectors; y_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t pp_block = 0; pp_block < THREAD_BLOCK_SIZE_uz; ++pp_block) { - for (std::size_t sv_block = 0; sv_block < THREAD_BLOCK_SIZE_uz; ++sv_block) { + for (std::size_t x_thread = 0; 
x_thread < THREAD_BLOCK_SIZE_uz; ++x_thread) { + for (std::size_t y_thread = 0; y_thread < THREAD_BLOCK_SIZE_uz; ++y_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (pp + pp_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (sv + sv_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (x_block + x_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (y_block + y_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, dim + block_dim), predict_points(global_pp_idx, dim + block_dim)); + } + temp[internal_pp][internal_sv] += sum; } } } @@ -159,22 +251,25 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + // be sure to not perform out-of-bounds accesses + if (device_global_pp_idx < device_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp 
atomic - prediction(global_pp_idx, a) += alpha(a, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, dim + class_idx) += alpha(dim + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; + } } } } From 51b75b60eeb8431d71b0da2db0de7766ebb536c0 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 12:21:28 +0200 Subject: [PATCH 32/93] Improve variable names and remove some implicit conversions. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 44 +++++++------- .../cg_explicit/kernel_matrix_assembly.hpp | 22 +++---- .../kernel_matrix_assembly_blas.hpp | 26 ++++----- .../backends/OpenMP/kernel/predict_kernel.hpp | 58 +++++++++---------- 4 files changed, 75 insertions(+), 75 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index ecd80ab1a..298962c19 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -53,20 +53,20 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_device_specific_num_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_device_specific_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all values - for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -75,15 +75,15 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num const auto global_j_idx = j_idx + static_cast(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { real_type A_cache = 0.0; // determine on which side of the diagonal we are located - if (dim + block_dim < global_j_idx) { - A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim + block_dim) * (dim + block_dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + dim < 
global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; } else { - A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + block_dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; } - sum += A_cache * B(global_i_idx, dim + block_dim + device_row_offset); + sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); } temp[internal_i][internal_j] += sum; } @@ -141,20 +141,20 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_num_mirror_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_num_mirror_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over the remaining values - for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -163,9 +163,9 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz const auto global_j_idx = j_idx + static_cast(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - const real_type A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + block_dim - std::size_t{ 1 }) * (dim + block_dim) / std::size_t{ 2 } + device_num_rows - dim + block_dim + global_j_idx]; - sum += A_cache * B(global_i_idx, device_row_offset + dim + block_dim); + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - dim_block + dim + global_j_idx]; + sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } temp[internal_i][internal_j] += sum; } diff --git 
a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index b734a7c1a..f384645b1 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -58,14 +58,14 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(PADDING_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_specific_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix if (i_idx >= j_idx) { @@ -73,7 +73,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -82,10 +82,10 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); } - temp[internal_j][internal_i] += sum; + temp[internal_i][internal_j] += sum; } } } @@ -101,7 +101,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix= global_j_idx) { - real_type temp_ij = temp[internal_j][internal_i]; + real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 60c10de07..3ca4e4dc6 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -61,14 +61,14 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_specific_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix if (i_idx >= j_idx) { @@ -76,7 +76,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { // calculate the indices to access the global data @@ -84,8 +84,8 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); } temp[internal_j][internal_i] += sum; } @@ -119,7 +119,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector //*************************************************************************// // calculate C += alpha * temp * B // //*************************************************************************// - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += 
THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); @@ -129,16 +129,16 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector // only apply once to the diagonal for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { // apply it for the upper and lower triangular matrix for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_j_idx); + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry #pragma omp atomic - C(dim + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); + C(class_block + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 1540397bc..49d98d4da 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -53,19 +53,19 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, sv_offset) - for (std::size_t dim = 0; dim < blocked_num_features; dim += THREAD_BLOCK_SIZE_uz) { - for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < blocked_num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t dim_block = 0; dim_block < THREAD_BLOCK_SIZE_uz; ++dim_block) { - for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + for (std::size_t feature_thread = 0; feature_thread < THREAD_BLOCK_SIZE_uz; ++feature_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { // calculate the indices used in the current thread - const std::size_t feature_idx = (dim + dim_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t feature_idx = (feature_block + feature_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < 
INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { @@ -74,8 +74,8 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(internal_class); real_type sum{ 0.0 }; - for (unsigned block_sv = 0; block_sv < THREAD_BLOCK_SIZE; ++block_sv) { - sum += alpha(global_class_idx, sv_offset + sv + block_sv) * support_vectors(sv_offset + sv + block_sv, global_feature_idx); + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, sv_offset + sv_block + sv) * support_vectors(sv_offset + sv_block + sv, global_feature_idx); } temp[internal_class][internal_feature] += sum; } @@ -129,19 +129,19 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(blocked_device_num_predict_points, blocked_num_classes, device_num_predict_points, num_classes, num_features, device_row_offset) - for (std::size_t point = 0; point < blocked_device_num_predict_points; point += THREAD_BLOCK_SIZE_uz) { - for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t point_block = 0; point_block < THREAD_BLOCK_SIZE_uz; ++point_block) { - for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (point + point_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { @@ -150,8 +150,8 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons const auto global_class_idx = class_idx + static_cast(internal_class); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += w(global_class_idx, dim + block_dim) * predict_points(global_pp_idx, dim + block_dim); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); } temp[internal_class][internal_pp] += sum; } @@ -218,20 +218,20 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m } #pragma omp parallel for collapse(2) - for (std::size_t x_block = 0; x_block < 
blocked_device_specific_num_predict_points; x_block += THREAD_BLOCK_SIZE_uz) { - for (std::size_t y_block = 0; y_block < blocked_num_support_vectors; y_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_specific_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < blocked_num_support_vectors; sv_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t x_thread = 0; x_thread < THREAD_BLOCK_SIZE_uz; ++x_thread) { - for (std::size_t y_thread = 0; y_thread < THREAD_BLOCK_SIZE_uz; ++y_thread) { + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t sv_thread = 0; sv_thread < THREAD_BLOCK_SIZE_uz; ++sv_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (x_block + x_thread) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (y_block + y_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (sv_block + sv_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { @@ -240,8 +240,8 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const auto global_sv_idx = sv_idx + static_cast(internal_sv); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(support_vectors(global_sv_idx, dim + block_dim), predict_points(global_pp_idx, dim + block_dim)); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); } temp[internal_pp][internal_sv] += sum; } @@ -256,7 +256,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m } // add results to prediction - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { // calculate the indices to access the global data and the data with respect to the current device @@ -268,7 +268,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m if (device_global_pp_idx < device_num_predict_points && global_sv_idx < num_support_vectors) { for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, dim + class_idx) += alpha(dim + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; } } } From 8a570a8adc102e9d2267d5e328b6f9a9a16bb3a8 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 
14:39:58 +0200 Subject: [PATCH 33/93] Fix tests after slight API changes. --- tests/backends/generic_base_csvm_tests.hpp | 20 ++++++-- tests/backends/generic_csvm_tests.hpp | 55 ++++++++++------------ 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/tests/backends/generic_base_csvm_tests.hpp b/tests/backends/generic_base_csvm_tests.hpp index fd93963c1..f6c95038a 100644 --- a/tests/backends/generic_base_csvm_tests.hpp +++ b/tests/backends/generic_base_csvm_tests.hpp @@ -41,6 +41,7 @@ #include // std::sqrt, std::abs #include // std::size_t +#include // std::memcpy #include // std::numeric_limits::epsilon #include // std::unique_ptr, std::make_unique #include // std::ignore, std::tuple, std::make_tuple @@ -86,7 +87,10 @@ template == plssvm::backend_type::openmp || plssvm::csvm_to_backend_type_v == plssvm::backend_type::stdpar || plssvm::csvm_to_backend_type_v == plssvm::backend_type::hpx) { // only a single device for OpenMP, stdpar, and HPX on the CPU - result[0] = plssvm::detail::move_only_any{ calculate_partial_kernel_matrix(0, matr.num_rows()) }; + const std::vector partial_kernel_matrix = calculate_partial_kernel_matrix(0, matr.num_rows()); + auto ptr = std::make_unique(partial_kernel_matrix.size()); + std::memcpy(ptr.get(), partial_kernel_matrix.data(), partial_kernel_matrix.size() * sizeof(real_type)); + result[0] = plssvm::detail::move_only_any{ std::move(ptr) }; } else { for (std::size_t device_id = 0; device_id < csvm.num_available_devices(); ++device_id) { auto &device = csvm.devices_[device_id]; @@ -850,7 +854,8 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix_minimal) { const mock_csvm_type svm = util::construct_from_tuple(params, csvm_test_type::additional_arguments); const std::size_t num_devices = svm.num_available_devices(); // be sure to use the correct data distribution - svm.data_distribution_ = std::make_unique(plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices); + const plssvm::detail::triangular_data_distribution dist{ plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices }; + svm.data_distribution_ = std::make_unique(dist); // automatic solver type not permitted if constexpr (solver == plssvm::solver_type::automatic) { @@ -880,7 +885,9 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix_minimal) { // get result based on used backend std::vector kernel_matrix{}; if constexpr (plssvm::csvm_to_backend_type_v == plssvm::backend_type::openmp || plssvm::csvm_to_backend_type_v == plssvm::backend_type::stdpar || plssvm::csvm_to_backend_type_v == plssvm::backend_type::hpx) { - kernel_matrix = plssvm::detail::move_only_any_cast>(kernel_matrix_d[device_id]); // std::vector + const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast &>(kernel_matrix_d[device_id]); // std::unique_ptr + kernel_matrix.resize(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); + std::memcpy(kernel_matrix.data(), kernel_matrix_d_ptr.get(), kernel_matrix.size() * sizeof(plssvm::real_type)); } else { const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast(kernel_matrix_d[device_id]); // device_ptr -> convert it to a std::vector kernel_matrix.resize(kernel_matrix_d_ptr.size_padded()); @@ -960,7 +967,8 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix) { const mock_csvm_type svm = util::construct_from_tuple(params, csvm_test_type::additional_arguments); const std::size_t num_devices = svm.num_available_devices(); // be sure to use the correct data distribution - 
svm.data_distribution_ = std::make_unique(plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices); + const plssvm::detail::triangular_data_distribution dist{ plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices }; + svm.data_distribution_ = std::make_unique(dist); // automatic solver type not permitted if constexpr (solver == plssvm::solver_type::automatic) { @@ -990,7 +998,9 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix) { // get result based on used backend std::vector kernel_matrix{}; if constexpr (plssvm::csvm_to_backend_type_v == plssvm::backend_type::openmp || plssvm::csvm_to_backend_type_v == plssvm::backend_type::stdpar || plssvm::csvm_to_backend_type_v == plssvm::backend_type::hpx) { - kernel_matrix = plssvm::detail::move_only_any_cast>(kernel_matrix_d[device_id]); // std::vector + const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast &>(kernel_matrix_d[device_id]); // std::unique_ptr + kernel_matrix.resize(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); + std::memcpy(kernel_matrix.data(), kernel_matrix_d_ptr.get(), kernel_matrix.size() * sizeof(plssvm::real_type)); } else { const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast(kernel_matrix_d[device_id]); // device_ptr -> convert it to a std::vector kernel_matrix.resize(kernel_matrix_d_ptr.size_padded()); diff --git a/tests/backends/generic_csvm_tests.hpp b/tests/backends/generic_csvm_tests.hpp index 84b9b7ad9..549cd3a68 100644 --- a/tests/backends/generic_csvm_tests.hpp +++ b/tests/backends/generic_csvm_tests.hpp @@ -81,14 +81,15 @@ TYPED_TEST_P(GenericBackendCSVM, blas_level_3_kernel_explicit) { const std::size_t specific_num_rows = dist.place_specific_num_rows(device); const std::size_t row_offset = dist.place_row_offset(device); - device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_temp); + device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_temp); const std::size_t num_mirror_rows = num_rows - row_offset - specific_num_rows; if (num_mirror_rows > 0) { - device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_temp); + device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_temp); } C_res += C_temp; } + C_res.restore_padding(); // calculate correct results const plssvm::aos_matrix kernel_matrix_gemm_padded = ground_truth::assemble_full_kernel_matrix(params, data.data(), q_red, QA_cost); @@ -112,6 +113,7 @@ TYPED_TEST_P(GenericBackendCSVM, calculate_w) { const plssvm::detail::rectangular_data_distribution dist{ plssvm::mpi::communicator{}, data.num_data_points(), 1 }; device_kernel_w_linear(w, weights, data.data(), dist.place_specific_num_rows(0), dist.place_row_offset(0)); + w.restore_padding(); // calculate correct results const plssvm::soa_matrix correct_w = ground_truth::calculate_w(weights, data.data()); @@ -160,22 +162,22 @@ TYPED_TEST_P(GenericBackendCSVMKernelFunction, assemble_kernel_matrix_explicit) switch (kernel) { case plssvm::kernel_function_type::linear: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case plssvm::kernel_function_type::polynomial: - device_kernel_assembly(kernel_matrix, 
data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case plssvm::kernel_function_type::rbf: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case plssvm::kernel_function_type::sigmoid: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case plssvm::kernel_function_type::laplacian: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case plssvm::kernel_function_type::chi_squared: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const std::vector correct_kernel_matrix = ground_truth::assemble_device_specific_kernel_matrix(params, data_matr, q_red, QA_cost, dist, 0); @@ -297,6 +299,7 @@ TYPED_TEST_P(GenericBackendCSVMKernelFunction, predict_values) { device_kernel_predict(out, weights, rho, data_matr, predict_points, device_specific_num_predict_points, row_offset, std::get(params.gamma)); break; } + out.restore_padding(); // check out for correctness const plssvm::aos_matrix correct_out = ground_truth::predict_values(params, correct_w, weights, rho, data_matr, predict_points); @@ -337,45 +340,39 @@ TYPED_TEST_P(GenericBackendCSVMDeathTest, blas_level_3_kernel_explicit) { const std::size_t row_offset = dist.place_row_offset(0); { - // the A matrix must have the correct size - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, std::vector{}, B, beta, C), "A matrix may not be empty!"); - // the B matrix must have the correct shape const auto B_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix, B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the C matrix must have the correct shape auto C_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 
1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the place specific number of rows may not be too large - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, num_rows + 1, row_offset, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, num_rows + 1, row_offset, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); // the row offset may not be too large - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, num_rows + 1, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, num_rows + 1, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); } { const std::size_t num_mirror_rows = num_rows - row_offset - specific_num_rows; - // the A matrix must have the correct size - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, std::vector{}, B, beta, C), "A matrix may not be empty!"); - // the B matrix must have the correct shape const auto B_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix, B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the C matrix must have the correct shape auto C_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", 
std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the place specific number of rows may not be too large - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, num_rows + 1, row_offset, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, num_rows + 1, row_offset, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); // the mirror number of rows may not be too large - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_rows + 1, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_rows + 1, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); // the row offset may not be too large - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, num_rows + 1, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, num_rows + 1, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); } } @@ -445,22 +442,22 @@ TYPED_TEST_P(GenericBackendCSVMKernelFunctionDeathTest, assemble_kernel_matrix_e const auto run_assembly = [=](const plssvm::parameter ¶ms_p, std::vector &kernel_matrix_p, const plssvm::soa_matrix &data_p, const std::size_t device_specific_num_rows_p, const std::size_t row_offset_p, const std::vector &q_red_p, const plssvm::real_type QA_cost_p) { switch (kernel) { case plssvm::kernel_function_type::linear: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost); break; case plssvm::kernel_function_type::polynomial: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, params_p.degree, std::get(params_p.gamma), params_p.coef0); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, params_p.degree, std::get(params_p.gamma), params_p.coef0); break; case plssvm::kernel_function_type::rbf: - device_kernel_assembly(kernel_matrix_p, 
data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); break; case plssvm::kernel_function_type::sigmoid: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma), params_p.coef0); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma), params_p.coef0); break; case plssvm::kernel_function_type::laplacian: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); break; case plssvm::kernel_function_type::chi_squared: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); break; } }; From 3025c7606fd3cb67c63f543a28077fe189eda991 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 15:02:20 +0200 Subject: [PATCH 34/93] Remove unnecessary conditions. Improve variable naming. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 8 ++++---- .../backends/OpenMP/kernel/predict_kernel.hpp | 20 ++++++------------- src/plssvm/backends/OpenMP/csvm.cpp | 6 ++++++ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index 298962c19..81f560421 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -45,7 +45,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -54,7 +54,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num #pragma omp parallel for collapse(2) for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row_block = 0; row_block < blocked_device_specific_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_device_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { @@ -83,7 +83,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num } else { A_cache = 
A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; } - sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); + sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } temp[internal_i][internal_j] += sum; } @@ -164,7 +164,7 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz real_type sum{ 0.0 }; for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { - const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - dim_block + dim + global_j_idx]; + const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx]; sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } temp[internal_i][internal_j] += sum; diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 49d98d4da..89c0a380c 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -89,9 +89,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(internal_feature); const auto global_class_idx = class_idx + static_cast(internal_class); - if (global_class_idx < num_classes && global_feature_idx < num_features) { - w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; - } + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; } } } @@ -162,13 +160,10 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { // calculate the indices to access the global data - const auto device_global_pp_idx = pp_idx + static_cast(internal_pp); - const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - if (global_class_idx < num_classes && global_pp_idx < device_num_predict_points) { - prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; - } + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; } } } @@ -260,16 +255,13 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { // calculate the indices to access the global data and the data with respect to the current device - const auto device_global_pp_idx = pp_idx + static_cast(internal_pp); - const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); // be sure to not perform out-of-bounds accesses - if (device_global_pp_idx < device_num_predict_points && global_sv_idx < num_support_vectors) { - for 
(std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; - } + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; } } } diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index 656d966f3..d34b25066 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -275,6 +275,8 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s break; } } + // restore padding entries by setting them to zero + C.restore_padding(); } //***************************************************// @@ -330,6 +332,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, [[maybe_unused]] const auto duration = std::chrono::duration_cast(end - start); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "w_kernel", duration })); } + // restore padding entries by setting them to zero + w.restore_padding(); // reduce w on all MPI ranks comm_.allreduce_inplace(w); @@ -369,6 +373,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "predict_kernel", duration })); } + // restore padding entries by setting them to zero + out.restore_padding(); return out; } From 46a955806df29922477ac79cd5fea07cb0329f3d Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 15:19:14 +0200 Subject: [PATCH 35/93] Update variable names. 
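
Apply the loop-variable naming scheme already used in the OpenMP kernels (see
patches 32 and 34) to the CUDA kernels: the outer caching loops use a *_block
suffix (dim_block, feature_block, class_block, sv_block) for the offset that
advances in THREAD_BLOCK_SIZE steps, while the unsuffixed name (dim, feature,
sv) now denotes the position inside the currently cached block; the internal_*
names keep denoting the per-thread register blocking. This also retires the old
inner-loop name block_dim, which could be mistaken for CUDA's built-in blockDim.

The host-side snippet below is only an illustrative sketch of this naming
convention and is not part of the kernels; blocked_sum() and the constant value
chosen for THREAD_BLOCK_SIZE are made up for the example:

    #include <cstddef>
    #include <vector>

    // stand-in for the real compile-time constant
    constexpr std::size_t THREAD_BLOCK_SIZE = 8;

    // sum `values` in THREAD_BLOCK_SIZE-wide chunks:
    // `feature_block` is the offset of the current chunk, `feature` the position inside it
    double blocked_sum(const std::vector<double> &values) {
        double sum = 0.0;
        for (std::size_t feature_block = 0; feature_block < values.size(); feature_block += THREAD_BLOCK_SIZE) {
            for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE && feature_block + feature < values.size(); ++feature) {
                sum += values[feature_block + feature];  // global index = block offset + in-block position
            }
        }
        return sum;
    }
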
--- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 24 +++++------ .../cg_explicit/kernel_matrix_assembly.cuh | 12 +++--- .../kernel_matrix_assembly_blas.cuh | 26 ++++++------ .../backends/CUDA/kernel/predict_kernel.cuh | 40 +++++++++---------- 4 files changed, 51 insertions(+), 51 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index 1a6be4ae8..d2adc5618 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -58,7 +58,7 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all values using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -67,20 +67,20 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t // store the values in the shared memory // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j_idx_linear) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - 
temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; } } } @@ -150,7 +150,7 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows // iterate over the remaining values using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -158,16 +158,16 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index e4a3fa22d..70c9b4101 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -67,7 +67,7 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto j_idx_linear = blockIdx_y * blockDim_y * 
INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -75,17 +75,17 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 8e8dd03c2..960f61b9f 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -80,7 +80,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -88,17 +88,17 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto global_j_idx_linear = device_row_offset + 
j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -139,15 +139,15 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data @@ -167,7 +167,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices to access the global data const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); - atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], 
C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -195,14 +195,14 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data @@ -223,7 +223,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices to access the global data const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); - atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 5469b01d9..9c462127e 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -58,7 +58,7 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -66,16 +66,16 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv + threadIdx_y]; // SoA - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = 
alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv + sv_offset + threadIdx_y]; // AoS + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + sv_offset + threadIdx_y]; // AoS } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += alpha_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; } } } @@ -137,7 +137,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -145,16 +145,16 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pp][internal_class] += w_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + 
temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; } } } @@ -226,7 +226,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -234,17 +234,17 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; // store the values in the shared memory - pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -271,17 +271,17 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses const std::size_t global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; // store the values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(dim + 
threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_y == std::size_t{ 0 }) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[dim + threadIdx_y]; + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } @@ -304,7 +304,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } From 0c682067bd8a86734bc94c7c5f2c9567a7133aef Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 14:44:36 +0200 Subject: [PATCH 36/93] Update documentation and add missing headers. --- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 2 + .../kernel_matrix_assembly_blas.cuh | 3 + .../backends/CUDA/kernel/kernel_functions.cuh | 1 - .../backends/CUDA/kernel/predict_kernel.cuh | 84 +++++++++---------- .../backends/OpenMP/kernel/predict_kernel.hpp | 10 +-- 5 files changed, 52 insertions(+), 48 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index d2adc5618..bacc84852 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -15,6 +15,8 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include // std::size_t + namespace plssvm::cuda::detail { /** diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 960f61b9f..bf1ee66e5 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -18,6 +18,8 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** @@ -186,6 +188,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } } + //*************************************************************************// // calculate C += alpha * temp * B for the LOWER triangular matrix // //*************************************************************************// diff --git a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh index 72a4499ae..7748c45c8 100644 --- a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh +++ b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh @@ 
-51,7 +51,6 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 9c462127e..285cdc3a6 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -18,21 +18,23 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights - * @param[in] sv the support vectors + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] device_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha) the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); @@ -67,7 +69,7 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con // store the values in the shared memory feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + sv_offset + threadIdx_y]; // AoS + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS } __syncthreads(); // wait until all threads loaded their part of the data @@ -264,50 +266,48 @@ __global__ void device_kernel_predict(real_type *prediction, const 
real_type *al auto alpha_cache = reinterpret_cast(cache_one); auto out_cache = reinterpret_cast(cache_two); - { - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points - // calculate the indices used in the current thread, pays attention to coalesced memory accesses - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors - - // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data, pays attention to coalesced memory accesses - const std::size_t global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // store the values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS - // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == std::size_t{ 0 }) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; - } else { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - } + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; + } else { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } - __syncthreads(); // wait until all threads loaded their part of the data - - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + 
internal_sv]; - } + } + __syncthreads(); // wait until all threads loaded their part of the data + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } - __syncthreads(); // wait until all threads performed their part of the calculations } + __syncthreads(); // wait until all threads performed their part of the calculations + } - // atomically add the intermediate cached results to the prediction - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data - const auto global_pp_idx = pp_idx + static_cast(internal); + // atomically add the intermediate cached results to the prediction + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - } - __syncthreads(); // wait until all threads updated their part of the prediction + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } + __syncthreads(); // wait until all threads updated their part of the prediction } } } diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 89c0a380c..17696bd90 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -32,13 +32,13 @@ namespace plssvm::openmp::detail { * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors * @param[in] device_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); - 
PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", device_sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); @@ -52,7 +52,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); -#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, sv_offset) +#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, device_sv_offset) for (std::size_t feature_block = 0; feature_block < blocked_num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block @@ -75,7 +75,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix Date: Sat, 31 May 2025 14:45:38 +0200 Subject: [PATCH 37/93] Update the HIP backend kernels. --- .../HIP/kernel/cg_explicit/blas.hip.hpp | 297 +++++++-------- .../kernel_matrix_assembly.hip.hpp | 52 +-- .../kernel_matrix_assembly_blas.hip.hpp | 180 ++++----- .../HIP/kernel/kernel_functions.hip.hpp | 35 +- .../HIP/kernel/predict_kernel.hip.hpp | 342 +++++++++--------- 5 files changed, 463 insertions(+), 443 deletions(-) diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp index 124688d3a..b2e9c8ce3 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp @@ -13,11 +13,13 @@ #define PLSSVM_BACKENDS_HIP_CG_EXPLICIT_BLAS_HIP_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** @@ -25,8 +27,8 @@ namespace plssvm::hip::detail { * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! 
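 * @note Editorial sketch, not part of the original patch: the kernel body below reads the symmetric matrix @p A from a padded, packed upper-triangular layout and mirrors accesses that fall below the diagonal onto the upper triangle. The helper below only illustrates that index calculation; the names `packed_upper_index`, `n` (shorthand for `num_rows - device_row_offset`), and `padding` (the number of padding entries per row) are assumptions of this sketch and do not exist in PLSSVM.
 * @code
 * #include <cstddef>  // std::size_t
 * #include <utility>  // std::swap
 *
 * // index of the element (row, col) in a padded, packed upper-triangular matrix stored row by row;
 * // for row > col the symmetric element (col, row) is accessed instead
 * std::size_t packed_upper_index(std::size_t row, std::size_t col, const std::size_t n, const std::size_t padding) {
 *     if (row > col) {
 *         std::swap(row, col);  // mirror accesses below the diagonal onto the upper triangle
 *     }
 *     return row * (n + padding) + col - row * (row + std::size_t{ 1 }) / std::size_t{ 2 };
 * }
 * @endcode
 *       Storing only the upper triangle roughly halves the memory needed for @p A, which is why the branch on which side of the diagonal an access falls appears in the loading loop below.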
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -35,78 +37,77 @@ namespace plssvm::hip::detail { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to 
coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + 
global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -118,8 +119,8 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -128,68 +129,72 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto 
i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y) * (num_rhs + 
PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto 
global_j = row_offset + device_specific_num_rows + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -203,27 +208,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i 
= (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -236,27 +243,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + 
offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp index f0e01f813..308867d76 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp @@ -55,7 +55,7 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large - // create two shared memory arrays used for caching data point features + // create two shared memory arrays used for caching __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; @@ -65,30 +65,30 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // calculate the indices used in the current thread paying attention to coalesced memory accesses - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto 
global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -97,29 +97,29 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type } // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + 
static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } // update the upper triangular kernel matrix - kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 77820e35a..97ef0798b 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -14,12 +14,14 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** @@ -28,10 +30,10 @@ namespace plssvm::hip::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -43,56 +45,64 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const 
real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto 
global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -103,16 +113,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -122,42 +134,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until 
all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -166,51 +180,55 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = 
reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + 
threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp index a98bb0715..1b2be0ae6 100644 --- a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp @@ -51,42 +51,17 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 6e349927e..9aaba6c5e 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -14,169 +14,183 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. 
- * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * (device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
+ * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // 
number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] sv the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -186,53 +200,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; 
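The change above (and the matching ones in the assembly kernels) replaces the two FEATURE_BLOCK_SIZE-sized caches with a single pair of flat THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE buffers that each phase views under its own 2D shape via reinterpret_cast. A minimal stand-alone sketch of that reuse pattern follows; the float element type, the constants, and the kernel name are illustrative placeholders, not plssvm's actual definitions.

__global__ void shared_memory_reuse_sketch() {
    constexpr unsigned TBS = 8;  // stand-in for THREAD_BLOCK_SIZE
    constexpr unsigned IBS = 4;  // stand-in for INTERNAL_BLOCK_SIZE

    // one flat allocation that both phases share
    __shared__ float cache_one[TBS * IBS * TBS];

    // phase 1: view the buffer as [TBS][IBS * TBS]
    auto phase_one_view = reinterpret_cast<float (*)[IBS * TBS]>(cache_one);
    phase_one_view[threadIdx.y][threadIdx.x] = 1.0f;
    __syncthreads();  // phase 1 must be finished before the same bytes are re-viewed

    // phase 2: view the same bytes as [IBS * TBS][TBS]
    auto phase_two_view = reinterpret_cast<float (*)[TBS]>(cache_one);
    phase_two_view[threadIdx.x][threadIdx.y] += 1.0f;
}

Compared to the removed FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE scheme, this halves the per-cache shared-memory footprint, while the blocked loops over features and classes now advance in THREAD_BLOCK_SIZE rather than FEATURE_BLOCK_SIZE steps.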
+ // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -241,55 +257,57 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared 
memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * 
THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } From 45832e70abd46ed5b4042abebef83dcba8c8d32a Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 14:48:38 +0200 Subject: [PATCH 38/93] Fix Doxygen documentation. --- include/plssvm/detail/make_unique_for_overwrite.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index 51b56e126..06f4cbaa5 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -27,14 +27,14 @@ template struct is_unbounded_array : std::false_type { }; /** - * @brief Specialization of @ref is_unbounded_array for unbounded arrays. + * @brief Specialization of @ref plssvm::detail::is_unbounded_array for unbounded arrays. * @tparam T the array type */ template struct is_unbounded_array : std::true_type { }; /** - * @brief Shortcut for @ref is_unbounded_array::value. + * @brief Shortcut for @ref plssvm::detail::is_unbounded_array. * @tparam T the array type */ template @@ -48,7 +48,7 @@ template struct is_bounded_array : std::false_type { }; /** - * @brief Specialization of @ref is_bounded_array for unbounded arrays. + * @brief Specialization of @ref plssvm::detail::is_bounded_array for unbounded arrays. * @tparam T the array type * @tparam N the size of the array */ @@ -56,7 +56,7 @@ template struct is_bounded_array : std::true_type { }; /** - * @brief Shortcut for @ref is_unbounded_array::value. + * @brief Shortcut for @ref plssvm::detail::is_bounded_array. * @tparam T the array type */ template From 10ff3c26011db0042860d5ca8df6cd663ac988bd Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:33:47 +0200 Subject: [PATCH 39/93] Add additional assert. 
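Note on the new checks: these OpenMP kernels receive the explicit A matrix and the kernel matrix as raw pointers, so validating them up front turns a null-pointer dereference deep inside a parallel loop into a clear diagnostic whenever assertions are enabled. A minimal sketch of the precondition pattern, using PLSSVM_ASSERT in the same condition-plus-message form it is used with throughout the code base (the wrapper function and its parameters are purely illustrative, not part of the patch):

    #include "plssvm/detail/assert.hpp"  // PLSSVM_ASSERT

    #include <cstddef>  // std::size_t

    // illustrative entry point only; the real kernels live in include/plssvm/backends/OpenMP/kernel/
    template <typename real_type>
    void checked_kernel_entry(const real_type *A, real_type *kernel_matrix, const std::size_t num_entries) {
        // fail fast instead of dereferencing a null pointer inside the parallel region below
        PLSSVM_ASSERT(A != nullptr, "The A matrix pointer must be valid!");
        PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!");
        // ... the actual BLAS/assembly work on A and kernel_matrix would follow ...
        (void) num_entries;
    }
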
--- include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp | 2 ++ .../OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp | 1 + 2 files changed, 3 insertions(+) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index 81f560421..3fbbaaa4b 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -38,6 +38,7 @@ namespace plssvm::openmp::detail { */ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); @@ -125,6 +126,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num */ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index f384645b1..381c8adf7 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -41,6 +41,7 @@ namespace plssvm::openmp::detail { */ template void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); From dad55f2688eb18ee01684cf9b605dd529273da54 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:34:57 +0200 Subject: [PATCH 40/93] Fix variable names. --- .../OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp | 4 ++-- .../OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 4 ++-- include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 381c8adf7..b442288df 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -51,7 +51,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -60,7 +60,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -62,7 +62,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector #pragma omp parallel for collapse(2) schedule(dynamic) for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col_block = 0; col_block < blocked_device_specific_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 17696bd90..a9fa64d07 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -198,7 +198,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const std::size_t 
num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows @@ -213,7 +213,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m } #pragma omp parallel for collapse(2) - for (std::size_t pp_block = 0; pp_block < blocked_device_specific_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { for (std::size_t sv_block = 0; sv_block < blocked_num_support_vectors; sv_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { From e6b76f2d90c21fb560f4af0b2187c7e4ae195594 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:35:18 +0200 Subject: [PATCH 41/93] Use typename instead of class. --- include/plssvm/detail/make_unique_for_overwrite.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index 06f4cbaa5..8e7603cc1 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -68,7 +68,7 @@ constexpr bool is_bounded_array_v = is_bounded_array::value; * @tparam T the type of the object to create * @return a unique pointer to the newly created object (`[[nodiscard]]`) */ -template , bool> = true> +template , bool> = true> [[nodiscard]] std::unique_ptr make_unique_for_overwrite() { return std::unique_ptr(new T); } @@ -80,7 +80,7 @@ template , bool> = true> * @param[in] n the size of the array to create * @return a unique pointer to the newly created object (`[[nodiscard]]`) */ -template , bool> = true> +template , bool> = true> std::unique_ptr make_unique_for_overwrite(const std::size_t n) { return std::unique_ptr(new std::remove_extent_t[n]); } @@ -93,7 +93,7 @@ std::unique_ptr make_unique_for_overwrite(const std::size_t n) { * @param[in] args the arguments to pass to the constructor * @return a unique pointer to the newly created object (`[[nodiscard]]`) */ -template , bool> = true> +template , bool> = true> auto make_unique_for_overwrite(Args &&...args) = delete; } // namespace plssvm::detail From 5913b5028aebc2f9492f61ad830828eec17a3ee4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:36:06 +0200 Subject: [PATCH 42/93] Move parallel zero memset to header function (used in multiple places). 
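For orientation, a minimal usage sketch of the helper this commit factors out; its definition follows in the diff below, and the allocation/zeroing pair mirrors the updated call site in src/plssvm/backends/OpenMP/csvm.cpp (real_type and the entry count are placeholders for the values computed there):

    #include "plssvm/detail/make_unique_for_overwrite.hpp"  // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset}

    #include <cstddef>  // std::size_t

    void example_zeroed_buffer() {
        using real_type = double;              // placeholder; the library defines its own real_type
        const std::size_t num_entries = 1024;  // placeholder size
        // allocate without value-initialization, then zero the buffer in parallel
        // (OpenMP if available, sequential std::memset otherwise)
        auto kernel_matrix = plssvm::detail::make_unique_for_overwrite<real_type[]>(num_entries);
        plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries);
    }

One common motivation for the parallel loop over a plain std::memset is NUMA first-touch placement of the zeroed pages; the sequential memset remains only as the fallback when OpenMP is unavailable.
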
--- .../detail/make_unique_for_overwrite.hpp | 18 ++++++++++++++++++ src/plssvm/backends/OpenMP/csvm.cpp | 13 +++---------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index 8e7603cc1..fcb205622 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -13,7 +13,10 @@ #ifndef PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ #define PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT + #include // std::size_t +#include // std::memset #include // std::unique_ptr #include // std::false_type, std::true_type, std::enable_if_t, std::is_array_v @@ -96,6 +99,21 @@ std::unique_ptr make_unique_for_overwrite(const std::size_t n) { template , bool> = true> auto make_unique_for_overwrite(Args &&...args) = delete; +template +void parallel_zero_memset(T *dest, const std::size_t count) { + PLSSVM_ASSERT(dest != nullptr, "The destination pointer may not be a nullptr!"); + +// initialize the data pointed to by dest to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset +#if defined(_OPENMP) + #pragma omp parallel for + for (std::size_t i = 0; i < count; ++i) { + dest[i] = T{ 0 }; + } +#else + std::memset(dest, 0, count * sizeof(T)); +#endif +} + } // namespace plssvm::detail #endif // PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index d34b25066..868ab32e6 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -19,7 +19,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::make_unique_for_overwrite +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset} #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -131,15 +131,8 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // only explicitly store the upper triangular matrix auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); - // initialize kernel matrix to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset -#if defined(_OPENMP) - #pragma omp parallel for - for (std::size_t i = 0; i < num_entries; ++i) { - kernel_matrix[i] = real_type{ 0.0 }; - } -#else - std::memset(kernel_matrix.get(), 0, num_entries * sizeof(real_type)); -#endif + // initialize kernel matrix to all zeros in parallel + ::plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries); const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { From a67751bd9461722cbed2acf7dbc421722d4a5652 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 18:22:26 +0200 Subject: [PATCH 43/93] Add documentation and rearrange constant declarations. 
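The constants touched below all follow the same two-level blocking scheme used throughout these kernels: a count is first rounded up to whole INTERNAL_BLOCK_SIZE-wide blocks, and the outer loops then step over those blocks in THREAD_BLOCK_SIZE-sized chunks. A compact sketch of that index math (the block-size values here are illustrative stand-ins; the real compile-time constants come from PLSSVM's headers):

    #include <cmath>    // std::ceil
    #include <cstddef>  // std::size_t

    // illustrative values only; PLSSVM defines the real INTERNAL_BLOCK_SIZE / THREAD_BLOCK_SIZE
    constexpr unsigned internal_block_size = 4;   // values computed per work item and dimension
    constexpr unsigned thread_block_size   = 16;  // work items per block and dimension

    // number of internal_block_size-wide blocks needed to cover `count` elements,
    // mirroring the blocked_* constants computed in the kernels below
    std::size_t blocked(const std::size_t count) {
        return static_cast<std::size_t>(std::ceil(static_cast<double>(count) / internal_block_size));
    }

    // the outer loops then advance over the blocked range in thread_block_size steps, e.g.:
    //   for (std::size_t block = 0; block < blocked(count); block += thread_block_size) { ... }
    // so one outer iteration covers thread_block_size * internal_block_size elements per dimension
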
--- .../kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 1 + include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 067608773..391b9fd90 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -122,6 +122,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index a9fa64d07..1eed9735e 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -43,8 +43,6 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); @@ -65,6 +63,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix, INTERNAL_BLOCK_SIZE> temp{}; + // iterate over all support vectors for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { @@ -117,8 +116,6 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons // calculate constants const std::size_t num_classes = prediction.num_cols(); const std::size_t num_features = predict_points.num_cols(); - - // calculate constants const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); @@ -139,6 +136,7 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; + // iterate over all features for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { @@ -197,9 +195,9 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); + const std::size_t num_features = predict_points.num_cols(); const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / 
INTERNAL_BLOCK_SIZE)); - const std::size_t num_features = predict_points.num_cols(); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -258,7 +256,6 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out-of-bounds accesses for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; From 54741fff26ea2fa0ebbd3e508895451682ad88c1 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 20:12:51 +0200 Subject: [PATCH 44/93] Inverse all temp indices for better consistency. --- .../plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp | 8 ++++---- .../OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp | 4 ++-- include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index 3fbbaaa4b..01db6a60e 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -86,7 +86,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num } sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } - temp[internal_i][internal_j] += sum; + temp[internal_j][internal_i] += sum; } } } @@ -101,7 +101,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // be sure to not perform out-of-bounds accesses if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { - C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -169,7 +169,7 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx]; sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } - temp[internal_i][internal_j] += sum; + temp[internal_j][internal_i] += sum; } } } @@ -184,7 +184,7 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz // be sure to not perform out-of-bounds accesses if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { - C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index b442288df..aa465dead 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -86,7 +86,7 @@ void 
device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); } - temp[internal_i][internal_j] += sum; + temp[internal_j][internal_i] += sum; } } } @@ -102,7 +102,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix= global_j_idx) { - real_type temp_ij = temp[internal_i][internal_j]; + real_type temp_ij = temp[internal_j][internal_i]; // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 1eed9735e..7bea4b3c4 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -236,7 +236,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); } - temp[internal_pp][internal_sv] += sum; + temp[internal_sv][internal_pp] += sum; } } } @@ -244,7 +244,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } @@ -258,7 +258,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } From 46891d9b43158ce084aad132f9d90947a28ab7bb Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 20:31:54 +0200 Subject: [PATCH 45/93] Add missing doxygen documentation. --- include/plssvm/detail/make_unique_for_overwrite.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index fcb205622..ca58eec3a 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -99,6 +99,12 @@ std::unique_ptr make_unique_for_overwrite(const std::size_t n) { template , bool> = true> auto make_unique_for_overwrite(Args &&...args) = delete; +/** + * @brief Fill the array @p dest with zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset. 
+ * @tparam T the type of the values + * @param[in,out] dest the array to fill with zeros + * @param[in] count the number of values to fill + */ template void parallel_zero_memset(T *dest, const std::size_t count) { PLSSVM_ASSERT(dest != nullptr, "The destination pointer may not be a nullptr!"); From fa5cea380199ef9c8204c2ffd4890ec389493c87 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 20:56:51 +0200 Subject: [PATCH 46/93] Update the HPX backend kernels. --- .../backends/HPX/kernel/cg_explicit/blas.hpp | 130 +++++++------- .../cg_explicit/kernel_matrix_assembly.hpp | 87 ++++----- .../kernel_matrix_assembly_blas.hpp | 111 +++++++----- .../backends/HPX/kernel/kernel_functions.hpp | 35 +--- .../backends/HPX/kernel/predict_kernel.hpp | 165 +++++++++--------- src/plssvm/backends/HPX/csvm.cpp | 35 ++-- 6 files changed, 299 insertions(+), 264 deletions(-) diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp index 20cbad247..99aeec376 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp @@ -34,60 +34,63 @@ namespace plssvm::hpx::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a symmetric matrix (memory optimized), @p B and @p C are matrices, and @p alpha and @p beta are scalars. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); 
+ PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_device_num_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_device_specific_num_rows; - const std::size_t row = idx % blocked_device_specific_num_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + sum += A_cache * 
B(global_i_idx, device_row_offset + dim_block + dim); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_j][internal_i] += sum; } } } @@ -95,13 +98,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -113,22 +117,22 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", 
C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); @@ -136,47 +140,51 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_num_mirror_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_num_mirror_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_num_mirror_rows; - const std::size_t row = idx % blocked_num_mirror_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * 
(dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx]; + sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); + } + temp[internal_j][internal_i] += sum; } } } - // apply the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp index af1d3c9e2..f4bf41d0d 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -32,82 +32,89 @@ namespace plssvm::hpx::detail { /** - * @brief Assemble the kernel matrix using the @p kernel function. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Assemble the kernel matrix using the @p kernel_function function. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += 
THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 06df89dac..78a0f93d1 100644 --- a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -34,25 +34,25 @@ namespace plssvm::hpx::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -61,64 +61,89 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx 
+ static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_row); + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_col); + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry - atomic_ref{ C(class_idx, global_col) } += alpha * temp_ij * B(class_idx, global_row); + atomic_ref{ C(class_block + class_idx, global_j_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp index 6c0cd8a43..35e79d01d 100644 --- a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp @@ -28,42 +28,17 @@ namespace plssvm::hpx::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp index 7ea68e172..050425b8a 100644 --- a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp @@ -16,7 +16,7 @@ #include "plssvm/backends/HPX/detail/utility.hpp" // plssvm::hpx::detail::atomic_ref #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix, plssvm::soa_matrix @@ -38,59 +38,63 @@ namespace plssvm::hpx::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); + 
PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", device_sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = support_vectors.num_cols(); const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated + // define the range over which should be iterated std::vector range(blocked_num_classes * blocked_num_features); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t feature = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t feature_idx = feature * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t feature_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t sv = 0; sv < device_specific_num_sv; ++sv) { - // perform the feature reduction calculation + // iterate over all support vectors + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha(global_class_idx, sv_offset + sv) * support_vectors(sv_offset + sv, global_feature_idx); + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, device_sv_offset + sv_block + sv) * support_vectors(device_sv_offset + sv_block + sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; } } } - // update global array with local one + // store the result back to the w vector for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = 
feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w(global_class_idx, global_feature_idx) = temp[internal_feature][internal_class]; + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; } } }); @@ -102,63 +106,64 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_classes = prediction.num_cols(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_classes); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_classes); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), 
range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_pp][internal_class] += w(global_class_idx, dim) * predict_points(global_pp_idx, dim); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); + } + temp[internal_class][internal_pp] += sum; } } } - // perform the dot product calculation + // store the result back to the w vector for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - if (device_global_pp_idx < device_specific_num_predict_points && global_class_idx < num_classes) { - prediction(global_pp_idx, global_class_idx) = temp[internal_pp][internal_class] - rho[global_class_idx]; - } + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; } } }); @@ -166,61 +171,63 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons /** * @brief Predict the @p predict_points_d using the @p kernel_function. 
- * @tparam kernel the type of the used kernel function + * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function * @param[out] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); - const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = 
predict_points.num_cols(); + const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_support_vectors); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_support_vectors); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_support_vectors; - const std::size_t sv = idx % blocked_num_support_vectors; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = sv * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (idx % blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), - predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); + } + temp[internal_sv][internal_pp] += sum; } } } @@ -228,25 +235,23 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned 
internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { if (global_sv_idx == 0) { - atomic_ref{ prediction(global_pp_idx, a) } += -rho[a]; + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += -rho[class_block + class_idx]; } - atomic_ref{ prediction(global_pp_idx, a) } += - temp[internal_pp][internal_sv] * alpha(a, global_sv_idx); + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } diff --git a/src/plssvm/backends/HPX/csvm.cpp b/src/plssvm/backends/HPX/csvm.cpp index 71f651688..4c24192dd 100644 --- a/src/plssvm/backends/HPX/csvm.cpp +++ b/src/plssvm/backends/HPX/csvm.cpp @@ -18,6 +18,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset} #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -120,26 +121,33 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // get the offset of the data points this device is responsible for const std::size_t row_offset = dist.place_row_offset(0); - std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix + // get the number of kernel matrix entries + const std::size_t num_entries = dist.calculate_explicit_kernel_matrix_num_entries_padded(0); + + // only explicitly store the upper triangular matrix + auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); + // initialize kernel matrix to all zeros in parallel + ::plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries); + const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { case kernel_function_type::linear: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case kernel_function_type::polynomial: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, 
device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const auto end = std::chrono::steady_clock::now(); @@ -200,16 +208,16 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s break; case solver_type::cg_explicit: { - const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); + const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); PLSSVM_ASSERT(!explicit_A.empty(), "The A matrix must not be empty!"); const auto start = std::chrono::steady_clock::now(); - detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); const std::size_t num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > std::size_t{ 0 }) { - detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); } const auto end = std::chrono::steady_clock::now(); @@ -261,6 +269,8 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s }); // wait until operation is completed wait.get(); + // restore padding entries by setting them to zero + C.restore_padding(); } //***************************************************// @@ -317,6 +327,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, [[maybe_unused]] const auto duration = std::chrono::duration_cast(end - start); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "w_kernel", duration })); } + // restore padding entries by setting them to zero + w.restore_padding(); // reduce w on all MPI ranks comm_.allreduce_inplace(w); @@ -358,6 +370,9 @@ aos_matrix csvm::predict_values(const parameter ¶ms, }); // wait until operation is completed wait.get(); + + // restore padding entries by setting them to zero + 
out.restore_padding(); return out; } From ff892127bfd52ea44fd498a64a4df558936ebc2a Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 14 Jun 2025 16:13:41 +0200 Subject: [PATCH 47/93] Some small changes: where possible change remaining const to constexpr, remove superfluous braces, add missing static_casts, and use correct THREAD_BLOCK_SIZE_uz. --- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 20 ++++++++-------- .../kernel_matrix_assembly_blas.cuh | 8 +++---- .../backends/CUDA/kernel/predict_kernel.cuh | 24 +++++++++---------- .../HIP/kernel/cg_explicit/blas.hip.hpp | 20 ++++++++-------- .../kernel_matrix_assembly_blas.hip.hpp | 8 +++---- .../HIP/kernel/predict_kernel.hip.hpp | 24 +++++++++---------- 6 files changed, 52 insertions(+), 52 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index bacc84852..ab6c7b11b 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -36,9 +36,9 @@ namespace plssvm::cuda::detail { */ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -128,9 +128,9 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t */ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -207,8 +207,8 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: */ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all 
values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -242,8 +242,8 @@ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, rea */ __global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index bf1ee66e5..9861f2fb7 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -45,9 +45,9 @@ namespace plssvm::cuda::detail { template __global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -118,7 +118,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses (only using the upper triangular matrix) - if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 285cdc3a6..9d20863c8 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -36,9 +36,9 @@ namespace plssvm::cuda::detail { */ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -115,9 +115,9 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con */ __global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr 
auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -200,9 +200,9 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t template __global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -232,8 +232,8 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA @@ -276,7 +276,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp index b2e9c8ce3..9f5821634 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp @@ -39,9 +39,9 @@ namespace plssvm::hip::detail { */ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, 
const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -131,9 +131,9 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t */ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -210,8 +210,8 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: */ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -245,8 +245,8 @@ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, rea */ __global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension 
const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 97ef0798b..2bc4a230f 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -47,9 +47,9 @@ namespace plssvm::hip::detail { template __global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -120,7 +120,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses (only using the upper triangular matrix) - if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 9aaba6c5e..6ba12a360 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -38,9 +38,9 @@ namespace plssvm::hip::detail { */ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -117,9 +117,9 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con */ __global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -202,9 +202,9 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t template __global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -234,8 +234,8 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA @@ -278,7 +278,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS From b4d553ab3fd22ee15814e228b1f34bee5313496c Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 14 Jun 2025 16:14:13 +0200 Subject: [PATCH 48/93] Update comments. 
--- include/plssvm/backends/HPX/kernel/predict_kernel.hpp | 2 +- include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp index 050425b8a..e98d09a58 100644 --- a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp @@ -156,7 +156,7 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons } } - // store the result back to the w vector + // update the global array with the local one for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { // calculate the indices to access the global data diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 7bea4b3c4..d8cd4a0be 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -154,7 +154,7 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons } } - // store the result back to the w vector + // update the global array with the local one for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { // calculate the indices to access the global data From 4020339b1eed19e92427d3470b53dd7f72c82709 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 14 Jun 2025 16:55:53 +0200 Subject: [PATCH 49/93] Rename sv to support_vectors for better readability and consistency. --- include/plssvm/backends/CUDA/kernel/predict_kernel.cuh | 6 +++--- include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 9d20863c8..d7ebf45a3 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -187,7 +187,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned biases - * @param[in] sv the support vectors + * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors @@ -198,7 +198,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); @@ -237,7 +237,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 6ba12a360..9ee22edc4 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -189,7 +189,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned biases - * @param[in] sv the support vectors + * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors @@ -200,7 +200,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); @@ -239,7 +239,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data From 39513f8a36be4c9bc821226bf32c0cd3e3323c08 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 12:01:10 +0200 Subject: [PATCH 50/93] Update some comments. --- .../kernel/cg_implicit/kernel_matrix_assembly_blas.cuh | 6 +++--- .../cg_implicit/kernel_matrix_assembly_blas.hip.hpp | 6 +++--- .../kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 5 ++++- .../kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 9 ++++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 9861f2fb7..186400757 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -58,10 +58,10 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices used in the current thread const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset - const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // calculate the indices used in the current thread, pays attention to coalesced memory accesses - const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // create two shared memory arrays used for caching @@ -117,7 +117,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], 
kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 2bc4a230f..b2bee8d46 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -60,10 +60,10 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices used in the current thread const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset - const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // calculate the indices used in the current thread, pays attention to coalesced memory accesses - const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // create two shared memory arrays used for caching @@ -119,7 +119,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; diff --git a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 78a0f93d1..d6abc8cab 100644 --- a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -82,6 +82,9 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// // iterate over all features for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -108,7 +111,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 391b9fd90..952225c06 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -75,6 +75,9 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// // iterate over all features for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -101,7 +104,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; @@ -128,13 +131,13 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector if (global_i_idx == global_j_idx) { // only apply once to the diagonal - for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { // apply it for the upper and lower triangular matrix - for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry From 3ef281db60488a7cbe891f9c704afb87a041074d Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 15:25:21 +0200 Subject: [PATCH 51/93] Also use trimmed names in performance tracking output. --- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 12910a7ae..861344f5b 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -147,7 +147,7 @@ void csvm::init(const target_platform target) { " [{}, {}]\n", device, trimmed_device_name); - device_names.emplace_back(device_name); + device_names.emplace_back(trimmed_device_name); } } From a20d76d2059ef822a97f3165c5db9193374850c7 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 15:26:09 +0200 Subject: [PATCH 52/93] Always use a loop for the custom powi function. --- .../backends/SYCL/kernel/kernel_functions.hpp | 35 +++---------------- 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp index 97c5c6248..6cfa159bc 100644 --- a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp @@ -30,42 +30,17 @@ namespace plssvm::sycl::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// From 1c55fb151d2e7b2f8e29a4a9ec9e8cf3c3015098 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 15:27:18 +0200 Subject: [PATCH 53/93] The get_default_queue now honors the default target platform. --- src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp b/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp index 28742b23f..6f14f9271 100644 --- a/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp @@ -10,6 +10,7 @@ #include "plssvm/backends/SYCL/DPCPP/detail/queue.hpp" // plssvm::adaptivecpp::detail::queue #include "plssvm/backends/SYCL/DPCPP/detail/queue_impl.hpp" // plssvm::dpcpp::detail::queue (PImpl implementation) +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/string_utility.hpp" // plssvm::detail::{as_lower_case, contains} #include "plssvm/detail/utility.hpp" // plssvm::detail::contains #include "plssvm/exceptions/exceptions.hpp" // plssvm::platform_devices_empty @@ -101,9 +102,11 @@ void device_synchronize(const queue &q) { } queue get_default_queue() { - queue q; - q.impl = std::make_shared(); - return q; + const auto &[devices, target] = detail::get_device_list(determine_default_target_platform()); + // at least one platform must be present + PLSSVM_ASSERT(!devices.empty(), "At least one device must be available!"); + // per default, use the first device for the tests + return devices.front(); } std::string get_dpcpp_version() { From b6b98fc6e5f59bc22fe4a928cdc33473a02051bf Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 21 Jun 2025 17:38:25 +0200 Subject: [PATCH 54/93] Improve the AdaptiveCpp device pointer creation performance on CPUs with the OpenMP backend. 
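Note on this patch: the hunk below short-circuits zero-initialization on the OpenMP CPU backend to plssvm::detail::parallel_zero_memset instead of going through sycl::queue::memset. The following is only a minimal sketch of the underlying idea using a hypothetical standalone helper (name, signature, and chunking are illustrative, not the library's actual implementation): zeroing a large host allocation from all OpenMP threads is much faster than a single-threaded memset and, via first-touch, also places the pages on the NUMA nodes that later access them.

    #include <cstddef>  // std::size_t
    #include <cstring>  // std::memset

    #include <omp.h>

    // Hypothetical helper: zero `count` elements of a trivially-copyable type T using all
    // OpenMP threads; each thread memsets one contiguous chunk (parallel first-touch).
    template <typename T>
    void parallel_zero_memset_sketch(T *ptr, const std::size_t count) {
    #pragma omp parallel
        {
            const auto num_threads = static_cast<std::size_t>(omp_get_num_threads());
            const auto thread_id = static_cast<std::size_t>(omp_get_thread_num());
            // split the range into roughly equal contiguous chunks
            const std::size_t chunk = (count + num_threads - 1) / num_threads;
            const std::size_t begin = thread_id * chunk;
            const std::size_t end = begin + chunk < count ? begin + chunk : count;
            if (begin < end) {
                std::memset(ptr + begin, 0, (end - begin) * sizeof(T));
            }
        }
    }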
--- .../backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp index 0338d10c9..44a9b9108 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp @@ -12,6 +12,7 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue_impl.hpp" // plssvm::adaptivecpp::detail::queue (PImpl implementation) #include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::parallel_zero_memset #include "plssvm/matrix.hpp" // plssvm::aos_matrix #include "plssvm/shape.hpp" // plssvm::shape @@ -56,7 +57,14 @@ void device_ptr::memset(const int pattern, const size_type pos, const size_ty throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; } const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type)); - queue_.impl->sycl_queue.memset(static_cast(data_ + pos), pattern, rnum_bytes).wait(); + + ::sycl::queue &queue = queue_.impl->sycl_queue; + // using our OpenMP enhanced 0 memset functions has dramatically better performance on the OpenMP CPU backend + if (pattern == 0 && queue.get_device().is_cpu() && queue.get_device().get_backend() == ::sycl::backend::omp) { + ::plssvm::detail::parallel_zero_memset(data_ + pos, rnum_bytes / sizeof(value_type)); + } else { + queue.memset(static_cast(data_ + pos), pattern, rnum_bytes).wait(); + } } template From 88e5d80ad781c9ef9051e0de06b77896e5afb65f Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sun, 22 Jun 2025 14:00:22 +0200 Subject: [PATCH 55/93] Based on the provided CPU target architectures, set the correct preferred vector width. Reason: GCC and clang refuse to use AVX-512 for Intel CPUs in their auto-vectorizers even on new Intel CPUs that fully support it. 
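Note on this patch: -mprefer-vector-width is an existing GCC/Clang code-generation option; the commit only wires it up based on PLSSVM_TARGET_PLATFORMS. The toy translation unit below (not part of PLSSVM; the file name and -march value are just examples) can be used to observe the effect: with -O3 -march=skylake-avx512 alone both compilers typically stick to 256-bit ymm instructions, while additionally passing -mprefer-vector-width=512 lets the auto-vectorizer emit 512-bit zmm instructions.

    // axpy_vec_width.cpp -- inspect the chosen vector width, e.g.:
    //   g++ -O3 -march=skylake-avx512                           -S axpy_vec_width.cpp   // usually ymm (256 bit)
    //   g++ -O3 -march=skylake-avx512 -mprefer-vector-width=512 -S axpy_vec_width.cpp   // zmm (512 bit)
    #include <cstddef>  // std::size_t

    void axpy(double *__restrict__ y, const double *__restrict__ x, const double alpha, const std::size_t n) {
        // simple, trivially auto-vectorizable loop
        for (std::size_t i = 0; i < n; ++i) {
            y[i] += alpha * x[i];
        }
    }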
--- CMakeLists.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b5c16f86..10de8e060 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -638,6 +638,37 @@ if (PLSSVM_ENABLE_LTO) endif () endif () +######################################################################################################################## +# enable the requested vectorization widths for the auto-vectorizers # +######################################################################################################################## +# GCC and clang both do not automatically auto-vectorize for AVX-512 (only AVX2) +# -> enable it if "cpu:avx512" was passed as PLSSVM_TARGET_PLATFORMS +if (PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1) + if (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx512") + message(STATUS "Enabling AVX512 support for the auto-vectorizers (-mprefer-vector-width=512).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=512>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx2" OR ${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx") + message(STATUS "Enabling AVX/AVX2 support for the auto-vectorizers (-mprefer-vector-width=256).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=256>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} MATCHES "^sse") + message(STATUS "Enabling SSE for the auto-vectorizers (-mprefer-vector-width=128).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=128>> + ) + else () + message(FATAL_ERROR "Unrecognized CPU target architecture \"${PLSSVM_CPU_TARGET_ARCHS}\". Allowed values are: avx512, avx2, avx, sse.") + endif () +else () + # automatically use the "optimal" auto-vectorizer width +endif () + ######################################################################################################################## # check for optional and necessary dependencies # ######################################################################################################################## From 56a0f7d7a903ceb54aa16f73051cea5800307a8c Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sun, 22 Jun 2025 14:40:27 +0200 Subject: [PATCH 56/93] Update the SYCL backend kernels. Now: some parts of the kernels are specialized for the CPU for better performance. 
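Note on this patch: the kernels below now pick a different loop nest per target via if constexpr. On the CPU the reduction dimension becomes the innermost (fastest moving) loop, so each work-item accumulates a contiguous dot product into a scalar that the compiler can vectorize; on GPUs it stays the outermost (slowest moving) loop, keeping the blocked access pattern of the original kernels. A condensed, self-contained sketch of that pattern (names and data layout simplified, not the actual PLSSVM kernels):

    #include <cstddef>  // std::size_t

    enum class target_platform { cpu, gpu };

    // Per-target loop-order specialization: both branches compute the same partial sums,
    // they only differ in which loop moves fastest.
    template <target_platform target, unsigned block_size>
    void partial_dot(double (&temp)[block_size], const double *a, const double *b, const std::size_t len) {
        if constexpr (target == target_platform::cpu) {
            // CPU: reduction dimension innermost -> contiguous, auto-vectorizable inner loop
            for (unsigned internal = 0; internal < block_size; ++internal) {
                double sum{ 0.0 };
                for (std::size_t dim = 0; dim < len; ++dim) {
                    sum += a[dim] * b[internal * len + dim];
                }
                temp[internal] += sum;
            }
        } else {
            // GPU-like ordering: reduction dimension outermost -> all `internal` updates for one
            // dim happen back to back, matching the register-tiled structure of the device kernels
            for (std::size_t dim = 0; dim < len; ++dim) {
                for (unsigned internal = 0; internal < block_size; ++internal) {
                    temp[internal] += a[dim] * b[internal * len + dim];
                }
            }
        }
    }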
--- .../SYCL/kernel/cg_explicit/basic/blas.hpp | 230 ++-- .../basic/kernel_matrix_assembly.hpp | 77 +- .../kernel/cg_explicit/hierarchical/blas.hpp | 359 +++--- .../hierarchical/kernel_matrix_assembly.hpp | 110 +- .../SYCL/kernel/cg_explicit/scoped/blas.hpp | 353 +++--- .../scoped/kernel_matrix_assembly.hpp | 106 +- .../kernel/cg_explicit/work_group/blas.hpp | 311 +++-- .../work_group/kernel_matrix_assembly.hpp | 94 +- .../basic/kernel_matrix_assembly_blas.hpp | 136 ++- .../kernel_matrix_assembly_blas.hpp | 237 ++-- .../scoped/kernel_matrix_assembly_blas.hpp | 283 +++-- .../kernel_matrix_assembly_blas.hpp | 183 +-- .../kernel/predict/basic/predict_kernel.hpp | 315 +++-- .../predict/hierarchical/predict_kernel.hpp | 483 +++++--- .../kernel/predict/scoped/predict_kernel.hpp | 513 ++++---- .../predict/work_group/predict_kernel.hpp | 419 ++++--- src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp | 1071 ++++------------- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 894 +++----------- 18 files changed, 2974 insertions(+), 3200 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp index b55b374fe..4d19c4746 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,33 +66,63 @@ class device_kernel_symm { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows_ - row_offset_); ++dim) { - // perform the dot product calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + 
static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_j) { - A_val = A_[dim * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + // iterate over all values using blocking + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_val = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_val = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_val = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + sum += A_val * B_[((dim_block + dim) + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type A_val = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_val = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_val = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + temp[internal_i][internal_j] += A_val * B_[((dim_block + dim) + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } } - - temp[internal_i][internal_j] += A_val * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } } } @@ -93,13 +130,14 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + 
static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -109,8 +147,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -125,16 +163,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -143,12 +186,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -162,25 +205,49 @@ class device_kernel_symm_mirror { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < 
INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); - - temp[internal_i][internal_j] += A_[(dim) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows_ - dim + global_j] * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + } } } } @@ -188,13 +255,14 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = 
device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -205,8 +273,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -223,6 +291,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -244,19 +315,21 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -277,6 +350,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_cols the number of columns in the matrix @@ -298,19 +374,21 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp index 22b24bae0..f808c56fc 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -14,8 +14,10 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -27,12 +29,16 @@ namespace plssvm::sycl::detail::basic { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[out] kernel_matrix the calculated kernel matrix @@ -60,7 +66,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -74,22 +80,45 @@ class device_kernel_assembly { constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // only calculate the upper triangular matrix - if (i >= j) { + if (i_idx >= j_idx) { // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = device_row_offset_ + i + static_cast(internal_i); - const auto global_j = device_row_offset_ + j + static_cast(internal_j); - temp[internal_i][internal_j] += detail::feature_reduce(data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + 
feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -97,23 +126,23 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp index 5e5803652..627eaadbe 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -24,15 +26,20 @@ namespace 
plssvm::sycl::detail::hierarchical { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,36 +66,15 @@ class device_kernel_symm { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - // cast 
all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -97,30 +83,44 @@ class device_kernel_symm { } }); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + 
const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); @@ -128,13 +128,28 @@ class device_kernel_symm { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; 
internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } @@ -145,17 +160,31 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -166,8 +195,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -182,16 +211,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B 
+ beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -200,12 +234,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -219,36 +253,15 @@ class device_kernel_symm_mirror { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array 
used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices and diagonal condition - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -256,39 +269,67 @@ class device_kernel_symm_mirror { } }); - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into shared memory + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = 
static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); // implicit barrier - // perform the feature reduction calculation + // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + 
internal_i]; + } } } } @@ -299,17 +340,31 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -321,8 +376,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -339,6 +394,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
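The C update above and the element-wise in-place kernels that follow all use the same padded row-major layout: element (row, col) of a matrix with num_cols columns lives at row * (num_cols + PADDING_SIZE) + col. A minimal host-side reference of what device_kernel_inplace_matrix_add computes under that layout (standalone sketch, double stands in for real_type, not part of the patch):

#include <cstddef>
#include <vector>

// Minimal host-side reference for the element-wise in-place add, assuming the
// padded row-major layout described above: element (row, col) is stored at
// row * (num_cols + padding) + col. double stands in for real_type; the padding
// entries at the end of each row are deliberately left untouched.
void inplace_matrix_add_reference(std::vector<double> &lhs, const std::vector<double> &rhs,
                                  const std::size_t num_rows, const std::size_t num_cols,
                                  const std::size_t padding) {
    for (std::size_t row = 0; row < num_rows; ++row) {
        for (std::size_t col = 0; col < num_cols; ++col) {
            lhs[row * (num_cols + padding) + col] += rhs[row * (num_cols + padding) + col];
        }
    }
}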
* @param[in] num_cols the number of columns in both matrices @@ -361,25 +419,27 @@ class device_kernel_inplace_matrix_add { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -401,6 +461,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
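The index arithmetic above hands every work-item its own INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile of the matrix; the scale kernel below reuses the same mapping. A standalone sketch of that mapping with plain integers instead of the SYCL id types (hypothetical helper name, illustrative only):

#include <cstddef>

// Illustrative only: reproduces the i_idx/j_idx computation above for a single
// work-item, i.e. the top-left element of the INTERNAL_BLOCK_SIZE x
// INTERNAL_BLOCK_SIZE tile that work-item updates.
struct tile_origin {
    std::size_t i_idx;  // first row handled by this work-item
    std::size_t j_idx;  // first column handled by this work-item
};

constexpr tile_origin work_item_tile_origin(const std::size_t block_idx_x, const std::size_t block_idx_y,
                                            const std::size_t block_dim_x, const std::size_t block_dim_y,
                                            const std::size_t thread_idx_x, const std::size_t thread_idx_y,
                                            const std::size_t internal_block_size) {
    return tile_origin{ (block_idx_y * block_dim_y + thread_idx_y) * internal_block_size,
                        (block_idx_x * block_dim_x + thread_idx_x) * internal_block_size };
}

For example, with a 16 x 16 work-group and INTERNAL_BLOCK_SIZE = 4 (hypothetical values), the work-item (thread_idx_x, thread_idx_y) = (0, 1) in work-group (0, 0) starts at row 4, column 0 and updates rows 4-7, columns 0-3.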
* @param[in] num_cols the number of columns in the matrix @@ -423,25 +486,27 @@ class device_kernel_inplace_matrix_scale { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp index d3e37ca54..3bc6d0878 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -14,8 +14,10 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // 
plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,12 +30,16 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[out] kernel_matrix the calculated kernel matrix @@ -61,7 +67,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -69,7 +75,7 @@ class device_kernel_assembly { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // create two local memory arrays used for caching data point features + // create two local memory arrays used for caching real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; @@ -78,8 +84,17 @@ class device_kernel_assembly { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { + // initialize private temp matrix to zero + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast values to 32-bit unsigned int values to prevent implicit conversions @@ -91,25 +106,25 @@ class device_kernel_assembly { constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = 
group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item paying attention to coalesced memory accesses - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -121,11 +136,26 @@ class device_kernel_assembly { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 
0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -140,36 +170,36 @@ class device_kernel_assembly { constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points 
and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp index 2e6983255..9d3d6bef8 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. 
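The kernel matrix written above is stored as a packed upper triangle: row j keeps only the entries i >= j, so each row is one entry shorter than its predecessor (plus padding). The offset expression used there reduces to j * n - j * (j + 1) / 2 + i with n = num_rows_ - device_row_offset_ + PADDING_SIZE. A small standalone check of that formula (illustrative values, not part of the patch):

#include <cstddef>

// Illustrative only: linear offset of element (i, j) with i >= j in the packed
// upper triangular layout used for the kernel matrix above; row j stores the
// entries j, j + 1, ..., n - 1, where n already includes the padding.
constexpr std::size_t packed_upper_triangular_offset(const std::size_t i, const std::size_t j, const std::size_t n) {
    return j * n - j * (j + 1) / 2 + i;
}

// row 0 starts at offset 0, row 1 directly behind the n entries of row 0, and
// row 2 behind the n - 1 entries of row 1 (made-up n, any value works)
static_assert(packed_upper_triangular_offset(0, 0, 6) == 0);
static_assert(packed_upper_triangular_offset(1, 1, 6) == 6);
static_assert(packed_upper_triangular_offset(2, 2, 6) == 2 * 6 - 1);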
+ constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -62,85 +69,111 @@ class device_kernel_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz 
+ threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into shared memory + // create two local memory arrays used for caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - 
(dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (partial) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group 
y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -152,8 +185,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -168,16 +201,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
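// A minimal host-side sketch of the packed upper-triangular indexing used for A above, assuming the
// row-major layout with `pad` padding entries per row implied by the loads (in the kernels, n corresponds
// to num_rows_ - device_row_offset_ and pad to PADDING_SIZE). The helper names are illustrative only.
#include <cstddef>

// Linear index of entry (row, col) with row <= col in a packed n x n upper triangular matrix;
// the `row * (row + 1) / 2` term skips the entries below the diagonal that are not stored.
std::size_t packed_upper_index(const std::size_t row, const std::size_t col, const std::size_t n, const std::size_t pad) {
    return row * (n + pad) + col - row * (row + 1) / 2;
}

// Symmetric access as in the SYMM kernels: swap the indices when below the diagonal, which
// corresponds to the `dim_block + threadIdx_x < global_j_idx_linear` branch above.
std::size_t packed_symmetric_index(const std::size_t row, const std::size_t col, const std::size_t n, const std::size_t pad) {
    return row <= col ? packed_upper_index(row, col, n, pad) : packed_upper_index(col, row, n, pad);
}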
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -186,12 +224,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -208,80 +246,105 @@ class device_kernel_symm_mirror { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = 
blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into shared memory + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + 
device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (remaining) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + 
offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -294,8 +357,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -312,6 +375,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
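// For reference, the operation implemented by device_kernel_symm and device_kernel_symm_mirror together
// is the BLAS-like update C = alpha * A * B + beta * C with a symmetric A (m == k). A minimal sequential
// sketch with a dense, unpadded, row-major A (k x k), B (k x n), and C (k x n); `double` stands in for
// plssvm::real_type and the function name is illustrative only.
#include <cstddef>
#include <vector>

void reference_symm(const std::size_t k, const std::size_t n, const double alpha,
                    const std::vector<double> &A, const std::vector<double> &B,
                    const double beta, std::vector<double> &C) {
    for (std::size_t row = 0; row < k; ++row) {
        for (std::size_t col = 0; col < n; ++col) {
            double sum{ 0.0 };
            for (std::size_t dim = 0; dim < k; ++dim) {
                sum += A[row * k + dim] * B[dim * n + col];  // A is symmetric, so A[row][dim] == A[dim][row]
            }
            C[row * n + col] = alpha * sum + beta * C[row * n + col];
        }
    }
}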
* @param[in] num_cols the number of columns in both matrices @@ -336,28 +402,29 @@ class device_kernel_inplace_matrix_add { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -380,6 +447,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
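// The two in-place kernels above perform simple element-wise updates on matrices stored row-major with
// PADDING_SIZE extra entries per row. A minimal sequential sketch assuming the same
// lhs[i * (num_cols + pad) + j] indexing; the function names and `double` are illustrative only.
#include <cstddef>

void inplace_matrix_add(double *lhs, const double *rhs, const std::size_t num_rows, const std::size_t num_cols, const std::size_t pad) {
    for (std::size_t i = 0; i < num_rows; ++i) {
        for (std::size_t j = 0; j < num_cols; ++j) {
            lhs[i * (num_cols + pad) + j] += rhs[i * (num_cols + pad) + j];
        }
    }
}

void inplace_matrix_scale(double *lhs, const double scale, const std::size_t num_rows, const std::size_t num_cols, const std::size_t pad) {
    for (std::size_t i = 0; i < num_rows; ++i) {
        for (std::size_t j = 0; j < num_cols; ++j) {
            lhs[i * (num_cols + pad) + j] *= scale;
        }
    }
}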
* @param[in] num_cols the number of columns in the matrix @@ -404,28 +474,29 @@ class device_kernel_inplace_matrix_scale { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp index 33c725a46..b882cdead 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -14,8 +14,10 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, 
apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -28,12 +30,16 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[out] kernel_matrix the calculated kernel matrix @@ -61,7 +67,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -72,14 +78,17 @@ class device_kernel_assembly { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), // data_i_cache - ::sycl::require_local_mem(), // data_j_cache + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // data_i_cache + ::sycl::require_local_mem(), // data_j_cache + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), // temp [&](auto &data_i_cache, auto &data_j_cache, auto &temp) { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast values to 32-bit unsigned int values to prevent implicit conversions @@ -91,25 +100,25 @@ class device_kernel_assembly { constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - 
const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item paying attention to coalesced memory accesses - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -119,11 +128,26 @@ class device_kernel_assembly { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned 
internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -136,36 +160,36 @@ class device_kernel_assembly { constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points 
and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index 965b043a3..5c0949c34 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -24,16 +26,21 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. 
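// Sketch of how a single explicit kernel matrix entry is assembled by device_kernel_assembly: the reduced
// feature value is passed through the kernel function, shifted by QA_cost - q[i] - q[j], and the cost term
// is added on the diagonal. A Gaussian (RBF) kernel is assumed here purely as an example; the real code
// dispatches via detail::feature_reduce and detail::apply_kernel_function.
#include <cmath>
#include <cstddef>

double assemble_rbf_entry(const double *x_i, const double *x_j, const std::size_t num_features,
                          const double gamma, const double QA_cost, const double q_i, const double q_j,
                          const double cost, const bool on_diagonal) {
    double reduced{ 0.0 };
    for (std::size_t f = 0; f < num_features; ++f) {
        const double d = x_i[f] - x_j[f];
        reduced += d * d;  // feature reduction for the RBF kernel: squared Euclidean distance
    }
    double entry = std::exp(-gamma * reduced) + QA_cost - q_i - q_j;  // apply the kernel function and the q/QA_cost correction
    if (on_diagonal) {
        entry += cost;  // the cost is only added to the diagonal entries
    }
    return entry;
}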
+ constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -42,13 +49,13 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -67,64 +74,85 @@ class device_kernel_symm { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += THREAD_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * 
(global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < 
num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -139,8 +167,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -155,17 +183,22 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. 
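// The `if constexpr (target == target_platform::cpu)` branches above only change the loop nesting of the
// blocked dot product: on the CPU the `dim` loop is innermost and accumulates into a scalar, on GPUs it is
// outermost. Both orders produce the same `temp` values (up to floating-point rounding order). A minimal
// sketch using a per-work-item view of the caches; IB and TB are illustrative stand-ins for
// INTERNAL_BLOCK_SIZE and THREAD_BLOCK_SIZE.
constexpr unsigned IB = 4;
constexpr unsigned TB = 8;

void accumulate_dim_innermost(const double A[TB][IB], const double B[TB][IB], double temp[IB][IB]) {
    for (unsigned i = 0; i < IB; ++i) {
        for (unsigned j = 0; j < IB; ++j) {
            double sum{ 0.0 };  // running sum kept in a register
            for (unsigned dim = 0; dim < TB; ++dim) {
                sum += A[dim][j] * B[dim][i];
            }
            temp[i][j] += sum;
        }
    }
}

void accumulate_dim_outermost(const double A[TB][IB], const double B[TB][IB], double temp[IB][IB]) {
    for (unsigned dim = 0; dim < TB; ++dim) {
        for (unsigned i = 0; i < IB; ++i) {
            for (unsigned j = 0; j < IB; ++j) {
                temp[i][j] += A[dim][j] * B[dim][i];
            }
        }
    }
}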
* @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -174,14 +207,14 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -200,59 +233,79 @@ class device_kernel_symm_mirror { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += THREAD_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // store the values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory 
+ for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be 
sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -268,8 +321,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -286,6 +339,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -307,25 +363,27 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for 
(unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -346,6 +404,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in the matrix @@ -367,25 +428,27 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < 
INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 560d556ea..ec9fc1773 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -14,6 +14,7 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform @@ -28,13 +29,16 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. - * @details target the target platform + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory @@ -65,7 +69,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) 
} { } /** @@ -82,12 +86,12 @@ class device_kernel_assembly { constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { @@ -95,75 +99,75 @@ class device_kernel_assembly { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // calculate the indices used in the current work-item paying attention to coalesced memory accesses - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * 
THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - if constexpr (target == target_platform::gpu_amd) { - // perform the feature reduction calculation, the block_dim is the slowest moving index - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); - } - } - } - } else { - // perform the feature reduction calculation, the block_dim is the fastest moving index + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); } temp[internal_i][internal_j] += sum; } } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } } // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx 
= (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp index 9c82ad31d..c07186c37 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an implicit 
BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
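(The kernel-matrix store in the preceding hunk uses a packed, row-wise upper-triangular layout with a padded row length of num_rows_ - device_row_offset_ + PADDING_SIZE. A small helper that spells out that index arithmetic; the helper and its names are illustrative and not part of the patch:

    #include <cassert>
    #include <cstddef>

    // Packed index of entry (row j, column i), i >= j, in an upper-triangular
    // matrix whose full (padded) row length is n_padded. Row j holds the
    // entries i = j, ..., n_padded - 1, so its first entry sits behind
    // sum_{r < j} (n_padded - r) = j * n_padded - j * (j - 1) / 2 values.
    inline std::size_t packed_upper_index(const std::size_t j, const std::size_t i, const std::size_t n_padded) {
        assert(i >= j && i < n_padded);
        return j * n_padded - j * (j + 1) / 2 + i;
    }

    // e.g., with n_padded = 4 the packed order is
    // (0,0)(0,1)(0,2)(0,3)(1,1)(1,2)(1,3)(2,2)(2,3)(3,3)
    // and packed_upper_index(2, 3, 4) == 2*4 - 3 + 3 == 8.
)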
@@ -75,28 +81,53 @@ class device_kernel_assembly_symm { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // only calculate the upper triangular matrix - if (i >= j) { + if (i_idx >= j_idx) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < 
INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -104,28 +135,48 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { - temp_ij += cost_; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + if (global_i_idx == global_j_idx) { + temp[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = 
device_row_offset_ + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; // symmetry - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } } @@ -137,11 +188,12 @@ class device_kernel_assembly_symm { private: /// @cond Doxygen_suppress const real_type alpha_; + const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp index 342e8308b..ea9197444 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's hierarchical data parallel kernels. 
+ * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,41 +73,45 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // the indices used in the current work-item + ::sycl::private_memory i_idx{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx{ group }; // device_num_rows + + ::sycl::private_memory i_idx_linear{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx_linear{ group }; // device_num_rows - // create the shared memory arrays used for caching data point features - real_type data_cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; // initialize private and local variables group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * 
INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -115,30 +125,36 @@ class device_kernel_assembly_symm { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto data_cache_i = reinterpret_cast(data_cache_one); - auto data_cache_j = reinterpret_cast(data_cache_two); + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + 
data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -146,14 +162,30 @@ class device_kernel_assembly_symm { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -167,16 +199,18 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && 
global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -189,38 +223,44 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // reinterpret the arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] - auto B_cache = reinterpret_cast(data_cache_one); - auto C_out_cache = reinterpret_cast(data_cache_two); + // reinterpret the local memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; + B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; // SoA } }); // implicit group barrier - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast 
values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -235,18 +275,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_y = idx.get_local_id(1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; // SoA } }); @@ -258,10 +302,11 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } @@ -270,38 +315,44 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto B_cache = reinterpret_cast(data_cache_one); - auto C_out_cache = reinterpret_cast(data_cache_two); + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { 
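(The class-blocked loops in these kernels stage alpha * B in local memory, accumulate a block of partial results in C_out_cache, and only afterwards add the block to the global C matrix atomically, so each output element sees one atomic update per class block rather than one per multiply-add. A host-side sketch of that add-back step; the names are illustrative, the patch itself uses its detail::atomic_op wrapper on SYCL local memory, and std::atomic_ref requires C++20:

    #include <atomic>
    #include <cstddef>
    #include <vector>

    // Add a cached block of partial results into C with one atomic update per
    // (row, class) pair. 'partial' stands in for the kernel's C_out_cache.
    void add_block_to_C(std::vector<double> &C, const std::vector<double> &partial,
                        std::size_t row_begin, std::size_t class_begin,
                        std::size_t block_rows, std::size_t block_classes,
                        std::size_t num_classes) {
        for (std::size_t r = 0; r < block_rows; ++r) {
            for (std::size_t c = 0; c < block_classes; ++c) {
                std::atomic_ref<double> out{ C[(row_begin + r) * num_classes + class_begin + c] };
                out.fetch_add(partial[r * block_classes + c], std::memory_order_relaxed);
            }
        }
    }
)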
group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + // store the values in the local memory + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -316,18 +367,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += 
C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA
                             }
                         });
@@ -341,10 +396,10 @@ class device_kernel_assembly_symm {
     /// @cond Doxygen_suppress
     const real_type alpha_;
     const real_type *q_;
-    const real_type *data_d_;
+    const real_type *data_;
     const std::size_t num_rows_;
     const std::size_t device_num_rows_;
-    const std::size_t row_offset_;
+    const std::size_t device_row_offset_;
     const std::size_t num_features_;
     const real_type QA_cost_;
     const real_type cost_;
diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
index c84db480f..c833b19da 100644
--- a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
+++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
@@ -15,8 +15,10 @@
 #include "plssvm/backends/SYCL/detail/atomics.hpp"           // plssvm::sycl::detail::atomic_op
 #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
 #include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                  // plssvm::kernel_function_type
+#include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
 
 #include "sycl/sycl.hpp"  // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item
 
@@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::scoped {
 /**
  * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
  * @details Uses AdaptiveCpp's scoped parallelism.
+ * @tparam target the target platform
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function
  */
-template <kernel_function_type kernel_function, typename... Args>
+template <target_platform target, kernel_function_type kernel_function, typename... Args>
 class device_kernel_assembly_symm {
   public:
+    /// The used SYCL kernel invocation type.
+    constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped;
+
     /**
      * @brief Initialize the SYCL kernel function object.
* @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
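// The constructor above copies the variadic kernel-function parameters into a std::tuple member;
// they are only expanded again when the kernel function is finally applied to the accumulated
// value. A small self-contained sketch of that capture-and-apply pattern, assuming a hypothetical
// RBF-style kernel with a single gamma parameter (apply_rbf() is not the plssvm API):
#include <cmath>
#include <cstdio>
#include <tuple>

template <typename... Args>
class kernel_parameter_holder {
  public:
    explicit kernel_parameter_holder(Args... args) :
        parameter_{ std::make_tuple(args...) } { }  // stored by value, no std::forward needed for trivially copyable parameters

    // expand the stored tuple back into an argument list via std::apply
    double apply_rbf(const double squared_distance) const {
        return std::apply([&](const double gamma) { return std::exp(-gamma * squared_distance); }, parameter_);
    }

  private:
    const std::tuple<Args...> parameter_;
};

int main() {
    const kernel_parameter_holder<double> holder{ 0.5 };  // gamma = 0.5
    std::printf("%f\n", holder.apply_rbf(2.0));            // exp(-0.5 * 2.0) = exp(-1)
    return 0;
}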
@@ -77,86 +83,124 @@ class device_kernel_assembly_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // the indices used in the current work-item + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + [&](auto &i_idx, auto &j_idx, auto &i_idx_linear, auto &j_idx_linear, auto &cache_one, auto &cache_two, auto &temp) { // initialize private and local variables ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, 
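// The index setup above distinguishes between a "blocked" index (every work-item owns
// INTERNAL_BLOCK_SIZE consecutive rows) and a "linear" index (the rows of a work-group are
// interleaved so that neighboring work-items load neighboring rows, i.e. coalesced accesses).
// A minimal sketch of both mappings with illustrative names and stand-in block sizes:
#include <cstddef>

constexpr unsigned INTERNAL_BLOCK = 4;  // stand-in for INTERNAL_BLOCK_SIZE

// first row handled by a work-item when each work-item owns a contiguous block of rows
constexpr std::size_t blocked_index(const std::size_t group_id, const std::size_t group_size, const std::size_t local_id) {
    return (group_id * group_size + local_id) * INTERNAL_BLOCK;
}

// row handled by a work-item in iteration `internal` when the rows are interleaved across the work-group
constexpr std::size_t linear_index(const std::size_t group_id, const std::size_t group_size, const std::size_t local_id, const unsigned internal) {
    return group_id * group_size * INTERNAL_BLOCK + internal * group_size + local_id;
}

static_assert(blocked_index(1, 8, 3) == 44);    // (1 * 8 + 3) * 4
static_assert(linear_index(1, 8, 3, 2) == 51);  // 1 * 8 * 4 + 2 * 8 + 3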
pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - // exploit symmetry + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into local memory - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + { + // rename cached arrays + auto &data_i_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the local memory - data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - }); - - // perform the feature reduction calculation - 
::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + // store the values in the local memory + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } } } - } - }); + }); + } } // apply the remaining part of the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + 
static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -167,36 +211,42 @@ class device_kernel_assembly_symm { } }); - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &B_cache = cache_one; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global 
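// A scalar sketch of how one entry of the dimensionally reduced kernel matrix is assembled in the
// branch above: the raw kernel value (the result of detail::apply_kernel_function on the reduced
// features) is shifted by QA_cost and the two q-vector entries, and the cost factor is only added
// on the main diagonal. The function name is illustrative.
#include <cstddef>

inline double kernel_matrix_entry(const double kernel_value, const double QA_cost, const double cost,
                                  const double q_i, const double q_j,
                                  const std::size_t global_i, const std::size_t global_j) {
    double entry = kernel_value + QA_cost - q_i - q_j;  // dimensional reduction
    if (global_i == global_j) {
        entry += cost;  // the cost factor only scales the main diagonal
    }
    return entry;
}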
data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } }); - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -209,18 +259,22 @@ class device_kernel_assembly_symm { }); } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); } @@ -230,48 +284,53 @@ class device_kernel_assembly_symm { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { 
temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } } }); - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // allocate shared memory - auto &B_cache = data_cache_i; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // rename local memory + auto &B_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); - // implicit group barrier - - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -282,26 +341,26 @@ class device_kernel_assembly_symm { } } }); - - // implicit group 
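// The two passes above exploit the symmetry of the kernel matrix: each work-group only assembles
// one triangle of A, uses it once as-is ("UPPER" pass) and once transposed ("LOWER" pass), and
// zeroes the diagonal of its cached tile in between so the main diagonal is not applied twice.
// A plain host-side reference of that idea on dense, unpadded row-major arrays (sizes and names
// are illustrative, not the plssvm data layout):
#include <cstddef>
#include <vector>

void symm_like_update(const std::vector<double> &A_tri,  // n x n, only entries with i <= j are valid
                      const std::vector<double> &B,      // n x k
                      std::vector<double> &C,            // n x k, updated in place
                      const double alpha, const std::size_t n, const std::size_t k) {
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = i; j < n; ++j) {  // one triangle, including the diagonal
            const double a = alpha * A_tri[i * n + j];
            for (std::size_t c = 0; c < k; ++c) {
                C[i * k + c] += a * B[j * k + c];      // first pass: use the entry as-is
                if (i != j) {                          // diagonal contributes only once
                    C[j * k + c] += a * B[i * k + c];  // second pass: use the entry transposed
                }
            }
        }
    }
}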
barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); - - // implicit group barrier } } } @@ -312,10 +371,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp index 2e6ea3f4f..509e6cb25 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -28,21 +30,25 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's work-group data parallel kernels. 
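// In contrast to the scoped-parallelism kernels above, this header uses plain SYCL work-group data
// parallel (nd_range) kernels: local memory is requested through sycl::local_accessor objects in the
// command group, and explicit barriers synchronize the work-items of a work-group. A minimal,
// self-contained sketch of that kernel style (the per-work-group sum is only a toy example, not the
// plssvm kernel):
#include <cstddef>
#include <vector>

#include <sycl/sycl.hpp>

int main() {
    constexpr std::size_t local_size = 64;
    constexpr std::size_t global_size = 1024;

    sycl::queue q{};
    std::vector<float> input(global_size, 1.0f);
    std::vector<float> partial(global_size / local_size, 0.0f);
    {
        sycl::buffer<float, 1> in_buf{ input.data(), sycl::range<1>{ global_size } };
        sycl::buffer<float, 1> out_buf{ partial.data(), sycl::range<1>{ partial.size() } };

        q.submit([&](sycl::handler &cgh) {
            sycl::accessor in{ in_buf, cgh, sycl::read_only };
            sycl::accessor out{ out_buf, cgh, sycl::write_only };
            sycl::local_accessor<float, 1> cache{ sycl::range<1>{ local_size }, cgh };  // work-group local memory

            cgh.parallel_for(sycl::nd_range<1>{ sycl::range<1>{ global_size }, sycl::range<1>{ local_size } }, [=](sycl::nd_item<1> it) {
                const std::size_t lid = it.get_local_id(0);
                cache[lid] = in[it.get_global_id(0)];  // cooperatively load into local memory
                sycl::group_barrier(it.get_group());   // wait until all work-items loaded their value

                if (lid == 0) {  // naive per-work-group reduction
                    float sum = 0.0f;
                    for (std::size_t i = 0; i < local_size; ++i) {
                        sum += cache[i];
                    }
                    out[it.get_group(0)] = sum;
                }
            });
        });
    }  // the buffers synchronize their content back to the host vectors here
}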
+ * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -53,15 +59,15 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_i_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - data_cache_j_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + cache_one_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + cache_two_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -70,7 +76,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. @@ -82,47 +88,72 @@ class device_kernel_assembly_symm { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = 
blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { + // rename cached arrays + auto &data_i_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_i_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; 
internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -133,16 +164,18 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost_; } } else { @@ -152,25 +185,28 @@ class device_kernel_assembly_symm { } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + 
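// A condensed sketch of the compile-time loop-order specialization introduced above, using plain
// arrays and a stand-in target_platform_sketch enum instead of the plssvm types: on the CPU the
// feature loop is the innermost (fastest moving) index, so each temp entry becomes a contiguous,
// auto-vectorizable dot product; on GPUs the feature loop stays outermost so every loaded feature
// value is reused for the whole INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE register block.
// Both orderings compute the same result.
enum class target_platform_sketch { cpu, gpu };

constexpr unsigned TB = 8;  // stand-in for THREAD_BLOCK_SIZE
constexpr unsigned IB = 4;  // stand-in for INTERNAL_BLOCK_SIZE

template <target_platform_sketch target>
void feature_reduce_tile(const double (&cache_i)[TB][IB], const double (&cache_j)[TB][IB], double (&temp)[IB][IB]) {
    if constexpr (target == target_platform_sketch::cpu) {
        // feature is the fastest moving index
        for (unsigned ii = 0; ii < IB; ++ii) {
            for (unsigned jj = 0; jj < IB; ++jj) {
                double sum = 0.0;
                for (unsigned feature = 0; feature < TB; ++feature) {
                    sum += cache_i[feature][ii] * cache_j[feature][jj];
                }
                temp[ii][jj] += sum;
            }
        }
    } else {
        // feature is the slowest moving index
        for (unsigned feature = 0; feature < TB; ++feature) {
            for (unsigned ii = 0; ii < IB; ++ii) {
                for (unsigned jj = 0; jj < IB; ++jj) {
                    temp[ii][jj] += cache_i[feature][ii] * cache_j[feature][jj];
                }
            }
        }
    }
}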
//*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &B_cache = cache_one_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -181,10 +217,12 @@ class device_kernel_assembly_symm { nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wai until all work-items updated C with their values } @@ -193,34 +231,39 @@ class device_kernel_assembly_symm { // set potential diagonal entries in temp to 0.0 such that we 
don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -231,10 +274,12 @@ class device_kernel_assembly_symm { nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i + 
static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wait until all threads updated C with their values } @@ -244,17 +289,17 @@ class device_kernel_assembly_symm { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor cache_two_; /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp index 631bf80a1..07d1a79dc 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -28,29 +30,34 @@ namespace plssvm::sycl::detail::basic { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
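// A plain host-side reference for what device_kernel_w_linear computes: for the linear kernel the
// per-class weight vector w can be precomputed once as
//     w[feature][class] = sum over all support vectors sv of alpha[class][sv] * support_vectors[feature][sv],
// which turns every later prediction into a single dot product. Padding and the per-device
// support-vector partitioning are omitted for clarity; names and layouts are illustrative.
#include <cstddef>
#include <vector>

std::vector<double> compute_w(const std::vector<double> &alpha,            // num_classes x num_sv, row-major
                              const std::vector<double> &support_vectors,  // num_features x num_sv, row-major
                              const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_features) {
    std::vector<double> w(num_features * num_classes, 0.0);  // num_features x num_classes, row-major
    for (std::size_t feature = 0; feature < num_features; ++feature) {
        for (std::size_t clazz = 0; clazz < num_classes; ++clazz) {
            double sum = 0.0;
            for (std::size_t sv = 0; sv < num_sv; ++sv) {
                sum += alpha[clazz * num_sv + sv] * support_vectors[feature * num_sv + sv];
            }
            w[feature * num_classes + clazz] = sum;
        }
    }
    return w;
}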
- * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -60,77 +67,106 @@ class device_kernel_w_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; ++sv) { - // perform the dot product calculation + // iterate over all support vectors using blocking + for (std::size_t sv_block = 0; sv_block < 
device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + temp[internal_feature][internal_class] += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + } + } + } + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_] * sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } - - // update global array with local one - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; - } - } } private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t 
grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -143,46 +179,70 @@ class device_kernel_predict_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type 
temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the dot product calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); - - temp[internal_pd][internal_class] += w_d_[dim * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_pp][internal_class] += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + } } } } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// @cond Doxygen_suppress - real_type 
*prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -192,21 +252,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -215,19 +279,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
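[Editor's note] The `if constexpr (target == target_platform::cpu)` branches introduced in these kernels only change the order of the blocked loops: on the CPU the blocked `sv`/`feature` index becomes the innermost reduction loop (contiguous accesses within a work-item), whereas on GPUs it stays the outermost loop of the block, presumably so that the compiler can vectorize the CPU path while neighboring work-items keep favorable access patterns on the GPU. A minimal standalone sketch of this loop-reordering pattern is shown below; `target_platform`, `dot_block`, and `BLOCK` are simplified stand-ins chosen for illustration, not PLSSVM symbols. Both branches compute identical results; only the memory-access pattern differs.

```cpp
#include <cstddef>

enum class target_platform { cpu, gpu };  // reduced stand-in for plssvm::target_platform

constexpr unsigned BLOCK = 8;  // assumed block size, analogous to THREAD_BLOCK_SIZE

// Accumulate a BLOCK-wide partial dot product into acc[i][j].
// Only the loop nest order differs between the two branches.
template <target_platform target>
void dot_block(double (&acc)[2][2], const double *a, const double *b) {
    if constexpr (target == target_platform::cpu) {
        // blocked index innermost: contiguous loads, easy for the compiler to vectorize
        for (unsigned i = 0; i < 2; ++i) {
            for (unsigned j = 0; j < 2; ++j) {
                double sum = 0.0;
                for (unsigned k = 0; k < BLOCK; ++k) {
                    sum += a[i * BLOCK + k] * b[j * BLOCK + k];
                }
                acc[i][j] += sum;
            }
        }
    } else {
        // blocked index outermost, as in the GPU branch of the kernels in this patch
        for (unsigned k = 0; k < BLOCK; ++k) {
            for (unsigned i = 0; i < 2; ++i) {
                for (unsigned j = 0; j < 2; ++j) {
                    acc[i][j] += a[i * BLOCK + k] * b[j * BLOCK + k];
                }
            }
        }
    }
}
```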
@@ -235,54 +299,83 @@ class device_kernel_predict { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_support_vectors // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_sv_idx = sv_idx + static_cast(internal_sv); - - temp[internal_pd][internal_sv] += detail::feature_reduce(sv_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], - predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ 
+ PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + } } } } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; ++dim) { + // iterate over all classes using blocking + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { if (sv_idx == 0) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += -rho_d_[dim]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + // calculate the index to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += -rho_[class_block + class_idx]; + } } } - // calculate intermediate results and store them in local memory - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // atomically add the results to the prediction + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += - temp[internal_pd][internal_sv] * alpha_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += + temp[internal_pp][internal_sv] * alpha_[(class_block + class_idx) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + } } } } @@ -290,11 +383,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp index 
dedfe609e..1bb93cc3c 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,29 +30,34 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -59,36 +66,15 @@ class device_kernel_w_linear { * @param[in] group indices representing the 
current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory feature_idx{ group }; - ::sycl::private_memory feature_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -99,23 +85,36 @@ class device_kernel_w_linear { // implicit group barrier // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = 
static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); @@ -123,13 +122,28 @@ class device_kernel_w_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * 
INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } @@ -138,16 +152,30 @@ class device_kernel_w_linear { // implicit group barrier } - // update global array with local one + // update the global w-vector with the locally cached values group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -155,41 +183,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// 
@endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -201,35 +234,15 @@ class device_kernel_predict_linear { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_w[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block 
x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -240,25 +253,38 @@ class device_kernel_predict_linear { // implicit group barrier - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // load data 
into shared memory + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); @@ -266,13 +292,28 @@ class device_kernel_predict_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } @@ -281,16 +322,30 @@ class device_kernel_predict_linear { // implicit group barrier } - // update global array with local one + // update the global array with the local one group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < 
INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -298,10 +353,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -311,21 +366,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
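[Editor's note] The hierarchical kernels in this patch all follow the same SYCL hierarchical-parallelism structure: arrays declared at work-group scope (the `*_cache` buffers) live in local memory and are shared by the whole work-group, and consecutive `group.parallel_for_work_item(...)` calls are separated by implicit group barriers. The following is a minimal, self-contained sketch of that load-then-consume structure, assuming a SYCL compiler that still provides the (deprecated in SYCL 2020) hierarchical API; buffer names and sizes are purely illustrative and not taken from PLSSVM.

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>
#include <vector>

int main() {
    constexpr std::size_t N = 16;  // number of work-groups (illustrative)
    constexpr std::size_t WG = 8;  // work-group size (illustrative)
    std::vector<float> in(N * WG, 1.0f), out(N * WG, 0.0f);

    sycl::queue q;
    {
        sycl::buffer<float, 1> in_buf{ in.data(), sycl::range<1>{ N * WG } };
        sycl::buffer<float, 1> out_buf{ out.data(), sycl::range<1>{ N * WG } };

        q.submit([&](sycl::handler &cgh) {
            sycl::accessor in_acc{ in_buf, cgh, sycl::read_only };
            sycl::accessor out_acc{ out_buf, cgh, sycl::write_only };

            cgh.parallel_for_work_group(sycl::range<1>{ N }, sycl::range<1>{ WG }, [=](sycl::group<1> group) {
                // work-group scope -> allocated in local memory, shared by the work-group
                float cache[WG];

                // first phase: cooperative load into the local cache
                group.parallel_for_work_item([&](sycl::h_item<1> item) {
                    cache[item.get_local_id(0)] = in_acc[item.get_global_id(0)];
                });
                // implicit group barrier between the two parallel_for_work_item calls

                // second phase: consume the cached values
                group.parallel_for_work_item([&](sycl::h_item<1> item) {
                    out_acc[item.get_global_id(0)] = 2.0f * cache[item.get_local_id(0)];
                });
            });
        });
    }
    return 0;
}
```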
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -334,51 +393,34 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
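[Editor's note] The per-work-item `temp` accumulators in these hierarchical kernels are `::sycl::private_memory` objects because a plain variable declared inside a `parallel_for_work_item` lambda would not survive into the next `parallel_for_work_item` scope; `private_memory` keeps one value per work-item alive across the whole work-group function. A minimal runnable sketch of that mechanism follows, again assuming the deprecated-but-available hierarchical API; all names and sizes are illustrative, not PLSSVM code.

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>
#include <vector>

int main() {
    constexpr std::size_t N = 4;   // work-groups (illustrative)
    constexpr std::size_t WG = 8;  // work-group size (illustrative)
    std::vector<int> out(N * WG, 0);

    sycl::queue q;
    {
        sycl::buffer<int, 1> out_buf{ out.data(), sycl::range<1>{ N * WG } };
        q.submit([&](sycl::handler &cgh) {
            sycl::accessor out_acc{ out_buf, cgh, sycl::write_only };
            cgh.parallel_for_work_group(sycl::range<1>{ N }, sycl::range<1>{ WG }, [=](sycl::group<1> group) {
                // one private value per work-item, alive for the whole work-group scope
                sycl::private_memory<int, 1> counter{ group };

                group.parallel_for_work_item([&](sycl::h_item<1> item) {
                    counter(item) = static_cast<int>(item.get_local_id(0));  // initialize per work-item
                });
                // implicit barrier; counter(item) keeps its per-work-item value

                group.parallel_for_work_item([&](sycl::h_item<1> item) {
                    counter(item) += 100;                            // update the private value
                    out_acc[item.get_global_id(0)] = counter(item);  // publish the result
                });
            });
        });
    }
    return 0;
}
```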
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_sv[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory sv_idx_linear{ group }; + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -390,25 +432,42 @@ class device_kernel_predict { // implicit group barrier { + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &pp_cache = cache_one; + // auto &sv_cache = cache_two; + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items 
in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; } }); @@ -416,14 +475,30 @@ class device_kernel_predict { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + 
internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -435,9 +510,9 @@ class device_kernel_predict { // update temp using the respective kernel function group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); @@ -445,33 +520,42 @@ class device_kernel_predict { // implicit group barrier { - // rename cached arrays -> can't rename the arrays due to AdaptiveCpp runtime exception - // auto &alpha_cache = data_cache_pp; - // auto &out_cache = data_cache_sv; + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &alpha_cache = cache_one; + // auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < 
INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); @@ -481,13 +565,14 @@ class device_kernel_predict { // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - data_cache_sv[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + cache_two[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * cache_one[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); @@ -495,19 +580,29 @@ class device_kernel_predict { // implicit group barrier } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const 
auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); @@ -518,11 +613,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp index e6d56ec56..a62418057 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -28,29 +30,34 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
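// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the dispatch pattern behind the
// new `target` template parameter and `invocation_type` constant added to the
// kernel classes in this patch. A non-type template parameter lets
// `if constexpr` pick a target-specific code path at compile time, and a
// constexpr tag lets host code inspect how a kernel expects to be launched.
// The enum values, class, and function names below are simplified stand-ins,
// not the plssvm API.
// ---------------------------------------------------------------------------
#include <iostream>

enum class kernel_invocation_type { work_group, hierarchical, scoped };
enum class target_platform { cpu, gpu };

template <target_platform target>
class demo_kernel {
  public:
    // compile-time tag describing how this kernel wants to be invoked
    constexpr static kernel_invocation_type invocation_type = kernel_invocation_type::scoped;

    void operator()() const {
        if constexpr (target == target_platform::cpu) {
            std::cout << "running the CPU-tuned loop order\n";
        } else {
            std::cout << "running the GPU-tuned loop order\n";
        }
    }
};

template <typename Kernel>
void launch(const Kernel &kernel) {
    // the launcher can branch on the tag without any runtime overhead
    if constexpr (Kernel::invocation_type == kernel_invocation_type::scoped) {
        std::cout << "dispatching a scoped-parallelism kernel\n";
    }
    kernel();
}

int main() {
    launch(demo_kernel<target_platform::cpu>{});
    launch(demo_kernel<target_platform::gpu>{});
    return 0;
}
// ---------------------------------------------------------------------------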
- * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -62,78 +69,101 @@ class device_kernel_w_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_feature, auto &data_cache_alpha, auto &feature_idx, auto &feature_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * 
blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // feature_cache + ::sycl::require_local_mem(), // alpha_cache + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &feature_cache, auto &alpha_cache, auto &temp) { // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * 
(device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } }); } - // update global array with local one + // update the global w-vector with the locally cached values ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current 
work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -142,41 +172,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
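// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the two loop orders used in the
// `if constexpr (target == target_platform::cpu)` branches of the dot product
// above. With the reduction index (`sv`) innermost, each output entry is
// accumulated in a single register before being written back (CPU friendly);
// with it outermost, every cached slice is reused across the whole temp tile
// before the next slice is touched (GPU/local-memory friendly). Both orders
// compute the same values. Constants and array names are simplified stand-ins.
// ---------------------------------------------------------------------------
#include <array>
#include <iostream>

int main() {
    constexpr unsigned TB = 4;  // stand-in for THREAD_BLOCK_SIZE
    constexpr unsigned IB = 2;  // stand-in for INTERNAL_BLOCK_SIZE

    // stand-ins for the work-group local caches filled above
    std::array<std::array<double, IB>, TB> alpha_cache{};
    std::array<std::array<double, IB>, TB> feature_cache{};
    for (unsigned sv = 0; sv < TB; ++sv) {
        for (unsigned i = 0; i < IB; ++i) {
            alpha_cache[sv][i] = 0.5 * sv + i;
            feature_cache[sv][i] = 1.0 + sv - 0.25 * i;
        }
    }

    // reduction index innermost: one register accumulator per output entry
    std::array<std::array<double, IB>, IB> temp_cpu{};
    for (unsigned feature = 0; feature < IB; ++feature) {
        for (unsigned cls = 0; cls < IB; ++cls) {
            double sum{ 0.0 };
            for (unsigned sv = 0; sv < TB; ++sv) {
                sum += alpha_cache[sv][cls] * feature_cache[sv][feature];
            }
            temp_cpu[feature][cls] += sum;
        }
    }

    // reduction index outermost: accumulate directly into the temp tile
    std::array<std::array<double, IB>, IB> temp_gpu{};
    for (unsigned sv = 0; sv < TB; ++sv) {
        for (unsigned feature = 0; feature < IB; ++feature) {
            for (unsigned cls = 0; cls < IB; ++cls) {
                temp_gpu[feature][cls] += alpha_cache[sv][cls] * feature_cache[sv][feature];
            }
        }
    }

    std::cout << std::boolalpha << (temp_cpu == temp_gpu) << '\n';  // true
}
// ---------------------------------------------------------------------------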
- * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -191,79 +226,102 @@ class device_kernel_predict_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_w, auto &pp_idx, auto &pp_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create 
two local memory arrays used for caching + ::sycl::require_local_mem(), // pp_cache + ::sycl::require_local_mem(), // w_cache - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &pp_cache, auto &w_cache, auto &temp) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into shared memory + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes + + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + 
PADDING_SIZE_uz) + global_class_idx]; + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } }); } - // update global array with local one + // update the global array with the local one ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // 
calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -272,10 +330,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -285,21 +343,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
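// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: why the global index arithmetic
// above always adds a padding term, e.g.
// `global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx`.
// Padding each row up to a block multiple lets a blocked kernel read a full
// block past the logical edge without bounds checks, assuming (as implied by
// the kernels) that the padded entries are zero and therefore do not change
// the result. Sizes and names are simplified stand-ins.
// ---------------------------------------------------------------------------
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    constexpr std::size_t rows = 3;
    constexpr std::size_t cols = 5;
    constexpr std::size_t block = 4;                         // blocked access width
    constexpr std::size_t padding = block - (cols % block);  // pad cols up to a block multiple

    // row-major storage with padded rows; padded entries stay 0.0
    std::vector<double> data(rows * (cols + padding), 0.0);
    auto idx = [&](std::size_t row, std::size_t col) { return row * (cols + padding) + col; };
    for (std::size_t r = 0; r < rows; ++r) {
        for (std::size_t c = 0; c < cols; ++c) {
            data[idx(r, c)] = 1.0;
        }
    }

    // blocked sum: every access stays in-bounds thanks to the padding,
    // even though the last block extends past the logical number of columns
    double sum = 0.0;
    for (std::size_t r = 0; r < rows; ++r) {
        for (std::size_t c = 0; c < cols + padding; c += block) {
            for (std::size_t b = 0; b < block; ++b) {
                sum += data[idx(r, c + b)];
            }
        }
    }
    std::cout << sum << '\n';  // 15, the padded zeros contribute nothing
}
// ---------------------------------------------------------------------------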
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -308,19 +370,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
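// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: storing a kernel function's
// extra parameters in a std::tuple at construction (as the constructor above
// does with `std::make_tuple(kernel_function_parameter...)`) and forwarding
// them later with std::apply. The toy kernel functions and parameter values
// below are simplified stand-ins, not the plssvm ones.
// ---------------------------------------------------------------------------
#include <cmath>
#include <iostream>
#include <tuple>

// toy "kernel functions" taking different numbers of extra parameters
double rbf(double dist_sq, double gamma) { return std::exp(-gamma * dist_sq); }
double polynomial(double dot, int degree, double gamma, double coef0) { return std::pow(gamma * dot + coef0, degree); }

template <typename KernelFn, typename... Args>
class predictor {
  public:
    predictor(KernelFn fn, Args... kernel_function_parameter) :
        fn_{ fn },
        kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { }

    // apply the stored parameters after the data-dependent value
    double operator()(const double value) const {
        return std::apply([&](const Args &...params) { return fn_(value, params...); }, kernel_function_parameter_);
    }

  private:
    KernelFn fn_;
    std::tuple<Args...> kernel_function_parameter_;
};

int main() {
    const predictor rbf_predict{ &rbf, 0.5 };                  // gamma
    const predictor poly_predict{ &polynomial, 3, 1.0, 0.0 };  // degree, gamma, coef0
    std::cout << rbf_predict(2.0) << ' ' << poly_predict(2.0) << '\n';  // ~0.367879 and 8
}
// ---------------------------------------------------------------------------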
@@ -330,102 +392,130 @@ class device_kernel_predict { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_sv, auto &pp_idx, auto &pp_idx_linear, auto &sv_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + [&](auto &cache_one, auto &cache_two, auto &temp) { + { + // rename cached arrays + auto &pp_cache = cache_one; + auto &sv_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = 
static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - // store the values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - } - }); + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the feature reduction calculation - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; + } + }); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned 
internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + } } } - } - }); + }); + } } // update temp using the respective kernel function ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); { // rename cached arrays - auto &alpha_cache = data_cache_pp; - auto &out_cache = data_cache_sv; + auto &alpha_cache = cache_one; + auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + 
const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } @@ -435,33 +525,42 @@ class device_kernel_predict { // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = 
static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); } @@ -471,11 +570,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index 6612a10d8..25bec3f13 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -28,32 +30,37 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. 
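// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the accumulation pattern behind
// the single `detail::atomic_op{ prediction_[...] } += ...` kept above.
// Several work-groups own disjoint support-vector blocks but contribute to the
// same prediction entry, so the final add must be atomic. This host-side
// analogue uses std::atomic with a compare-exchange loop; the device code uses
// the plssvm atomic wrapper instead. Names and sizes are simplified stand-ins.
// ---------------------------------------------------------------------------
#include <atomic>
#include <cstddef>
#include <iostream>
#include <thread>
#include <vector>

// atomic `target += value` for floating-point values via compare-exchange
void atomic_add(std::atomic<double> &target, const double value) {
    double expected = target.load();
    while (!target.compare_exchange_weak(expected, expected + value)) {
        // on failure, expected has been refreshed with the current value; retry
    }
}

int main() {
    constexpr std::size_t num_classes = 4;
    constexpr std::size_t num_blocks = 8;  // e.g. work-groups over support-vector blocks

    std::vector<std::atomic<double>> prediction(num_classes);
    for (std::atomic<double> &p : prediction) { p.store(0.0); }  // explicit zero init

    // every "work-group" contributes a partial result to every class
    std::vector<std::thread> workers;
    for (std::size_t block = 0; block < num_blocks; ++block) {
        workers.emplace_back([&, block] {
            for (std::size_t c = 0; c < num_classes; ++c) {
                atomic_add(prediction[c], static_cast<double>(block + c));
            }
        });
    }
    for (std::thread &t : workers) { t.join(); }

    for (std::size_t c = 0; c < num_classes; ++c) {
        std::cout << prediction[c].load() << ' ';  // 28 36 44 52
    }
    std::cout << '\n';
}
// ---------------------------------------------------------------------------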
+ constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(::sycl::handler &cgh, real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_feature_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_alpha_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(::sycl::handler &cgh, real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + feature_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + alpha_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -67,104 +74,130 @@ class device_kernel_w_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = 
nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - data_cache_feature_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal 
= 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the local memory + feature_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + 
static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_feature_; + ::sycl::local_accessor feature_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_alpha_; + ::sycl::local_accessor alpha_cache_; /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_w_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t 
num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + pp_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -181,71 +214,91 @@ class device_kernel_predict_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + 
static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // store the values in the local memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache_[feature][local_id_0 * 
INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_pp_; + ::sycl::local_accessor pp_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_w_; + ::sycl::local_accessor w_cache_; /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -255,22 +308,26 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. 
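     * @details The variadic @p kernel_function_parameter values are cheap scalar kernel parameters (e.g., gamma,
     *          degree, and coef0 for the polynomial kernel) and are therefore captured by value in a `std::tuple`;
     *          they are later passed to `detail::apply_kernel_function` inside the function call operator.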
* @param[in] cgh the SYCL handler used to allocate the local memory - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -279,21 +336,21 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(::sycl::handler &cgh, real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_sv_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(::sycl::handler &cgh, real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + cache_one_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + cache_two_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
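     * @details The work of one work-item is split into two phases: the feature dimension is first traversed in
     *          blocks of THREAD_BLOCK_SIZE, caching predict points and support vectors in local memory and
     *          accumulating the pairwise feature reductions; the kernel function is then applied to these sums, and
     *          the class dimension is traversed in blocks, multiplying with the cached weights and atomically adding
     *          the results to the global prediction values. The bias (rho) is subtracted exactly once, by the
     *          work-groups with `blockIdx_x == 0`. On the CPU target, the reduction loop is reordered so that the
     *          feature index is the innermost (fastest moving) loop.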
@@ -305,44 +362,63 @@ class device_kernel_predict { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { + // rename cached arrays + auto &pp_cache = cache_one_; + auto &sv_cache = cache_two_; + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // 
calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -351,28 +427,34 @@ class device_kernel_predict { } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } { // rename cached arrays - auto &alpha_cache = data_cache_pp_; - auto &out_cache = data_cache_sv_; + auto &alpha_cache = cache_one_; + auto &out_cache = cache_two_; - // iterate over all features using blocking 
to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } @@ -381,20 +463,21 @@ class device_kernel_predict { // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * 
THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated their part of the prediction } @@ -403,16 +486,16 @@ class device_kernel_predict { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_pp_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_sv_; + ::sycl::local_accessor cache_two_; /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp index 6f0772db0..c03aa46b0 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp @@ -67,6 +67,144 @@ #include // std::get #include // std::vector +namespace { + +/** + * @brief Run the kernel functor on the given device. + * @tparam KernelFunctor the type of the kernel functor to run + * @tparam QueueType the type of the SYCL queue to run the kernel on + * @tparam Args the types of the parameters necessary for the specific kernel functor + * @param[in] device the SYCL queue to run the kernel on + * @param[in] partial_grid the number of work-groups in each dimension of the execution grid + * @param[in] block the number of work-items in each dimension per work-group + * @param[in] args the parameters necessary for the specific kernel functor + */ +template +void run_kernel_functor(const QueueType &device, const plssvm::detail::dim_type partial_grid, const plssvm::detail::dim_type block, Args &&...args) { + constexpr plssvm::sycl::kernel_invocation_type invocation = KernelFunctor::invocation_type; + + if constexpr (invocation == plssvm::sycl::kernel_invocation_type::basic) { + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + cgh.parallel_for(plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block), + KernelFunctor{ std::forward(args)... }); + }); + } else if constexpr (invocation == plssvm::sycl::kernel_invocation_type::work_group) { + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + cgh.parallel_for(plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block), + KernelFunctor{ cgh, std::forward(args)... }); + }); + } else if constexpr (invocation == plssvm::sycl::kernel_invocation_type::hierarchical) { +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + const auto exec_range = plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), KernelFunctor{ std::forward(args)... }); + }); +#else + throw plssvm::adaptivecpp::backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
};
+#endif
+    } else if constexpr (invocation == plssvm::sycl::kernel_invocation_type::scoped) {
+#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED)
+        device.impl->sycl_queue.submit([&](::sycl::handler &cgh) {
+            const auto exec_range = plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block);
+            cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), KernelFunctor{ std::forward<Args>(args)... });
+        });
+#else
+        throw plssvm::adaptivecpp::backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" };
+#endif
+    } else {
+        static_assert(::plssvm::detail::always_false_v<KernelFunctor>, "Unsupported kernel invocation type!");
+    }
+}
+
+/**
+ * @brief Dispatch the kernel functor to the correct kernel function type.
+ * @tparam KernelFunctor the type of the kernel functor to run
+ * @tparam target the target platform to run the kernel on
+ * @tparam Args the types of the parameters necessary for the specific kernel functor; stored in a `std::tuple`
+ * @param[in] params the parameters used to determine the kernel function type
+ * @param[in] args the parameters necessary for the specific kernel functor
+ */
+template
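/*
 * A minimal usage sketch for run_kernel_functor() (illustrative only; the chosen functor, the `devices_` member,
 * the device_ptr `.get()` accessors, and the concrete argument names are assumptions, not taken from this patch):
 *
 *     using functor_type = device_kernel_w_linear<plssvm::target_platform::cpu>;
 *     run_kernel_functor<functor_type>(devices_[device_id], partial_grid, block,
 *                                      w_d.get(), alpha_d.get(), sv_d.get(),
 *                                      num_classes, num_sv, device_specific_num_sv, sv_offset,
 *                                      grid_x_offset, grid_y_offset);
 *
 * Because device_kernel_w_linear declares invocation_type = kernel_invocation_type::work_group, the work_group
 * branch above is taken and the functor is constructed with the ::sycl::handler followed by the forwarded arguments.
 */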