// Copyright (C) 2020 NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

#include "cuda_kernels.h"

#include <stdexcept>
#include <cuda_fp16.h>

namespace bluefog {
namespace common {

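// Generic elementwise scaling kernel. Each thread starts at its global index
// and then strides by the total number of threads in the grid (a grid-stride
// loop), so every element is covered even when the grid is smaller than
// num_elements.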
template<typename T, typename TS>
__global__ void scale_buffer_k(T* buffer, int64_t num_elements, const TS scale_factor) {

  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;

  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] *= scale_factor;
  }
}

// Specialization for half2: scales __half buffers two elements at a time by
// reinterpreting the buffer as __half2 (the caller guarantees 4-byte
// alignment). On older architectures the kernel falls back to float
// arithmetic.
__global__ void scale_buffer_half2_k(__half* buffer, int64_t num_elements, const __half scale_factor) {

  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;

#if __CUDA_ARCH__ > 530
  __half2* buffer_h2 = reinterpret_cast<__half2 *>(buffer);
  const __half2 scale_factor_h2 = __halves2half2(scale_factor, scale_factor);

  for (size_t i = idx; i < num_elements / 2; i += gridDim.x * blockDim.x) {
    buffer_h2[i] = __hmul2(scale_factor_h2, buffer_h2[i]);
  }

  // Deal with last element if num_elements is odd
  if (idx == 0 && num_elements % 2) {
    buffer[num_elements - 1] = __hmul(scale_factor, buffer[num_elements - 1]);
  }
#else
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i]));
  }
#endif
}

// Specialization for __half: uses native __half arithmetic where available
// and falls back to float arithmetic on architectures without __half compute.
template<>
__global__ void scale_buffer_k(__half* buffer, int64_t num_elements, const __half scale_factor) {

  const size_t idx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;

#if __CUDA_ARCH__ > 530
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] *= scale_factor;
  }
#else
  for (size_t i = idx; i < num_elements; i += gridDim.x * blockDim.x) {
    buffer[i] = __float2half(__half2float(scale_factor) * __half2float(buffer[i]));
  }
#endif
}

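// Host-side dispatcher: launches at least one thread per element, rounded up
// to blocks of NTHREADS_SCALE_BUFFER_KERNEL threads. The grid-stride loops
// above keep the kernels correct even if fewer blocks are launched.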
#define NTHREADS_SCALE_BUFFER_KERNEL 512
void ScaleBufferCudaImpl(double scale_factor, void* buffer_data, const int64_t num_elements,
                         DataType dtype, cudaStream_t stream) {
  const int64_t blocks = (num_elements + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
  const int threads = NTHREADS_SCALE_BUFFER_KERNEL;
  switch (dtype) {
    case DataType::BLUEFOG_UINT8:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((uint8_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_INT8:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((int8_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_INT32:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((int32_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_INT64:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((int64_t*) buffer_data, num_elements, scale_factor);
      break;
    case DataType::BLUEFOG_FLOAT16:
    {
      __half scale_factor_half = __float2half((float) scale_factor);
      if ((size_t) buffer_data % 4 == 0) {
        // If alignment allows, use half2 specialized kernel
        int64_t num_elements_h2 = (num_elements + 1) / 2;
        int64_t blocks_h2 = (num_elements_h2 + NTHREADS_SCALE_BUFFER_KERNEL - 1) / NTHREADS_SCALE_BUFFER_KERNEL;
        scale_buffer_half2_k<<<blocks_h2, threads, 0, stream>>>((__half*) buffer_data, num_elements, scale_factor_half);
      } else {
        scale_buffer_k<<<blocks, threads, 0, stream>>>((__half*) buffer_data, num_elements, scale_factor_half);
      }
      break;
    }
    case DataType::BLUEFOG_FLOAT32:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((float*) buffer_data, num_elements, (float) scale_factor);
      break;
    case DataType::BLUEFOG_FLOAT64:
      scale_buffer_k<<<blocks, threads, 0, stream>>>((double*) buffer_data, num_elements, scale_factor);
      break;
    default:
      throw std::logic_error("Type " + DataType_Name(dtype) +
                             " not supported by ScaleBufferCudaImpl.");
  }
}
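// Illustrative usage sketch (not part of the library): scale a float32 device
// buffer in place on a caller-owned stream, where d_buffer, num_elements, and
// stream are assumed to be set up by the caller. The launch is asynchronous,
// so synchronize the stream before reading the result:
//
//   ScaleBufferCudaImpl(0.5, d_buffer, num_elements, DataType::BLUEFOG_FLOAT32, stream);
//   cudaStreamSynchronize(stream);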

} // namespace common
} // namespace bluefog