Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
264 changes: 6 additions & 258 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include <sycl/nd_range.hpp>
#include <sycl/property_list.hpp>
#include <sycl/range.hpp>
#include <sycl/range_rounding.hpp>
#include <sycl/sampler.hpp>

#include <assert.h>
Expand Down Expand Up @@ -262,106 +263,6 @@ __SYCL_EXPORT void *getValueFromDynamicParameter(
ext::oneapi::experimental::detail::dynamic_parameter_base
&DynamicParamBase);

template <int Dims> class RoundedRangeIDGenerator {
id<Dims> Id;
id<Dims> InitId;
range<Dims> UserRange;
range<Dims> RoundedRange;
bool Done = false;

public:
RoundedRangeIDGenerator(const id<Dims> &Id, const range<Dims> &UserRange,
const range<Dims> &RoundedRange)
: Id(Id), InitId(Id), UserRange(UserRange), RoundedRange(RoundedRange) {
for (int i = 0; i < Dims; ++i)
if (Id[i] >= UserRange[i])
Done = true;
}

explicit operator bool() { return !Done; }

void updateId() {
for (int i = 0; i < Dims; ++i) {
Id[i] += RoundedRange[i];
if (Id[i] < UserRange[i])
return;
Id[i] = InitId[i];
}
Done = true;
}

id<Dims> getId() { return Id; }

template <typename KernelType> auto getItem() {
if constexpr (std::is_invocable_v<KernelType, item<Dims> &> ||
std::is_invocable_v<KernelType, item<Dims> &, kernel_handler>)
return detail::Builder::createItem<Dims, true>(UserRange, getId(), {});
else {
static_assert(std::is_invocable_v<KernelType, item<Dims, false> &> ||
std::is_invocable_v<KernelType, item<Dims, false> &,
kernel_handler>,
"Kernel must be invocable with an item!");
return detail::Builder::createItem<Dims, false>(UserRange, getId());
}
}
};

// TODO: The wrappers can be optimized further so that the body
// essentially looks like this:
// for (auto z = it[2]; z < UserRange[2]; z += it.get_range(2))
// for (auto y = it[1]; y < UserRange[1]; y += it.get_range(1))
// for (auto x = it[0]; x < UserRange[0]; x += it.get_range(0))
// KernelFunc({x,y,z});
template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernel {
public:
range<Dims> UserRange;
KernelType KernelFunc;
void operator()(item<Dims> It) const {
auto RoundedRange = It.get_range();
for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
Gen.updateId()) {
auto item = Gen.template getItem<KernelType>();
KernelFunc(item);
}
}

// Copy the properties_tag getter from the original kernel to propagate
// property(s)
template <
typename T = KernelType,
typename = std::enable_if_t<ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<T>::value>>
auto get(ext::oneapi::experimental::properties_tag) const {
return KernelFunc.get(ext::oneapi::experimental::properties_tag{});
}
};

template <typename TransformedArgType, int Dims, typename KernelType>
class RoundedRangeKernelWithKH {
public:
range<Dims> UserRange;
KernelType KernelFunc;
void operator()(item<Dims> It, kernel_handler KH) const {
auto RoundedRange = It.get_range();
for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
Gen.updateId()) {
auto item = Gen.template getItem<KernelType>();
KernelFunc(item, KH);
}
}

// Copy the properties_tag getter from the original kernel to propagate
// property(s)
template <
typename T = KernelType,
typename = std::enable_if_t<ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<T>::value>>
auto get(ext::oneapi::experimental::properties_tag) const {
return KernelFunc.get(ext::oneapi::experimental::properties_tag{});
}
};

using std::enable_if_t;
using sycl::detail::queue_impl;

Expand Down Expand Up @@ -881,142 +782,15 @@ class __SYCL_EXPORT handler {

bool eventNeeded() const;

device get_device() const;

template <int Dims, typename LambdaArgType> struct TransformUserItemType {
using type = std::conditional_t<
std::is_convertible_v<nd_item<Dims>, LambdaArgType>, nd_item<Dims>,
std::conditional_t<std::is_convertible_v<item<Dims>, LambdaArgType>,
item<Dims>, LambdaArgType>>;
};

std::optional<std::array<size_t, 3>> getMaxWorkGroups();
// We need to use this version to support gcc 7.5.0. Remove when minimal
// supported gcc version is bumped.
std::tuple<std::array<size_t, 3>, bool> getMaxWorkGroups_v2();

template <int Dims>
std::tuple<range<Dims>, bool> getRoundedRange(range<Dims> UserRange) {
range<Dims> RoundedRange = UserRange;
// Disable the rounding-up optimizations under these conditions:
// 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
// 2. The kernel is provided via an interoperability method (this uses a
// different code path).
// 3. The range is already a multiple of the rounding factor.
//
// Cases 2 and 3 could be supported with extra effort.
// As an optimization for the common case it is an
// implementation choice to not support those scenarios.
// Note that "this_item" is a free function, i.e. not tied to any
// specific id or item. When concurrent parallel_fors are executing
// on a device it is difficult to tell which parallel_for the call is
// being made from. One could replicate portions of the
// call-graph to make this_item calls kernel-specific but this is
// not considered worthwhile.

// Perform range rounding if rounding-up is enabled.
if (this->DisableRangeRounding())
return {range<Dims>{}, false};

// Range should be a multiple of this for reasonable performance.
size_t MinFactorX = 16;
// Range should be a multiple of this for improved performance.
size_t GoodFactor = 32;
// Range should be at least this to make rounding worthwhile.
size_t MinRangeX = 1024;

// Check if rounding parameters have been set through environment:
// SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
this->GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX);

// In SYCL, each dimension of a global range size is specified by
// a size_t, which can be up to 64 bits. All backends should be
// able to accept a kernel launch with a 32-bit global range size
// (i.e. do not throw an error). The OpenCL CPU backend will
// accept every 64-bit global range, but the GPU backends will not
// generally accept every 64-bit global range. So, when we get a
// non-32-bit global range, we wrap the old kernel in a new kernel
// that has each work item peform multiple invocations the old
// kernel in a 32-bit global range.
id<Dims> MaxNWGs = [&] {
auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2();
if (!HasMaxWGs) {
id<Dims> Default;
for (int i = 0; i < Dims; ++i)
Default[i] = (std::numeric_limits<int32_t>::max)();
return Default;
}

id<Dims> IdResult;
size_t Limit = (std::numeric_limits<int>::max)();
for (int i = 0; i < Dims; ++i)
IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]);
return IdResult;
}();
auto M = (std::numeric_limits<uint32_t>::max)();
range<Dims> MaxRange;
for (int i = 0; i < Dims; ++i) {
auto DesiredSize = MaxNWGs[i] * GoodFactor;
MaxRange[i] =
DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor;
}

bool DidAdjust = false;
auto Adjust = [&](int Dim, size_t Value) {
if (this->RangeRoundingTrace())
std::cout << "parallel_for range adjusted at dim " << Dim << " from "
<< RoundedRange[Dim] << " to " << Value << std::endl;
RoundedRange[Dim] = Value;
DidAdjust = true;
};

#ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
size_t GoodExpFactor = 1;
switch (Dims) {
case 1:
GoodExpFactor = 32; // Make global range multiple of {32}
break;
case 2:
GoodExpFactor = 16; // Make global range multiple of {16, 16}
break;
case 3:
GoodExpFactor = 8; // Make global range multiple of {8, 8, 8}
break;
}

// Check if rounding parameters have been set through environment:
// SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
this->GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX);

for (auto i = 0; i < Dims; ++i)
if (UserRange[i] % GoodExpFactor) {
Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor);
}
#else
// Perform range rounding if there are sufficient work-items to
// need rounding and the user-specified range is not a multiple of
// a "good" value.
if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) {
// It is sufficient to round up just the first dimension.
// Multiplying the rounded-up value of the first dimension
// by the values of the remaining dimensions (if any)
// will yield a rounded-up value for the total range.
Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
}
#endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
// If we are forcing range rounding kernels to be used, we always want the
// rounded range kernel to be generated, even if rounding isn't needed
DidAdjust = true;
#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__

for (int i = 0; i < Dims; ++i)
if (RoundedRange[i] > MaxRange[i])
Adjust(i, MaxRange[i]);

if (!DidAdjust)
return {range<Dims>{}, false};
return {RoundedRange, true};
}

/// Defines and invokes a SYCL kernel function for the specified range.
///
/// The SYCL kernel function is defined as a lambda function or a named
Expand Down Expand Up @@ -1078,11 +852,12 @@ class __SYCL_EXPORT handler {
// Range rounding is supported only for newer SYCL standards.
#if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \
SYCL_LANGUAGE_VERSION >= 202012L
auto [RoundedRange, HasRoundedRange] = getRoundedRange(UserRange);
auto [RoundedRange, HasRoundedRange] =
detail::getRoundedRange(UserRange, get_device());
if (HasRoundedRange) {
using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
auto Wrapper =
getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
detail::getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
KernelFunc, UserRange);

using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
Expand Down Expand Up @@ -2944,33 +2719,6 @@ class __SYCL_EXPORT handler {
friend class ext::oneapi::experimental::detail::dynamic_parameter_impl;
friend class ext::oneapi::experimental::detail::dynamic_command_group_impl;

bool DisableRangeRounding();

bool RangeRoundingTrace();

void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
size_t &MinRange);

template <typename WrapperT, typename TransformedArgType, int Dims,
typename KernelType,
std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
KernelType, TransformedArgType>::value> * = nullptr>
auto getRangeRoundedKernelLambda(KernelType KernelFunc,
range<Dims> UserRange) {
return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
KernelType>{UserRange, KernelFunc};
}

template <typename WrapperT, typename TransformedArgType, int Dims,
typename KernelType,
std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
KernelType, TransformedArgType>::value> * = nullptr>
auto getRangeRoundedKernelLambda(KernelType KernelFunc,
range<Dims> UserRange) {
return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>{
UserRange, KernelFunc};
}

detail::context_impl &getContextImpl() const;

// Checks if 2D memory operations are supported by the underlying platform.
Expand Down
Loading