intel · sarnex · Dec 16, 2025 · Dec 15, 2025 · Dec 15, 2025 · Dec 15, 2025
@@ -46,6 +46,7 @@
 #include <sycl/nd_range.hpp>
 #include <sycl/property_list.hpp>
 #include <sycl/range.hpp>
+#include <sycl/range_rounding.hpp>
 #include <sycl/sampler.hpp>
 
 #include <assert.h>
@@ -262,106 +263,6 @@ __SYCL_EXPORT void *getValueFromDynamicParameter(
     ext::oneapi::experimental::detail::dynamic_parameter_base
         &DynamicParamBase);
 
-template <int Dims> class RoundedRangeIDGenerator {
-  id<Dims> Id;
-  id<Dims> InitId;
-  range<Dims> UserRange;
-  range<Dims> RoundedRange;
-  bool Done = false;
-
-public:
-  RoundedRangeIDGenerator(const id<Dims> &Id, const range<Dims> &UserRange,
-                          const range<Dims> &RoundedRange)
-      : Id(Id), InitId(Id), UserRange(UserRange), RoundedRange(RoundedRange) {
-    for (int i = 0; i < Dims; ++i)
-      if (Id[i] >= UserRange[i])
-        Done = true;
-  }
-
-  explicit operator bool() { return !Done; }
-
-  void updateId() {
-    for (int i = 0; i < Dims; ++i) {
-      Id[i] += RoundedRange[i];
-      if (Id[i] < UserRange[i])
-        return;
-      Id[i] = InitId[i];
-    }
-    Done = true;
-  }
-
-  id<Dims> getId() { return Id; }
-
-  template <typename KernelType> auto getItem() {
-    if constexpr (std::is_invocable_v<KernelType, item<Dims> &> ||
-                  std::is_invocable_v<KernelType, item<Dims> &, kernel_handler>)
-      return detail::Builder::createItem<Dims, true>(UserRange, getId(), {});
-    else {
-      static_assert(std::is_invocable_v<KernelType, item<Dims, false> &> ||
-                        std::is_invocable_v<KernelType, item<Dims, false> &,
-                                            kernel_handler>,
-                    "Kernel must be invocable with an item!");
-      return detail::Builder::createItem<Dims, false>(UserRange, getId());
-    }
-  }
-};
-
-// TODO: The wrappers can be optimized further so that the body
-// essentially looks like this:
-//   for (auto z = it[2]; z < UserRange[2]; z += it.get_range(2))
-//     for (auto y = it[1]; y < UserRange[1]; y += it.get_range(1))
-//       for (auto x = it[0]; x < UserRange[0]; x += it.get_range(0))
-//         KernelFunc({x,y,z});
-template <typename TransformedArgType, int Dims, typename KernelType>
-class RoundedRangeKernel {
-public:
-  range<Dims> UserRange;
-  KernelType KernelFunc;
-  void operator()(item<Dims> It) const {
-    auto RoundedRange = It.get_range();
-    for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
-         Gen.updateId()) {
-      auto item = Gen.template getItem<KernelType>();
-      KernelFunc(item);
-    }
-  }
-
-  // Copy the properties_tag getter from the original kernel to propagate
-  // property(s)
-  template <
-      typename T = KernelType,
-      typename = std::enable_if_t<ext::oneapi::experimental::detail::
-                                      HasKernelPropertiesGetMethod<T>::value>>
-  auto get(ext::oneapi::experimental::properties_tag) const {
-    return KernelFunc.get(ext::oneapi::experimental::properties_tag{});
-  }
-};
-
-template <typename TransformedArgType, int Dims, typename KernelType>
-class RoundedRangeKernelWithKH {
-public:
-  range<Dims> UserRange;
-  KernelType KernelFunc;
-  void operator()(item<Dims> It, kernel_handler KH) const {
-    auto RoundedRange = It.get_range();
-    for (RoundedRangeIDGenerator Gen(It.get_id(), UserRange, RoundedRange); Gen;
-         Gen.updateId()) {
-      auto item = Gen.template getItem<KernelType>();
-      KernelFunc(item, KH);
-    }
-  }
-
-  // Copy the properties_tag getter from the original kernel to propagate
-  // property(s)
-  template <
-      typename T = KernelType,
-      typename = std::enable_if_t<ext::oneapi::experimental::detail::
-                                      HasKernelPropertiesGetMethod<T>::value>>
-  auto get(ext::oneapi::experimental::properties_tag) const {
-    return KernelFunc.get(ext::oneapi::experimental::properties_tag{});
-  }
-};
-
 using std::enable_if_t;
 using sycl::detail::queue_impl;
 
@@ -881,142 +782,15 @@ class __SYCL_EXPORT handler {
 
   bool eventNeeded() const;
 
+  device get_device() const;
+
   template <int Dims, typename LambdaArgType> struct TransformUserItemType {
     using type = std::conditional_t<
         std::is_convertible_v<nd_item<Dims>, LambdaArgType>, nd_item<Dims>,
         std::conditional_t<std::is_convertible_v<item<Dims>, LambdaArgType>,
                            item<Dims>, LambdaArgType>>;
   };
 
-  std::optional<std::array<size_t, 3>> getMaxWorkGroups();
-  // We need to use this version to support gcc 7.5.0. Remove when minimal
-  // supported gcc version is bumped.
-  std::tuple<std::array<size_t, 3>, bool> getMaxWorkGroups_v2();
-
-  template <int Dims>
-  std::tuple<range<Dims>, bool> getRoundedRange(range<Dims> UserRange) {
-    range<Dims> RoundedRange = UserRange;
-    // Disable the rounding-up optimizations under these conditions:
-    // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
-    // 2. The kernel is provided via an interoperability method (this uses a
-    // different code path).
-    // 3. The range is already a multiple of the rounding factor.
-    //
-    // Cases 2 and 3 could be supported with extra effort.
-    // As an optimization for the common case it is an
-    // implementation choice to not support those scenarios.
-    // Note that "this_item" is a free function, i.e. not tied to any
-    // specific id or item. When concurrent parallel_fors are executing
-    // on a device it is difficult to tell which parallel_for the call is
-    // being made from. One could replicate portions of the
-    // call-graph to make this_item calls kernel-specific but this is
-    // not considered worthwhile.
-
-    // Perform range rounding if rounding-up is enabled.
-    if (this->DisableRangeRounding())
-      return {range<Dims>{}, false};
-
-    // Range should be a multiple of this for reasonable performance.
-    size_t MinFactorX = 16;
-    // Range should be a multiple of this for improved performance.
-    size_t GoodFactor = 32;
-    // Range should be at least this to make rounding worthwhile.
-    size_t MinRangeX = 1024;
-
-    // Check if rounding parameters have been set through environment:
-    // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
-    this->GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX);
-
-    // In SYCL, each dimension of a global range size is specified by
-    // a size_t, which can be up to 64 bits.  All backends should be
-    // able to accept a kernel launch with a 32-bit global range size
-    // (i.e. do not throw an error).  The OpenCL CPU backend will
-    // accept every 64-bit global range, but the GPU backends will not
-    // generally accept every 64-bit global range.  So, when we get a
-    // non-32-bit global range, we wrap the old kernel in a new kernel
-    // that has each work item peform multiple invocations the old
-    // kernel in a 32-bit global range.
-    id<Dims> MaxNWGs = [&] {
-      auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2();
-      if (!HasMaxWGs) {
-        id<Dims> Default;
-        for (int i = 0; i < Dims; ++i)
-          Default[i] = (std::numeric_limits<int32_t>::max)();
-        return Default;
-      }
-
-      id<Dims> IdResult;
-      size_t Limit = (std::numeric_limits<int>::max)();
-      for (int i = 0; i < Dims; ++i)
-        IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]);
-      return IdResult;
-    }();
-    auto M = (std::numeric_limits<uint32_t>::max)();
-    range<Dims> MaxRange;
-    for (int i = 0; i < Dims; ++i) {
-      auto DesiredSize = MaxNWGs[i] * GoodFactor;
-      MaxRange[i] =
-          DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor;
-    }
-
-    bool DidAdjust = false;
-    auto Adjust = [&](int Dim, size_t Value) {
-      if (this->RangeRoundingTrace())
-        std::cout << "parallel_for range adjusted at dim " << Dim << " from "
-                  << RoundedRange[Dim] << " to " << Value << std::endl;
-      RoundedRange[Dim] = Value;
-      DidAdjust = true;
-    };
-
-#ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
-    size_t GoodExpFactor = 1;
-    switch (Dims) {
-    case 1:
-      GoodExpFactor = 32; // Make global range multiple of {32}
-      break;
-    case 2:
-      GoodExpFactor = 16; // Make global range multiple of {16, 16}
-      break;
-    case 3:
-      GoodExpFactor = 8; // Make global range multiple of {8, 8, 8}
-      break;
-    }
-
-    // Check if rounding parameters have been set through environment:
-    // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
-    this->GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX);
-
-    for (auto i = 0; i < Dims; ++i)
-      if (UserRange[i] % GoodExpFactor) {
-        Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor);
-      }
-#else
-    // Perform range rounding if there are sufficient work-items to
-    // need rounding and the user-specified range is not a multiple of
-    // a "good" value.
-    if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) {
-      // It is sufficient to round up just the first dimension.
-      // Multiplying the rounded-up value of the first dimension
-      // by the values of the remaining dimensions (if any)
-      // will yield a rounded-up value for the total range.
-      Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor);
-    }
-#endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__
-#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
-    // If we are forcing range rounding kernels to be used, we always want the
-    // rounded range kernel to be generated, even if rounding isn't needed
-    DidAdjust = true;
-#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__
-
-    for (int i = 0; i < Dims; ++i)
-      if (RoundedRange[i] > MaxRange[i])
-        Adjust(i, MaxRange[i]);
-
-    if (!DidAdjust)
-      return {range<Dims>{}, false};
-    return {RoundedRange, true};
-  }
-
   /// Defines and invokes a SYCL kernel function for the specified range.
   ///
   /// The SYCL kernel function is defined as a lambda function or a named
@@ -1078,11 +852,12 @@ class __SYCL_EXPORT handler {
     // Range rounding is supported only for newer SYCL standards.
 #if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) &&                  \
     SYCL_LANGUAGE_VERSION >= 202012L
-    auto [RoundedRange, HasRoundedRange] = getRoundedRange(UserRange);
+    auto [RoundedRange, HasRoundedRange] =
+        detail::getRoundedRange(UserRange, get_device());
     if (HasRoundedRange) {
       using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
       auto Wrapper =
-          getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
+          detail::getRangeRoundedKernelLambda<NameWT, TransformedArgType, Dims>(
               KernelFunc, UserRange);
 
       using KName = std::conditional_t<std::is_same<KernelType, NameT>::value,
@@ -2944,33 +2719,6 @@ class __SYCL_EXPORT handler {
   friend class ext::oneapi::experimental::detail::dynamic_parameter_impl;
   friend class ext::oneapi::experimental::detail::dynamic_command_group_impl;
 
-  bool DisableRangeRounding();
-
-  bool RangeRoundingTrace();
-
-  void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
-                                size_t &MinRange);
-
-  template <typename WrapperT, typename TransformedArgType, int Dims,
-            typename KernelType,
-            std::enable_if_t<detail::KernelLambdaHasKernelHandlerArgT<
-                KernelType, TransformedArgType>::value> * = nullptr>
-  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
-                                   range<Dims> UserRange) {
-    return detail::RoundedRangeKernelWithKH<TransformedArgType, Dims,
-                                            KernelType>{UserRange, KernelFunc};
-  }
-
-  template <typename WrapperT, typename TransformedArgType, int Dims,
-            typename KernelType,
-            std::enable_if_t<!detail::KernelLambdaHasKernelHandlerArgT<
-                KernelType, TransformedArgType>::value> * = nullptr>
-  auto getRangeRoundedKernelLambda(KernelType KernelFunc,
-                                   range<Dims> UserRange) {
-    return detail::RoundedRangeKernel<TransformedArgType, Dims, KernelType>{
-        UserRange, KernelFunc};
-  }
-
   detail::context_impl &getContextImpl() const;
 
   // Checks if 2D memory operations are supported by the underlying platform.