
Commit 0bc7341

dmitriy-sobolev, danhoeflinger, and SergeyKopienko authored

Fix synchronization issues in radix sort and histogram (#2054)

Co-authored-by: Dan Hoeflinger <[email protected]>
Co-authored-by: Sergey Kopienko <[email protected]>
1 parent ad4c54e commit 0bc7341

File tree

3 files changed: +61 −21 lines changed

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h

+6 −4

@@ -130,7 +130,8 @@ class __histo_kernel_private_glocal_atomics;
 template <typename _HistAccessor, typename _OffsetT, typename _Size>
 void
 __clear_wglocal_histograms(const _HistAccessor& __local_histogram, const _OffsetT& __offset, _Size __num_bins,
-                           const sycl::nd_item<1>& __self_item)
+                           const sycl::nd_item<1>& __self_item,
+                           __dpl_sycl::__fence_space_t __fence_space = __dpl_sycl::__fence_space_local)
 {
     using _BinUint_t =
         ::std::conditional_t<(sizeof(_Size) >= sizeof(::std::uint32_t)), ::std::uint64_t, ::std::uint32_t>;
@@ -148,7 +149,7 @@ __clear_wglocal_histograms(const _HistAccessor& __local_histogram, const _OffsetT& __offset, _Size __num_bins,
     {
         __local_histogram[__offset + __gSize * __k + __self_lidx] = 0;
     }
-    __dpl_sycl::__group_barrier(__self_item);
+    __dpl_sycl::__group_barrier(__self_item, __fence_space);
 }
 
 template <typename _BinIdxType, typename _ValueType, typename _HistReg, typename _BinFunc>
@@ -444,7 +445,8 @@ struct __histogram_general_private_global_atomics_submitter<__internal::__optional_kernel_name<_KernelName...>>
                 const ::std::size_t __wgroup_idx = __self_item.get_group(0);
                 const ::std::size_t __seg_start = __work_group_size * __iters_per_work_item * __wgroup_idx;
 
-                __clear_wglocal_histograms(__hacc_private, __wgroup_idx * __num_bins, __num_bins, __self_item);
+                __clear_wglocal_histograms(__hacc_private, __wgroup_idx * __num_bins, __num_bins, __self_item,
+                                           __dpl_sycl::__fence_space_global);
 
                 if (__seg_start + __work_group_size * __iters_per_work_item < __n)
                 {
@@ -469,7 +471,7 @@ struct __histogram_general_private_global_atomics_submitter<__internal::__optional_kernel_name<_KernelName...>>
                     }
                 }
 
-                __dpl_sycl::__group_barrier(__self_item);
+                __dpl_sycl::__group_barrier(__self_item, __dpl_sycl::__fence_space_global);
 
                 __reduce_out_histograms<_bin_type, ::std::uint32_t>(__hacc_private, __wgroup_idx * __num_bins,
                                                                     __bins, __num_bins, __self_item);
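
Note on the histogram change: the private-global-atomics path clears per-work-group histogram bins that live in global memory, so the barrier after the clear must also fence global accesses; with the SYCL 1.2.1 nd_item::barrier API, a local_space fence only orders work-group local memory. The standalone sketch below illustrates that distinction outside oneDPL (the kernel, buffer, and names are hypothetical, not the library's own):

```cpp
// Hypothetical, minimal illustration of matching the fence space to the memory
// being synchronized; this is not oneDPL code.
#include <sycl/sycl.hpp>
#include <cstdint>

int main()
{
    constexpr std::size_t num_bins = 16;
    sycl::queue q;
    sycl::buffer<std::uint32_t, 1> bins{sycl::range<1>{num_bins}};

    q.submit([&](sycl::handler& cgh) {
        sycl::accessor acc{bins, cgh, sycl::read_write};
        cgh.parallel_for(
            sycl::nd_range<1>{sycl::range<1>{num_bins}, sycl::range<1>{num_bins}},
            [=](sycl::nd_item<1> it) {
                // Each work-item clears one bin that lives in *global* memory.
                acc[it.get_local_id(0)] = 0;
                // SYCL 1.2.1-style barrier: the fence space must cover global memory here;
                // a local_space fence would only order work-group local accesses.
                it.barrier(sycl::access::fence_space::global_space);
                // After the barrier, reading a bin cleared by another work-item is safe.
                std::uint32_t neighbor = acc[(it.get_local_id(0) + 1) % num_bins];
                (void)neighbor;
            });
    });
    q.wait();
    return 0;
}
```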

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h

+20 −10

@@ -80,6 +80,12 @@ struct __subgroup_radix_sort
     {
         return __dpl_sycl::__local_accessor<_KeyT>(__buf_size, __cgh);
     }
+
+    inline static constexpr auto
+    get_fence()
+    {
+        return __dpl_sycl::__fence_space_local;
+    }
 };
 
 template <typename _KeyT>
@@ -94,6 +100,12 @@ struct __subgroup_radix_sort
     {
         return sycl::accessor(__buf, __cgh, sycl::read_write, __dpl_sycl::__no_init{});
     }
+
+    inline constexpr static auto
+    get_fence()
+    {
+        return __dpl_sycl::__fence_space_global;
+    }
 };
 
 template <typename _ValueT, typename _Wi, typename _Src, typename _Values>
@@ -175,8 +187,9 @@ struct __subgroup_radix_sort
 
                 //copy(move) values construction
                 __block_load<_ValT>(__wi, __src, __values.__v, __n);
+                // TODO: check if the barrier can be removed
+                __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
 
-                __dpl_sycl::__group_barrier(__it);
                 while (true)
                 {
                     uint16_t __indices[__block_size]; //indices for indirect access in the "re-order" phase
@@ -205,7 +218,7 @@ struct __subgroup_radix_sort
                         __indices[__i] = *__counters[__i];
                         *__counters[__i] = __indices[__i] + 1;
                     }
-                    __dpl_sycl::__group_barrier(__it);
+                    __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence());
 
                     //2. scan phase
                     {
@@ -218,8 +231,8 @@ struct __subgroup_radix_sort
                         _ONEDPL_PRAGMA_UNROLL
                         for (uint16_t __i = 1; __i < __bin_count; ++__i)
                             __bin_sum[__i] = __bin_sum[__i - 1] + __counter_lacc[__wi * __bin_count + __i];
+                        __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence());
 
-                        __dpl_sycl::__group_barrier(__it);
                         //exclusive scan local sum
                         uint16_t __sum_scan = __dpl_sycl::__exclusive_scan_over_group(
                             __it.get_group(), __bin_sum[__bin_count - 1], __dpl_sycl::__plus<uint16_t>());
@@ -230,7 +243,7 @@ struct __subgroup_radix_sort
 
                         if (__wi == 0)
                             __counter_lacc[0] = 0;
-                        __dpl_sycl::__group_barrier(__it);
+                        __dpl_sycl::__group_barrier(__it, decltype(__buf_count)::get_fence());
                     }
 
                     _ONEDPL_PRAGMA_UNROLL
@@ -244,7 +257,7 @@ struct __subgroup_radix_sort
                     __begin_bit += __radix;
 
                     //3. "re-order" phase
-                    __dpl_sycl::__group_barrier(__it);
+                    __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
                     if (__begin_bit >= __end_bit)
                     {
                         // the last iteration - writing out the result
@@ -268,7 +281,6 @@ struct __subgroup_radix_sort
                             if (__idx < __n)
                                 __exchange_lacc[__idx].~_ValT();
                         }
-
                         return;
                     }
 
@@ -293,8 +305,7 @@ struct __subgroup_radix_sort
                             __exchange_lacc[__r] = ::std::move(__values.__v[__i]);
                         }
                     }
-
-                    __dpl_sycl::__group_barrier(__it);
+                    __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
 
                     _ONEDPL_PRAGMA_UNROLL
                     for (uint16_t __i = 0; __i < __block_size; ++__i)
@@ -303,8 +314,7 @@ struct __subgroup_radix_sort
                         if (__idx < __n)
                             __values.__v[__i] = ::std::move(__exchange_lacc[__idx]);
                     }
-
-                    __dpl_sycl::__group_barrier(__it);
+                    __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence());
                 }
             }));
         });
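
Note on the radix-sort change: the __buf_val/__buf_count storage helpers now expose get_fence(), so each barrier picks the fence space matching where that helper actually placed the data (work-group local memory when it fits, a global-memory fallback otherwise). Below is a compile-only sketch of that dispatch pattern, with stand-in type names rather than oneDPL's internals:

```cpp
// Stand-in sketch of selecting a barrier fence space from the storage type.
#include <sycl/sycl.hpp>

// Data kept in work-group local memory: a local fence is sufficient.
struct local_storage
{
    static constexpr auto get_fence() { return sycl::access::fence_space::local_space; }
};

// Data spilled to global memory: the barrier must also fence global accesses.
struct global_storage
{
    static constexpr auto get_fence() { return sycl::access::fence_space::global_space; }
};

// Generic kernel code asks the storage type which fence it needs, mirroring the
// __dpl_sycl::__group_barrier(__it, decltype(__buf_val)::get_fence()) calls above.
template <typename Storage>
void sync_after_write(const sycl::nd_item<1>& it)
{
    it.barrier(Storage::get_fence());
}
```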

include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h

+35 −7

@@ -69,6 +69,7 @@
 #define _ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_REQD_SUB_GROUP_SIZE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
+#define _ONEDPL_SYCL2020_GROUP_BARRIER_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_TARGET_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50400))
 #define _ONEDPL_SYCL2020_TARGET_DEVICE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50400))
 #define _ONEDPL_SYCL2020_ATOMIC_REF_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50500))
@@ -225,17 +226,44 @@ __get_accessor_size(const _Accessor& __accessor)
 #endif
 }
 
+// TODO: switch to SYCL 2020 with DPC++ compiler.
+// SYCL 1.2.1 version is used due to having an API with a local memory fence,
+// which gives better performance on Intel GPUs.
+// The performance gap is negligible since
+// https://github.com/intel/intel-graphics-compiler/commit/ed639f68d142bc963a7b626badc207a42fb281cb (Aug 20, 2024)
+// But the fix is not a part of the LTS GPU drivers (Linux) yet.
+//
+// This macro may also serve as a temporary workaround to strengthen the barriers
+// if there are cases where the memory ordering is not strong enough.
+#if !defined(_ONEDPL_SYCL121_GROUP_BARRIER)
+#    if _ONEDPL_LIBSYCL_VERSION
+#        define _ONEDPL_SYCL121_GROUP_BARRIER 1
+#    else
+// For safety, assume that other SYCL implementations comply with SYCL 2020, which is a oneDPL requirement.
+#        define _ONEDPL_SYCL121_GROUP_BARRIER 0
+#    endif
+#endif
+
+#if _ONEDPL_SYCL121_GROUP_BARRIER
+using __fence_space_t = sycl::access::fence_space;
+inline constexpr __fence_space_t __fence_space_local = sycl::access::fence_space::local_space;
+inline constexpr __fence_space_t __fence_space_global = sycl::access::fence_space::global_space;
+#else
+struct __fence_space_t{}; // No-op dummy type since SYCL 2020 does not specify memory fence spaces in group barriers
+inline constexpr __fence_space_t __fence_space_local{};
+inline constexpr __fence_space_t __fence_space_global{};
+#endif // _ONEDPL_SYCL121_GROUP_BARRIER
+
 template <typename _Item>
-constexpr void
-__group_barrier(_Item __item)
+void
+__group_barrier(_Item __item, [[maybe_unused]] __dpl_sycl::__fence_space_t __space = __fence_space_local)
 {
-#if 0 // !defined(_ONEDPL_LIBSYCL_VERSION) || _ONEDPL_LIBSYCL_VERSION >= 50300
-    //TODO: usage of sycl::group_barrier: probably, we have to revise SYCL parallel patterns which use a group_barrier.
-    // 1) sycl::group_barrier() implementation is not ready
-    // 2) sycl::group_barrier and sycl::item::group_barrier are not quite equivalent
+#if _ONEDPL_SYCL121_GROUP_BARRIER
+    __item.barrier(__space);
+#elif _ONEDPL_SYCL2020_GROUP_BARRIER_PRESENT
     sycl::group_barrier(__item.get_group(), sycl::memory_scope::work_group);
 #else
-    __item.barrier(sycl::access::fence_space::local_space);
+#    error "sycl::group_barrier is not supported, and no alternative is available"
 #endif
 }
