Bugfix to decay Policy for __result_and_scratch_storage #2031

Merged (7 commits) on Jan 31, 2025
47 changes: 25 additions & 22 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -615,8 +615,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend

// Although we do not actually need result storage in this case, we need to construct
// a placeholder here to match the return type of the non-single-work-group implementation
- using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _ValueType>;
- __result_and_scratch_storage_t __dummy_result_and_scratch{__exec, 0, 0};
+ __result_and_scratch_storage<_ExecutionPolicy, _ValueType> __dummy_result_and_scratch{__exec, 0, 0};

if (__max_wg_size >= __targeted_wg_size)
{
@@ -1093,7 +1092,7 @@ struct __write_to_id_if_else
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
typename _BinaryOperation, typename _Inclusive>
auto
- __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __in_rng, _Range2&& __out_rng, std::size_t __n, _UnaryOperation __unary_op,
_InitType __init, _BinaryOperation __binary_op, _Inclusive)
{
@@ -1122,9 +1121,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 2048 : 16384;
if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit))
{
- return __parallel_transform_scan_single_group(__backend_tag, __exec, std::forward<_Range1>(__in_rng),
- std::forward<_Range2>(__out_rng), __n, __unary_op, __init,
- __binary_op, _Inclusive{});
+ return __parallel_transform_scan_single_group(
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+ std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
}
}
#if _ONEDPL_COMPILE_KERNEL
@@ -1161,7 +1160,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
_NoOpFunctor __get_data_op;

return __parallel_transform_scan_base(
- __backend_tag, __exec, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __init,
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+ std::forward<_Range2>(__out_rng), __init,
// local scan
unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner,
_NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op,
@@ -1283,7 +1283,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _BinaryPredicate>
auto
- __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred)
{
using _Assign = oneapi::dpl::__internal::__pstl_assign;
@@ -1316,8 +1316,9 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
decltype(__n)>;
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;

- return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
- __n, _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
+ return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+ std::forward<_Range2>(__result), __n,
+ _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
_CopyOp{_ReduceOp{}, _Assign{}});
}

@@ -1357,7 +1358,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryPredicate>
auto
- __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
{
oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
@@ -1383,14 +1384,14 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen
using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;

- return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result),
- __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+ return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+ std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
}

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
typename _Assign = oneapi::dpl::__internal::__pstl_assign>
auto
- __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{})
{
using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;
@@ -1440,8 +1441,9 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
/*inclusive*/ std::true_type, 1>;

- return __parallel_scan_copy(__backend_tag, __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
- __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+ return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
+ std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred},
+ _CopyOp{_ReduceOp{}, __assign});
}

#if _ONEDPL_COMPILE_KERNEL
@@ -1534,7 +1536,7 @@ __parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
typename _IsOpDifference>
auto
- __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
+ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
_IsOpDifference __is_op_difference)
{
@@ -1552,8 +1554,9 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, c
}
}
#endif
- return __parallel_set_scan(__backend_tag, __exec, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
- std::forward<_Range3>(__result), __comp, __is_op_difference);
+ return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
+ std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
+ __is_op_difference);
}

//------------------------------------------------------------------------
@@ -2467,8 +2470,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Range4,
typename _BinaryPredicate, typename _BinaryOperator>
oneapi::dpl::__internal::__difference_t<_Range3>
- __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, const _ExecutionPolicy& __exec,
- _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
+ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys,
+ _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values,
_BinaryPredicate __binary_pred, _BinaryOperator __binary_op)
{
// The algorithm reduces values in __values where the
Expand Down Expand Up @@ -2506,7 +2509,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, cons
}
#endif
return __parallel_reduce_by_segment_fallback(
- oneapi::dpl::__internal::__device_backend_tag{}, __exec,
+ oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
std::forward<_Range4>(__out_values), __binary_pred, __binary_op,
oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{});
@@ -186,13 +186,12 @@ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename...
struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _VecSize,
__internal::__optional_kernel_name<_KernelName...>>
{
- template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _TransformOp,
- typename _ExecutionPolicy2, typename... _Ranges>
+ template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _TransformOp, typename... _Ranges>
auto
operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n,
const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
_TransformOp __transform_op,
- const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container,
+ const __result_and_scratch_storage<_ExecutionPolicy, _Tp>& __scratch_container,
_Ranges&&... __rngs) const
{
auto __transform_pattern =
@@ -215,7 +214,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), sycl::range<1>(__work_group_size)),
[=](sycl::nd_item<1> __item_id) {
auto __temp_ptr =
- __result_and_scratch_storage<_ExecutionPolicy2, _Tp>::__get_usm_or_buffer_accessor_ptr(
+ __result_and_scratch_storage<_ExecutionPolicy, _Tp>::__get_usm_or_buffer_accessor_ptr(
__temp_acc);
__device_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full, __n_groups,
__transform_pattern, __reduce_pattern, __temp_local, __temp_ptr,
@@ -235,12 +234,11 @@ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename..
struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize,
__internal::__optional_kernel_name<_KernelName...>>
{
- template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _InitType,
- typename _ExecutionPolicy2>
+ template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _InitType>
auto
operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event,
const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
- _InitType __init, const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const
+ _InitType __init, const __result_and_scratch_storage<_ExecutionPolicy, _Tp>& __scratch_container) const
{
using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;
auto __transform_pattern =
@@ -250,7 +248,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative

const bool __is_full = __n == __work_group_size * __iters_per_work_item;

- using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy2, _Tp>;
+ using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _Tp>;

__reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
__cgh.depends_on(__reduce_event);
18 changes: 12 additions & 6 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -528,7 +528,7 @@ struct __result_and_scratch_storage_base
};

template <typename _ExecutionPolicy, typename _T>
- struct __result_and_scratch_storage : __result_and_scratch_storage_base
+ struct __result_and_scratch_storage_impl : __result_and_scratch_storage_base
{
private:
using __sycl_buffer_t = sycl::buffer<_T, 1>;
@@ -576,10 +576,10 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
}

public:
- __result_and_scratch_storage(const _ExecutionPolicy& __exec_, std::size_t __result_n, std::size_t __scratch_n)
+ __result_and_scratch_storage_impl(const _ExecutionPolicy& __exec_, std::size_t __result_n, std::size_t __scratch_n)
: __exec{__exec_}, __result_n{__result_n}, __scratch_n{__scratch_n},
- __use_USM_host{__use_USM_host_allocations(__exec.queue())}, __supports_USM_device{
- __use_USM_allocations(__exec.queue())}
+ __use_USM_host{__use_USM_host_allocations(__exec.queue())},
+ __supports_USM_device{__use_USM_allocations(__exec.queue())}
{
const std::size_t __total_n = __scratch_n + __result_n;
// Skip in case this is a dummy container
@@ -696,6 +696,9 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
}
};

+ template <typename _ExecutionPolicy, typename _T>
+ using __result_and_scratch_storage = __result_and_scratch_storage_impl<std::decay_t<_ExecutionPolicy>, _T>;

// Tag __async_mode describe a pattern call mode which should be executed asynchronously
struct __async_mode
{
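The decayed alias added above is the heart of the fix: the backend patterns now take the policy as a forwarding reference, so the deduced policy type may be reference-qualified, and instantiating the storage class directly with it would yield a different type than an instantiation from the plain policy type. A minimal sketch of that idea, using simplified stand-in names rather than the actual oneDPL types:

```cpp
#include <type_traits>

// Stand-ins (not oneDPL code) for __result_and_scratch_storage_impl and the decaying alias.
template <typename Policy, typename T>
struct storage_impl
{
};

template <typename Policy, typename T>
using storage = storage_impl<std::decay_t<Policy>, T>;

struct my_policy
{
};

template <typename Policy>
void
pattern(Policy&& /*exec*/)
{
    // With an lvalue argument, Policy is deduced as my_policy&; the decay in the alias
    // collapses the lvalue and rvalue cases into the same storage instantiation.
    static_assert(std::is_same_v<storage<Policy, int>, storage_impl<my_policy, int>>);
}

int
main()
{
    my_policy p;
    pattern(p);            // Policy deduced as my_policy&
    pattern(my_policy{});  // Policy deduced as my_policy
}
```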
@@ -725,9 +728,12 @@ class __future : private std::tuple<_Args...>
return __buf.get_host_access(sycl::read_only)[0];
}

- template <typename _ExecutionPolicy, typename _T>
+ // Here we use __result_and_scratch_storage_impl rather than __result_and_scratch_storage because we need to
+ // match the type with the overload and are deducing the policy type. If we used __result_and_scratch_storage,
+ // it would cause issues in type deduction due to decay of the policy in that using statement.
+ template <typename _DecayedExecutionPolicy, typename _T>
constexpr auto
- __wait_and_get_value(const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
+ __wait_and_get_value(const __result_and_scratch_storage_impl<_DecayedExecutionPolicy, _T>& __storage)
{
return __storage.__wait_and_get_value(__my_event);
}
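The comment in the last hunk is about template argument deduction: because the `__result_and_scratch_storage` alias transforms its first argument with `std::decay_t`, a function parameter written in terms of the alias places the policy in a non-deduced context, so `__wait_and_get_value` is declared against `__result_and_scratch_storage_impl` instead. A minimal sketch of the deduction issue, again with simplified stand-in names rather than the oneDPL types:

```cpp
#include <type_traits>

// Stand-ins (not oneDPL code) mirroring __result_and_scratch_storage_impl and its alias.
template <typename Policy, typename T>
struct storage_impl
{
};

// The alias transforms Policy, so Policy cannot be deduced through it.
template <typename Policy, typename T>
using storage = storage_impl<std::decay_t<Policy>, T>;

struct my_policy
{
};

// Declared against the impl type: DecayedPolicy is deduced directly from the argument.
template <typename DecayedPolicy, typename T>
int
wait_and_get_value(const storage_impl<DecayedPolicy, T>&)
{
    return 0;
}

// A declaration written against the alias would expand to
//   storage_impl<std::decay_t<Policy>, T>
// where Policy appears only inside std::decay_t, a non-deduced context,
// so Policy could not be deduced from a call argument.

int
main()
{
    storage<my_policy&, int> s; // same type as storage_impl<my_policy, int>
    return wait_and_get_value(s);
}
```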