diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h
index ae9094f721a..8bd2a1a3644 100644
--- a/include/oneapi/dpl/pstl/algorithm_impl.h
+++ b/include/oneapi/dpl/pstl/algorithm_impl.h
@@ -1197,6 +1197,25 @@ __brick_copy_if(_ForwardIterator __first, _ForwardIterator __last, _OutputIterat
     return ::std::copy_if(__first, __last, __result, __pred);
 }
 
+template <typename _ForwardIterator, typename _OutputIterator, typename _UnaryPredicate>
+std::pair<_ForwardIterator, _OutputIterator>
+__brick_copy_if(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result,
+                typename std::iterator_traits<_OutputIterator>::difference_type __m, _UnaryPredicate __pred,
+                /*vector=*/::std::false_type) noexcept
+{
+    for (; __first != __last && __m > 0; ++__first)
+    {
+        const auto& __v = *__first;
+        if (__pred(__v))
+        {
+            *__result = __v;
+            ++__result;
+            --__m;
+        }
+    }
+    return {__first, __result};
+}
+
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _UnaryPredicate>
 _RandomAccessIterator2
 __brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result,
@@ -1206,7 +1225,20 @@ __brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _
 #if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT)
     return __unseq_backend::__simd_copy_if(__first, __last - __first, __result, __pred);
 #else
-    return ::std::copy_if(__first, __last, __result, __pred);
+    return __brick_copy_if(__first, __last, __result, __pred, std::false_type{});
 #endif
 }
 
+template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _UnaryPredicate>
+std::pair<_RandomAccessIterator1, _RandomAccessIterator2>
+__brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result,
+                typename std::iterator_traits<_RandomAccessIterator2>::difference_type __m, _UnaryPredicate __pred,
+                /*vector=*/::std::true_type) noexcept
+{
+#if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT)
+    return __unseq_backend::__simd_copy_if(__first, __last - __first, __result, __m, __pred);
+#else
+    return __brick_copy_if(__first, __last, __result, __m, __pred, std::false_type{});
+#endif
+}
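Note: the new bounded overloads return how far *both* sequences advanced rather than just the output end. A minimal standalone sketch of the same semantics; the name `bounded_copy_if` is illustrative and not part of oneDPL:

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Copy elements satisfying pred until the input is exhausted or m outputs
// have been written; report where both sequences stopped.
template <typename It, typename OutIt, typename Pred>
std::pair<It, OutIt>
bounded_copy_if(It first, It last, OutIt result, std::ptrdiff_t m, Pred pred)
{
    for (; first != last && m > 0; ++first)
    {
        if (pred(*first))
        {
            *result = *first;
            ++result;
            --m;
        }
    }
    return {first, result}; // {next unread input, one past last written output}
}

int main()
{
    std::vector<int> in{1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> out(2); // room for only two results
    auto [it_in, it_out] = bounded_copy_if(in.begin(), in.end(), out.begin(), 2,
                                           [](int x) { return x % 2 == 0; });
    // out == {2, 4}; it_in points at 5, the first element never examined.
}
```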
@@ -1233,6 +1265,28 @@ __brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, bool* __r
     return ::std::make_pair(__count_true, __size - __count_true);
 }
 
+template <typename _DifferenceType, typename _ForwardIterator, typename _Bound, typename _UnaryPredicate>
+std::pair<_DifferenceType, _ForwardIterator>
+__brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, _Bound __m, bool* __restrict __mask,
+                    _UnaryPredicate __pred, /*vector=*/::std::false_type) noexcept
+{
+    auto __count_true = _DifferenceType(0);
+    auto __size = __last - __first;
+
+    static_assert(__is_random_access_iterator_v<_ForwardIterator>,
+                  "Pattern-brick error. Should be a random access iterator.");
+
+    for (; __first != __last && __count_true < __m; ++__first, ++__mask)
+    {
+        *__mask = __pred(*__first);
+        if (*__mask)
+        {
+            ++__count_true;
+        }
+    }
+    return {__count_true, __first};
+}
+
 template <typename _DifferenceType, typename _RandomAccessIterator, typename _UnaryPredicate>
 ::std::pair<_DifferenceType, _DifferenceType>
 __brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, bool* __mask, _UnaryPredicate __pred,
@@ -1242,6 +1296,15 @@ __brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last,
     return ::std::make_pair(__result, (__last - __first) - __result);
 }
 
+template <typename _DifferenceType, typename _RandomAccessIterator, typename _Bound, typename _UnaryPredicate>
+std::pair<_DifferenceType, _RandomAccessIterator>
+__brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, _Bound __m, bool* __mask,
+                    _UnaryPredicate __pred, /*vector=*/::std::true_type) noexcept
+{
+    auto __result = __unseq_backend::__simd_calc_mask_1(__first, __last - __first, __m, __mask, __pred);
+    return {__result.first, __first + __result.second};
+}
+
 template <typename _ForwardIterator, typename _OutputIterator, typename _Assigner>
 void
 __brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, bool* __mask,
@@ -1257,6 +1320,23 @@ __brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputI
     }
 }
 
+template <typename _ForwardIterator, typename _OutputIterator, typename _Bound, typename _Assigner>
+std::pair<_ForwardIterator, _OutputIterator>
+__brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, _Bound __m,
+                     bool* __mask, _Assigner __assigner, /*vector=*/::std::false_type) noexcept
+{
+    for (; __first != __last && __m > 0; ++__first, ++__mask)
+    {
+        if (*__mask)
+        {
+            __assigner(__first, __result);
+            ++__result;
+            --__m;
+        }
+    }
+    return {__first, __result};
+}
+
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Assigner>
 void
 __brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result,
@@ -1269,6 +1349,18 @@ __brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __la
 #endif
 }
 
+template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Bound, typename _Assigner>
+auto
+__brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result,
+                     _Bound __m, bool* __restrict __mask, _Assigner __assigner, /*vector=*/::std::true_type) noexcept
+{
+#if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT)
+    return __unseq_backend::__simd_copy_by_mask(__first, __last - __first, __result, __m, __mask, __assigner);
+#else
+    return __internal::__brick_copy_by_mask(__first, __last, __result, __m, __mask, __assigner, ::std::false_type());
+#endif
+}
+
 template <typename _ForwardIterator, typename _OutputIterator1, typename _OutputIterator2>
 void
 __brick_partition_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator1 __out_true,
@@ -1312,6 +1404,16 @@ __pattern_copy_if(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _ForwardIt
     return __internal::__brick_copy_if(__first, __last, __result, __pred, typename _Tag::__is_vector{});
 }
 
+template <typename _Tag, typename _ExecutionPolicy, typename _ForwardIterator, typename _OutputIterator,
+          typename _UnaryPredicate>
+std::pair<_ForwardIterator, _OutputIterator>
+__pattern_copy_if(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _ForwardIterator __last,
+                  _OutputIterator __result, _UnaryPredicate __pred,
+                  typename std::iterator_traits<_OutputIterator>::difference_type __n_out) noexcept
+{
+    static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);
+
+    return __internal::__brick_copy_if(__first, __last, __result, __n_out, __pred, typename _Tag::__is_vector{});
+}
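Note: the serial bricks above keep the classic two-pass structure: `__brick_calc_mask_1` records predicate results and stops counting at the output bound, and `__brick_copy_by_mask` replays the mask. A hedged sketch of that split; the helper names are invented for illustration:

```cpp
#include <cstddef>
#include <utility>

// Phase 1: fill mask[i] = pred(in[i]) until count_true reaches the bound m.
// Returns {number of true entries, number of inputs examined}.
template <typename T, typename Pred>
std::pair<std::ptrdiff_t, std::ptrdiff_t>
calc_mask_bounded(const T* in, std::ptrdiff_t n, std::ptrdiff_t m, bool* mask, Pred pred)
{
    std::ptrdiff_t count_true = 0, i = 0;
    for (; i < n && count_true < m; ++i)
    {
        mask[i] = pred(in[i]);
        if (mask[i])
            ++count_true;
    }
    return {count_true, i};
}

// Phase 2: copy the elements whose mask bit is set, at most m of them.
// Returns {inputs consumed, outputs written}.
template <typename T>
std::pair<std::ptrdiff_t, std::ptrdiff_t>
copy_by_mask_bounded(const T* in, std::ptrdiff_t n, T* out, std::ptrdiff_t m, const bool* mask)
{
    std::ptrdiff_t written = 0, i = 0;
    for (; i < n && written < m; ++i)
        if (mask[i])
            out[written++] = in[i];
    return {i, written};
}
```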
 
 template <typename _IsVector, typename _ExecutionPolicy, typename _RandomAccessIterator1,
           typename _RandomAccessIterator2, typename _UnaryPredicate>
 _RandomAccessIterator2
@@ -1322,6 +1424,7 @@ __pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA
     typedef typename ::std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType;
     const _DifferenceType __n = __last - __first;
+
     if (_DifferenceType(1) < __n)
     {
         __par_backend::__buffer<_ExecutionPolicy, bool> __mask_buf(__exec, __n);
@@ -1349,6 +1452,50 @@ __pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA
     return __internal::__brick_copy_if(__first, __last, __result, __pred, _IsVector{});
 }
 
+template <typename _IsVector, typename _ExecutionPolicy, typename _RandomAccessIterator1,
+          typename _RandomAccessIterator2, typename _UnaryPredicate>
+std::pair<_RandomAccessIterator1, _RandomAccessIterator2>
+__pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first,
+                  _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, _UnaryPredicate __pred,
+                  typename std::iterator_traits<_RandomAccessIterator2>::difference_type __n_out)
+{
+    using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
+
+    typedef typename ::std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType;
+    const _DifferenceType __n = __last - __first;
+
+    if (__n_out < 0)
+        __n_out = __n;
+
+    if (_DifferenceType(1) < __n)
+    {
+        __par_backend::__buffer<_ExecutionPolicy, bool> __mask_buf(__exec, __n);
+        return __internal::__except_handler([&__exec, __n, __first, __result, __pred, &__mask_buf, __n_out]() {
+            bool* __mask = __mask_buf.get();
+            _DifferenceType __res_in{}, __res_out{};
+            __par_backend::__parallel_strict_scan(
+                __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0),
+                [=](_DifferenceType __i, _DifferenceType __len) { // Reduce
+                    return __internal::__brick_calc_mask_1<_DifferenceType>(__first + __i, __first + (__i + __len),
                                                                             __n_out, __mask + __i, __pred, _IsVector{})
+                        .first;
+                },
+                ::std::plus<_DifferenceType>(), // Combine
+                [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial,
+                    _DifferenceType __len_out) { // Scan
+                    auto res = __internal::__brick_copy_by_mask(
+                        __first + __i, __first + (__i + __len), __result + __initial, __len_out, __mask + __i,
+                        [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{});
+                    return std::make_pair(res.first - (__first + __i), res.second - (__result + __initial));
+                },
+                [&__res_in, &__res_out](auto __total_in, auto __total_out) {
+                    __res_in = __total_in;
+                    __res_out = __total_out;
+                },
+                __n_out);
+            return std::make_pair(__first + __res_in, __result + __res_out);
+        });
+    }
+    // trivial sequence - use serial algorithm
+    return __internal::__brick_copy_if(__first, __last, __result, __n_out, __pred, _IsVector{});
+}
+
 //------------------------------------------------------------------------
 // count
 //------------------------------------------------------------------------
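Note: the parallel pattern delegates the bookkeeping to `__parallel_strict_scan` with the extra `__n_out` argument: per chunk, the *reduce* step counts matches (capped by `__n_out`), the exclusive scan of those counts gives each chunk its output offset, and the *scan* step copies at most the remaining output slots. A serial model of that decomposition; chunking and names are illustrative:

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Serial model of the reduce/scan protocol used above: each chunk copies
// into its own slot of the output, never past the n_out bound.
template <typename T, typename Pred>
std::pair<std::size_t, std::size_t>
chunked_copy_if(const std::vector<T>& in, std::vector<T>& out, std::size_t n_out, Pred pred,
                std::size_t chunk = 4)
{
    std::size_t consumed = 0, written = 0;
    for (std::size_t i = 0; i < in.size() && written < n_out; i += chunk)
    {
        std::size_t len = std::min(chunk, in.size() - i);
        // "Scan" step for this chunk: its initial offset is `written`,
        // and at most n_out - written slots remain.
        for (std::size_t j = 0; j < len && written < n_out; ++j, ++consumed)
            if (pred(in[i + j]))
                out[written++] = in[i + j];
    }
    return {consumed, written}; // what __apex receives as the grand totals
}
```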
diff --git a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h
index 55d29a56be8..540e58ecd55 100644
--- a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h
+++ b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h
@@ -420,14 +420,14 @@ __pattern_copy_if_ranges(_Tag __tag, _ExecutionPolicy&& __exec, _InRange&& __in_
     auto __pred_1 = [__pred, __proj](auto&& __val) {
         return std::invoke(__pred, std::invoke(__proj, std::forward<decltype(__val)>(__val)));};
 
-    auto __res_idx = oneapi::dpl::__internal::__pattern_copy_if(__tag, std::forward<_ExecutionPolicy>(__exec),
+    auto __res = oneapi::dpl::__internal::__pattern_copy_if(__tag, std::forward<_ExecutionPolicy>(__exec),
         std::ranges::begin(__in_r), std::ranges::begin(__in_r) + std::ranges::size(__in_r),
-        std::ranges::begin(__out_r), __pred_1) - std::ranges::begin(__out_r);
+        std::ranges::begin(__out_r), __pred_1, std::ranges::size(__out_r));
 
     using __return_type = std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<_InRange>,
                                                       std::ranges::borrowed_iterator_t<_OutRange>>;
-    return __return_type{std::ranges::begin(__in_r) + std::ranges::size(__in_r), std::ranges::begin(__out_r) + __res_idx};
+    return __return_type{__res.first, __res.second};
 }
 
 template <typename _ExecutionPolicy, typename _InRange, typename _OutRange, typename _Pred, typename _Proj>
@@ -435,7 +435,21 @@ auto
 __pattern_copy_if_ranges(__serial_tag</*_IsVector*/ std::false_type>, _ExecutionPolicy&& __exec, _InRange&& __in_r,
                          _OutRange&& __out_r, _Pred __pred, _Proj __proj)
 {
-    return std::ranges::copy_if(std::forward<_InRange>(__in_r), std::ranges::begin(__out_r), __pred, __proj);
+    using __return_type = std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<_InRange>,
+                                                      std::ranges::borrowed_iterator_t<_OutRange>>;
+
+    auto __it_in = std::ranges::begin(__in_r);
+    auto __it_out = std::ranges::begin(__out_r);
+    for (; __it_in != std::ranges::end(__in_r) && __it_out != std::ranges::end(__out_r); ++__it_in)
+    {
+        if (std::invoke(__pred, std::invoke(__proj, *__it_in)))
+        {
+            *__it_out = *__it_in;
+            ++__it_out;
+        }
+    }
+
+    return __return_type{__it_in, __it_out};
 }
 
 //---------------------------------------------------------------------------------------------------------------------
diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
index 65bf99c8777..a609867dee8 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
@@ -901,15 +901,14 @@ __pattern_copy_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato
     if (__first == __last)
         return __result_first;
 
-    _It1DifferenceType __n = __last - __first;
-
+    auto __n = __last - __first;
     auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>();
     auto __buf1 = __keep1(__first, __last);
     auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>();
     auto __buf2 = __keep2(__result_first, __result_first + __n);
 
     auto __res = __par_backend_hetero::__parallel_copy_if(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
-                                                          __buf1.all_view(), __buf2.all_view(), __n, __pred);
+                                                          __buf1.all_view(), __buf2.all_view(), __pred);
 
     ::std::size_t __num_copied = __res.get(); //is a blocking call
     return __result_first + __num_copied;
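Note: with these changes the serial and parallel ranges paths agree on the bounded-output semantics: copying stops at whichever range is exhausted first, and the result carries both stopping points. A usage sketch against the standard library only, mirroring the serial fallback above (not oneDPL-specific):

```cpp
#include <ranges>
#include <vector>

int main()
{
    std::vector<int> in{1, 2, 3, 4, 5, 6};
    std::vector<int> out(2); // smaller than the number of matches

    auto it_in = std::ranges::begin(in);
    auto it_out = std::ranges::begin(out);
    // Stop when either the input or the output range ends.
    for (; it_in != std::ranges::end(in) && it_out != std::ranges::end(out); ++it_in)
    {
        if (*it_in % 2 == 0)
            *it_out++ = *it_in;
    }
    // out == {2, 4}; it_in stopped just past the 4, not at in.end().
}
```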
diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
index da7820b91a2..b03b73c78e7 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
@@ -536,19 +536,22 @@ __pattern_count_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _
 
 template <typename _BackendTag, typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Predicate,
           typename _Assign>
-oneapi::dpl::__internal::__difference_t<_Range2>
+std::pair<oneapi::dpl::__internal::__difference_t<_Range1>, oneapi::dpl::__internal::__difference_t<_Range2>>
 __pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2,
                   _Predicate __pred, _Assign __assign)
 {
-    oneapi::dpl::__internal::__difference_t<_Range2> __n = __rng1.size();
-    if (__n == 0)
-        return 0;
+    using _Index = std::size_t; //TODO
+    _Index __n = __rng1.size();
+    if (__n == 0 || __rng2.empty())
+        return {0, 0};
 
-    auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if(
+    auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if_out_lim(
         _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
-        std::forward<_Range2>(__rng2), __n, __pred, __assign);
+        std::forward<_Range2>(__rng2), __pred, __assign);
 
-    return __res.get(); //is a blocking call
+    std::array<_Index, 2> __idx;
+    __res.get_values(__idx); //a blocking call
+    return {__idx[1], __idx[0]}; //__parallel_copy_if_out_lim returns {last index in output, last index in input}
 }
 
 #if _ONEDPL_CPP20_RANGES_PRESENT
@@ -561,7 +564,7 @@ __pattern_copy_if_ranges(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e
     auto __pred_1 = [__pred, __proj](auto&& __val) {
         return std::invoke(__pred, std::invoke(__proj, std::forward<decltype(__val)>(__val)));};
 
-    auto __res_idx = oneapi::dpl::__internal::__ranges::__pattern_copy_if(__tag,
+    auto __res = oneapi::dpl::__internal::__ranges::__pattern_copy_if(__tag,
         std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::views::all_read(__in_r),
         oneapi::dpl::__ranges::views::all_write(__out_r), __pred_1, oneapi::dpl::__internal::__pstl_assign());
 
     using __return_t = std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<_InRange>,
                                                    std::ranges::borrowed_iterator_t<_OutRange>>;
 
-    return __return_t{std::ranges::begin(__in_r) + std::ranges::size(__in_r), std::ranges::begin(__out_r) + __res_idx};
+    return __return_t{std::ranges::begin(__in_r) + __res.first, std::ranges::begin(__out_r) + __res.second};
 }
 
 #endif //_ONEDPL_CPP20_RANGES_PRESENT
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index c7b72625315..6f72994eca1 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -316,7 +316,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
         // Storage for the results of scan for each workgroup
         using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _Type>;
-        __result_and_scratch_storage_t __result_and_scratch{__exec, 1, __n_groups + 1};
+        __result_and_scratch_storage_t __result_and_scratch{__exec, 2, __n_groups + 1};
 
         _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu);
@@ -1228,13 +1228,14 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _
                                  /*_Inclusive=*/std::true_type{}, __is_unique_pattern);
 }
 
-template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _CreateMaskOp,
-          typename _CopyByMaskOp>
+template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _CreateMaskOp,
+          typename _CopyByMaskOp>
 auto
 __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
-                     _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op,
+                     _InRng&& __in_rng, _OutRng&& __out_rng, _CreateMaskOp __create_mask_op,
                      _CopyByMaskOp __copy_by_mask_op)
 {
+    using _Size = decltype(__out_rng.size());
     using _ReduceOp = std::plus<_Size>;
     using _Assigner = unseq_backend::__scan_assigner;
     using _NoAssign = unseq_backend::__scan_no_assign;
@@ -1248,7 +1249,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
     _MaskAssigner __add_mask_op;
 
     // temporary buffer to store boolean mask
-    oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n);
+    oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __in_rng.size());
 
     return __parallel_transform_scan_base(
         __backend_tag, std::forward<_ExecutionPolicy>(__exec),
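Note: since `_Size` can no longer be passed in, it is now derived from the ranges themselves: the mask buffer is sized from the input while the scan's size type follows the output. A small sketch of that assumption (names invented, not oneDPL API):

```cpp
#include <utility>

// Toy illustration of where the sizes now come from: the former _Size
// template parameter is replaced by the output range's size type.
template <typename InRng, typename OutRng>
auto scan_copy_sizes(const InRng& in, const OutRng& out)
{
    using Size = decltype(out.size());
    Size mask_elems = static_cast<Size>(in.size()); // mask covers every input element
    Size out_limit = out.size();                    // the output bound
    return std::pair<Size, Size>{mask_elems, out_limit};
}
```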
@@ -1299,7 +1300,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
     using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
 
     return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n,
+                                std::forward<_Range2>(__result),
                                 _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
                                 _CopyOp{_ReduceOp{}, _Assign{}});
 }
@@ -1360,16 +1361,35 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen
     using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;
 
     return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+                                std::forward<_Range2>(__result), _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
 }
 
-template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
-          typename _Assign = oneapi::dpl::__internal::__pstl_assign>
+template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred,
+          typename _Assign = oneapi::dpl::__internal::__pstl_assign>
+auto
+__parallel_copy_if_out_lim(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
+                           _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign = _Assign{})
+{
+    using _Size = decltype(__out_rng.size());
+    using _ReduceOp = std::plus<_Size>;
+    using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
+    using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
+                                                  /*inclusive*/ std::true_type, 1>;
+
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
+                                _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+}
+
+template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred,
+          typename _Assign = oneapi::dpl::__internal::__pstl_assign>
 auto
 __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
-                   _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{})
+                   _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign = _Assign{})
 {
+    auto __n = __in_rng.size();
+    using _Size = decltype(__n);
     using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;
 
     // Next power of 2 greater than or equal to __n
@@ -1413,7 +1433,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
                                        /*inclusive*/ std::true_type, 1>;
 
         return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
-                                    std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
+                                    std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
                                     _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
     }
 }
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 3d9d923d80d..c36032617fe 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -22,7 +22,7 @@
 #include
 #include
 
-#include "../../iterator_impl.h"
+#include "../../iterator_impl.h"
 
 #include "sycl_defs.h"
 #include "execution_sycl_defs.h"
@@ -665,6 +665,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
     _T __get_value(size_t idx = 0) const
     {
+        assert(__result_n > 0);
         assert(idx < __result_n);
         if (__use_USM_host && __supports_USM_device)
         {
@@ -682,6 +683,26 @@
         }
     }
 
+    template <std::size_t _N>
+    void
+    get_values(std::array<_T, _N>& __arr) const
+    {
+        assert(__result_n > 0);
+        assert(_N == __result_n);
+        if (__use_USM_host && __supports_USM_device)
+        {
+            std::copy_n(__result_buf.get(), __result_n, __arr.begin());
+        }
+        else if (__supports_USM_device)
+        {
+            __exec.queue().memcpy(__arr.begin(), __scratch_buf.get() + __scratch_n, __result_n * sizeof(_T)).wait();
+        }
+        else
+        {
+            auto _acc_h = __sycl_buf->get_host_access(sycl::read_only);
+            std::copy_n(_acc_h.begin() + __scratch_n, __result_n, __arr.begin());
+        }
+    }
+
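Note: `get_values` generalizes `__get_value` to read all `__result_n` stored results at once, with the `std::array` size checked against the stored count. A host-only mock of that contract (no SYCL; names invented):

```cpp
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <vector>

// Mock of the storage contract: result_n values live in a buffer, and
// get_values copies exactly that many into a caller-provided array.
struct result_storage_mock
{
    std::vector<int> buf;     // stand-in for the device/host result buffer
    std::size_t result_n = 2; // two results here: {output end, input end}

    template <std::size_t N>
    void get_values(std::array<int, N>& arr) const
    {
        assert(result_n > 0);
        assert(N == result_n); // array size must match the stored result count
        std::copy_n(buf.begin(), result_n, arr.begin());
    }
};

int main()
{
    result_storage_mock s{{42, 7}};
    std::array<int, 2> idx;
    s.get_values(idx); // idx == {42, 7}
}
```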
 template <typename _Event>
 _T
 __wait_and_get_value(_Event&& __event, size_t idx = 0) const
@@ -691,6 +712,49 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
         return __get_value(idx);
     }
 
+    template <typename _Event, std::size_t _N>
+    void
+    __wait_and_get_value(_Event&& __event, std::array<_T, _N>& __arr) const
+    {
+        if (is_USM())
+            __event.wait_and_throw();
+
+        get_values(__arr);
+    }
+};
+
+// The type specifies the polymorphic behaviour for different value types via the overloads
+struct __wait_and_get_value
+{
+    template <typename _T>
+    constexpr auto
+    operator()(auto&& /*__event*/, const sycl::buffer<_T>& __buf)
+    {
+        return __buf.get_host_access(sycl::read_only)[0];
+    }
+
+    template <typename _ExecutionPolicy, typename _T>
+    constexpr auto
+    operator()(auto&& __event, const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
+    {
+        return __storage.__wait_and_get_value(__event);
+    }
+
+    template <typename _ExecutionPolicy, typename _T, std::size_t _N>
+    constexpr void
+    operator()(auto&& __event, const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage,
+               std::array<_T, _N>& __arr)
+    {
+        __storage.__wait_and_get_value(__event, __arr);
+    }
+
+    template <typename _T>
+    constexpr auto
+    operator()(auto&& __event, const _T& __val)
+    {
+        __event.wait_and_throw();
+        return __val;
+    }
 };
 
 // Tag __async_mode describe a pattern call mode which should be executed asynchronously
@@ -714,29 +778,6 @@ class __future : private std::tuple<_Args...>
 {
     _Event __my_event;
 
-    template <typename _T>
-    constexpr auto
-    __wait_and_get_value(const sycl::buffer<_T>& __buf)
-    {
-        //according to a contract, returned value is one-element sycl::buffer
-        return __buf.get_host_access(sycl::read_only)[0];
-    }
-
-    template <typename _ExecutionPolicy, typename _T>
-    constexpr auto
-    __wait_and_get_value(const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
-    {
-        return __storage.__wait_and_get_value(__my_event);
-    }
-
-    template <typename _T>
-    constexpr auto
-    __wait_and_get_value(const _T& __val)
-    {
-        wait();
-        return __val;
-    }
-
   public:
     __future(_Event __e, _Args... __args) : std::tuple<_Args...>(__args...), __my_event(__e) {}
     __future(_Event __e, std::tuple<_Args...> __t) : std::tuple<_Args...>(__t), __my_event(__e) {}
@@ -770,13 +811,22 @@ class __future : private std::tuple<_Args...>
 #endif
     }
 
+    template <typename _T, std::size_t _N>
+    void
+    get_values(std::array<_T, _N>& __arr)
+    {
+        static_assert(sizeof...(_Args) > 0);
+        auto& __val = std::get<0>(*this);
+        __wait_and_get_value{}(event(), __val, __arr);
+    }
+
     auto
     get()
     {
         if constexpr (sizeof...(_Args) > 0)
         {
            auto& __val = std::get<0>(*this);
-            return __wait_and_get_value(__val);
+            return __wait_and_get_value{}(event(), __val);
         }
         else
            wait();
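Note: moving `__wait_and_get_value` out of `__future` into a standalone functor lets one overload set serve both the scalar `get()` and the new array-returning `get_values()`. A condensed sketch of the dispatch idea; all types here are stand-ins:

```cpp
#include <array>
#include <cstddef>

struct event_mock { void wait() {} };
template <typename T> struct storage_mock { T value; };

// One functor, several overloads: which one runs depends on what the
// future actually stores (result storage vs. plain value vs. array out-param).
struct wait_and_get_value
{
    template <typename T>
    T operator()(event_mock e, const storage_mock<T>& s) const
    {
        e.wait();
        return s.value;
    }

    template <typename T, std::size_t N>
    void operator()(event_mock e, const storage_mock<T>& s, std::array<T, N>& out) const
    {
        e.wait();
        out.fill(s.value); // stand-in for copying N stored results
    }

    template <typename T>
    T operator()(event_mock e, const T& v) const
    {
        e.wait();
        return v;
    }
};
```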
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
index e0b57260ee0..9755c6affc7 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -624,12 +624,24 @@ struct __copy_by_mask
             // NOTE: we only need this explicit conversion when we have internal::tuple and
             // ::std::tuple as operands, in all the other cases this is not necessary and no conversion
             // is performed(i.e. __typle_type is the same type as its operand).
-            __assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]);
+            if (__out_idx < __out_acc.size())
+            {
+                __assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]);
+                auto __last_out_idx = __wg_sums_ptr[(__n - 1) / __size_per_wg];
+                if (__out_idx + 1 == __last_out_idx)
+                {
+                    __ret_ptr[0] = __item_idx + 1, __ret_ptr[1] = __last_out_idx;
+                }
+            }
+            else if (__out_idx == __out_acc.size())
+            {
+                __ret_ptr[0] = __item_idx, __ret_ptr[1] = __out_idx;
+            }
         }
         if (__item_idx == 0)
         {
             //copy final result to output
-            *__ret_ptr = __wg_sums_ptr[(__n - 1) / __size_per_wg];
+            __ret_ptr[1] = __wg_sums_ptr[(__n - 1) / __size_per_wg];
         }
     }
 };
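Note: inside the kernel each work item now checks its scattered index against the output size before writing, and the item that crosses the boundary records the `{last input index, last output index}` pair. A serial mock of that guard; pointers and names are illustrative:

```cpp
#include <cstddef>

// Serial model of the guarded scatter: out_idx comes from the inclusive
// scan of the mask; ret[0]/ret[1] record where input/output stopped.
inline void guarded_write(const int* in, int* out, std::size_t out_size,
                          std::size_t item_idx, std::size_t out_idx,
                          std::size_t last_out_idx, std::size_t* ret)
{
    if (out_idx < out_size)
    {
        out[out_idx] = in[item_idx];
        if (out_idx + 1 == last_out_idx)
        {
            // This item produced the final copied element.
            ret[0] = item_idx + 1;
            ret[1] = last_out_idx;
        }
    }
    else if (out_idx == out_size)
    {
        // First item past the bound: remember where copying was cut off.
        ret[0] = item_idx;
        ret[1] = out_idx;
    }
}
```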
diff --git a/include/oneapi/dpl/pstl/omp/parallel_scan.h b/include/oneapi/dpl/pstl/omp/parallel_scan.h
index c3bc022cb2e..d4abe26aff0 100644
--- a/include/oneapi/dpl/pstl/omp/parallel_scan.h
+++ b/include/oneapi/dpl/pstl/omp/parallel_scan.h
@@ -79,6 +79,37 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi
     }
 }
 
+template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
+std::pair<_Index, _Index>
+__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
+            _Sp __scan, _Index __n_out)
+{
+    std::pair<_Index, _Index> __res{};
+    if (__m == 1)
+    {
+        if (__initial < __n_out)
+            __res = __scan(__i * __tilesize, __lastsize, __initial, __n_out - __initial);
+    }
+    else
+    {
+        const _Index __k = __split(__m);
+        std::pair<_Index, _Index> __res_1{}, __res_2{};
+        oneapi::dpl::__omp_backend::__parallel_invoke_body(
+            [=, &__res_1] {
+                __res_1 = oneapi::dpl::__omp_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial,
+                                                                  __combine, __scan, __n_out);
+            },
+            // Assumes that __combine never throws.
+            // TODO: Consider adding a requirement for user functors to be constant.
+            [=, &__combine, &__res_2] {
+                __res_2 = oneapi::dpl::__omp_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k,
+                                                                  __lastsize, __combine(__initial, __r[__k - 1]),
+                                                                  __combine, __scan, __n_out);
+            });
+        __res = std::make_pair(__res_1.first + __res_2.first, __res_1.second + __res_2.second);
+    }
+    return __res;
+}
+
 template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
           typename _Ap>
 void
@@ -107,6 +138,35 @@ __parallel_strict_scan_body(_ExecutionPolicy&& __exec, _Index __n, _Tp __initial
                                               __initial, __combine, __scan);
 }
 
+template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan_body(_ExecutionPolicy&& __exec, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine,
+                            _Sp __scan, _Ap __apex, _Index __n_out)
+{
+    _Index __p = omp_get_num_threads();
+    const _Index __slack = 4;
+    _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
+    _Index __m = (__n - 1) / __tilesize;
+    __buffer<_ExecutionPolicy, _Tp> __buf(::std::forward<_ExecutionPolicy>(__exec), __m + 1);
+    _Tp* __r = __buf.get();
+
+    oneapi::dpl::__omp_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize,
+                                          __reduce, __combine);
+
+    std::size_t __k = __m + 1;
+    _Tp __t = __r[__k - 1];
+    while ((__k &= __k - 1))
+    {
+        __t = __combine(__r[__k - 1], __t);
+    }
+
+    auto __res = oneapi::dpl::__omp_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r,
+                                                         __n - __m * __tilesize, __initial, __combine, __scan,
+                                                         __n_out);
+    __apex(__res.first, __res.second);
+}
+
 template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
           typename _Ap>
 void
 __parallel_strict_scan(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&& __exec, _Index __n, _Tp __initial,
@@ -143,6 +203,42 @@ __parallel_strict_scan(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPol
     }
 }
 
+template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&& __exec, _Index __n,
+                       _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, _Index __n_out)
+{
+    if (__n_out == 0)
+        return;
+    else if (__n_out < 0)
+        __n_out = __n;
+
+    if (__n <= __default_chunk_size)
+    {
+        if (__n)
+        {
+            auto __res = __scan(_Index(0), __n, __initial, __n_out);
+            __apex(__res.first, __res.second);
+        }
+        return;
+    }
+
+    if (omp_in_parallel())
+    {
+        oneapi::dpl::__omp_backend::__parallel_strict_scan_body(::std::forward<_ExecutionPolicy>(__exec), __n,
+                                                                __initial, __reduce, __combine, __scan, __apex,
+                                                                __n_out);
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single nowait)
+        {
+            oneapi::dpl::__omp_backend::__parallel_strict_scan_body(::std::forward<_ExecutionPolicy>(__exec), __n,
+                                                                    __initial, __reduce, __combine, __scan, __apex,
+                                                                    __n_out);
+        }
+    }
+}
+
 } // namespace __omp_backend
 } // namespace dpl
 } // namespace oneapi
diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h
index 05b91087078..668f596bb78 100644
--- a/include/oneapi/dpl/pstl/onedpl_config.h
+++ b/include/oneapi/dpl/pstl/onedpl_config.h
@@ -162,8 +162,10 @@
 #    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT _PSTL_PRAGMA_SIMD_EARLYEXIT
 #elif _ONEDPL_EARLYEXIT_PRESENT
 #    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT _ONEDPL_PRAGMA(omp simd early_exit)
+#    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(PRM) _ONEDPL_PRAGMA(omp simd early_exit reduction(PRM))
 #else
 #    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT
+#    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(PRM)
 #endif
 
 #define _ONEDPL_MONOTONIC_PRESENT (__INTEL_COMPILER >= 1800)
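Note: the extended backend entry points thread `__n_out` down to the leaves and report a pair of totals through `__apex` instead of returning a value. A serial stand-in for that callback contract, as the overloads above use it:

```cpp
#include <cstddef>

// scan(i, len, initial, len_out) must return {inputs consumed, outputs written}
// for its subrange; apex(res_in, res_out) receives the grand totals.
template <typename Scan, typename Apex>
void strict_scan_single_block(std::size_t n, std::ptrdiff_t n_out, Scan scan, Apex apex)
{
    if (n_out == 0)
        return; // nothing may be written
    if (n_out < 0)
        n_out = static_cast<std::ptrdiff_t>(n); // negative bound means "no limit" above
    if (n)
    {
        auto res = scan(std::size_t(0), n, std::size_t(0), n_out);
        apex(res.first, res.second);
    }
}
```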
diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h
index 6acd4b617f9..50ab0a2cd01 100644
--- a/include/oneapi/dpl/pstl/parallel_backend_serial.h
+++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h
@@ -86,6 +86,23 @@ __parallel_strict_scan(oneapi::dpl::__internal::__serial_backend_tag, _Execution
         __scan(_Index(0), __n, __initial);
 }
 
+template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __n, _Tp __initial,
+                       _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, _Index __n_out)
+{
+    if (__n_out == 0)
+        return;
+    else if (__n_out < 0)
+        __n_out = __n;
+
+    if (__n)
+    {
+        auto __res = __scan(_Index(0), __n, __initial, __n_out);
+        __apex(__res.first, __res.second);
+    }
+}
+
 template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _UnaryOp, typename _BinaryOp>
 _Tp
 __parallel_transform_scan(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __n, _UnaryOp,
diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h
index 59821e98156..e23b166bd6b 100644
--- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h
+++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h
@@ -337,6 +337,36 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi
     }
 }
 
+template <typename _Index, typename _Tp, typename _Cp, typename _Sp, typename _OutBound>
+std::pair<_Index, _Index>
+__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
+            _Sp __scan, _OutBound __n_out)
+{
+    std::pair<_Index, _Index> __res{};
+    if (__m == 1)
+    {
+        if (__initial < __n_out)
+            __res = __scan(__i * __tilesize, __lastsize, __initial, __n_out - __initial);
+    }
+    else
+    {
+        const _Index __k = __split(__m);
+        auto __start = __combine(__initial, __r[__k - 1]);
+
+        std::pair<_Index, _Index> __res_1{}, __res_2{};
+        tbb::parallel_invoke(
+            [=, &__res_1] {
+                __res_1 = __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine,
+                                                     __scan, __n_out);
+            },
+            // Assumes that __combine never throws.
+            //TODO: Consider adding a requirement for user functors to be constant.
+            [=, &__combine, &__res_2] {
+                __res_2 = __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
+                                                     __start, __combine, __scan, __n_out);
+            });
+        __res = std::make_pair(__res_1.first + __res_2.first, __res_1.second + __res_2.second);
+    }
+    return __res;
+}
+
 // Adapted from Intel(R) Cilk(TM) version from cilkpub.
 // Let i:len denote a counted interval of length n starting at i. s denotes a generalized-sum value.
 // Expected actions of the functors are:
@@ -344,6 +374,7 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi
 //     combine(s1,s2) -> s               -- return merged sum
 //     apex(s)                           -- do any processing necessary between reduce and scan.
 //     scan(i,len,initial)               -- perform scan over i:len starting with initial.
+//     [n_out -- limit for output range]
 // The initial range 0:n is partitioned into consecutive subranges.
 // reduce and scan are each called exactly once per subrange.
 // Thus callers can rely upon side effects in reduce.
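Note: the bounded `__downsweep` sums the `{consumed, written}` pairs of its two halves; the right half still receives its prefix `__combine(__initial, __r[__k - 1])`, and a half whose starting offset is already at or past `__n_out` contributes `{0, 0}`. A serial recursion mirroring that shape, assuming the same leaf contract as above:

```cpp
#include <cstddef>
#include <utility>

using result = std::pair<std::size_t, std::size_t>; // {consumed, written}

// Serial mirror of the bounded downsweep: leaves run only while their
// initial offset is below n_out; sibling results are summed.
template <typename Scan>
result downsweep_bounded(std::size_t i, std::size_t m, std::size_t tile,
                         const std::size_t* r, std::size_t last, std::size_t initial,
                         std::size_t n_out, Scan scan)
{
    if (m == 1)
        return initial < n_out ? scan(i * tile, last, initial, n_out - initial) : result{};
    std::size_t k = m / 2; // stand-in for __split(__m)
    result a = downsweep_bounded(i, k, tile, r, tile, initial, n_out, scan);
    result b = downsweep_bounded(i + k, m - k, tile, r + k, last,
                                 initial + r[k - 1], n_out, scan); // r[k-1]: left-half count
    return {a.first + b.first, a.second + b.second};
}
```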
@@ -391,6 +422,50 @@ __parallel_strict_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPol
     });
 }
 
+template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&& __exec, _Index __n,
+                       _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, _Index __n_out)
+{
+    if (__n_out == 0)
+        return;
+    else if (__n_out < 0)
+        __n_out = __n;
+    tbb::this_task_arena::isolate([=, &__combine]() {
+        if (__n > 1)
+        {
+            _Index __p = tbb::this_task_arena::max_concurrency();
+            const _Index __slack = 4;
+            _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
+            _Index __m = (__n - 1) / __tilesize;
+            __tbb_backend::__buffer<_ExecutionPolicy, _Tp> __buf(__exec, __m + 1);
+            _Tp* __r = __buf.get();
+            __tbb_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce,
+                                     __combine);
+
+            // When __apex is a no-op and __combine has no side effects, a good optimizer
+            // should be able to eliminate all code between here and __apex.
+            // Alternatively, provide a default value for __apex that can be
+            // recognized by metaprogramming that conditionally executes the following.
+            size_t __k = __m + 1;
+            _Tp __t = __r[__k - 1];
+            while ((__k &= __k - 1))
+                __t = __combine(__r[__k - 1], __t);
+
+            auto __res = __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r,
+                                                    __n - __m * __tilesize, __initial, __combine, __scan, __n_out);
+            __apex(__res.first, __res.second);
+            return;
+        }
+        // Fewer than 2 elements in sequence, or out of memory. Handle has single block.
+        if (__n)
+        {
+            auto __res = __scan(_Index(0), __n, __initial, __n_out);
+            __apex(__res.first, __res.second);
+        }
+    });
+}
+
 template <typename _ExecutionPolicy, typename _Index, typename _Up, typename _Tp, typename _Cp, typename _Rp,
           typename _Sp>
 _Tp
 __parallel_transform_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _Index __n, _Up __u,
diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h
index 7e454c80268..c5b1db856a8 100644
--- a/include/oneapi/dpl/pstl/unseq_backend_simd.h
+++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h
@@ -302,6 +302,52 @@ __simd_copy_if(_InputIterator __first, _DifferenceType __n, _OutputIterator __re
     return __result + __cnt;
 }
 
+//const _Size __block_size = __lane_size / sizeof(_Tp);
+template <typename _InputIt, typename _DiffTypeIn, typename _OutputIt, typename _DiffTypeOut,
+          typename _UnaryPredicate>
+std::pair<_InputIt, _OutputIt>
+__simd_copy_if(_InputIt __first, _DiffTypeIn __n, _OutputIt __result, _DiffTypeOut __m,
+               _UnaryPredicate __pred) noexcept
+{
+    _DiffTypeIn __i = 0;
+    _DiffTypeOut __cnt = 0;
+    if (__m >= __n)
+    {
+        _ONEDPL_PRAGMA_SIMD
+        for (__i = 0; __i < __n; ++__i)
+        {
+            _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
+            if (__pred(__first[__i]))
+            {
+                __result[__cnt] = __first[__i];
+                ++__cnt;
+            }
+        }
+    }
+    else // __m < __n
+    {
+        _ONEDPL_PRAGMA_SIMD
+        for (__i = 0; __i < __m; ++__i)
+        {
+            _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
+            if (__pred(__first[__i]))
+            {
+                __result[__cnt] = __first[__i];
+                ++__cnt;
+            }
+        }
+
+        //process the remaining (__n - __m) elements
+        for (__i = __m; __i < __n && __cnt < __m; ++__i)
+        {
+            if (__pred(__first[__i]))
+            {
+                __result[__cnt] = __first[__i];
+                ++__cnt;
+            }
+        }
+    }
+    return {__first + __i, __result + __cnt};
+}
+
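Note: the SIMD variant keeps the vectorized monotonic loop only where it cannot overshoot: when `__m >= __n` the whole input is safe, otherwise the first `__m` inputs are vectorized (they can produce at most `__m` outputs) and the tail falls back to a scalar loop that re-checks the bound. A plain-C++ sketch of that split:

```cpp
#include <cstddef>
#include <utility>

// Model of the two-phase bounded SIMD copy: the first phase is safe to
// vectorize because i < m implies cnt <= m; the second phase is scalar.
template <typename T, typename Pred>
std::pair<std::ptrdiff_t, std::ptrdiff_t>
two_phase_copy_if(const T* in, std::ptrdiff_t n, T* out, std::ptrdiff_t m, Pred pred)
{
    std::ptrdiff_t i = 0, cnt = 0;
    std::ptrdiff_t vec_end = m >= n ? n : m;
    for (; i < vec_end; ++i)      // vectorizable: cnt can never exceed m here
        if (pred(in[i]))
            out[cnt++] = in[i];
    for (; i < n && cnt < m; ++i) // scalar remainder with an explicit bound check
        if (pred(in[i]))
            out[cnt++] = in[i];
    return {i, cnt};
}
```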
 template <typename _InputIterator, typename _DifferenceType, typename _BinaryPredicate>
 _DifferenceType
 __simd_calc_mask_2(_InputIterator __first, _DifferenceType __n, bool* __mask, _BinaryPredicate __pred) noexcept
@@ -332,6 +378,25 @@ __simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, bool* __mask, _U
     return __count;
 }
 
+template <typename _InputIterator, typename _DifferenceType, typename _Bound, typename _UnaryPredicate>
+std::pair<_DifferenceType, _DifferenceType>
+__simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, _Bound __m, bool* __mask,
+                   _UnaryPredicate __pred) noexcept
+{
+    _DifferenceType __count = 0;
+    _DifferenceType __i = 0;
+
+    _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(+ : __count)
+    for (__i = 0; __i < __n; ++__i)
+    {
+        if (__count >= __m)
+            break;
+
+        __mask[__i] = __pred(__first[__i]);
+        __count += __mask[__i];
+    }
+    return {__count, __i};
+}
+
 template <typename _InputIterator, typename _DifferenceType, typename _OutputIterator, typename _Assigner>
 void
 __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, bool* __mask,
@@ -352,6 +417,55 @@ __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator
     }
 }
 
+template <typename _InputIterator, typename _DifferenceType, typename _OutputIterator, typename _Bound,
+          typename _Assigner>
+void
+__simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _Bound __m, bool* __mask,
+                    _Assigner __assigner) noexcept
+{
+    _DifferenceType __cnt = 0;
+    _DifferenceType __i = 0;
+    if (__m >= __n)
+    {
+        _ONEDPL_PRAGMA_SIMD
+        for (__i = 0; __i < __n; ++__i)
+        {
+            if (__mask[__i])
+            {
+                _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
+                {
+                    __assigner(__first + __i, __result + __cnt);
+                    ++__cnt;
+                }
+            }
+        }
+    }
+    else // __m < __n
+    {
+        _ONEDPL_PRAGMA_SIMD
+        for (__i = 0; __i < __m; ++__i)
+        {
+            if (__mask[__i])
+            {
+                _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
+                {
+                    __assigner(__first + __i, __result + __cnt);
+                    ++__cnt;
+                }
+            }
+        }
+
+        //process the remaining (__n - __m) elements
+        for (__i = __m; __i < __n && __cnt < __m; ++__i)
+        {
+            if (__mask[__i])
+            {
+                __assigner(__first + __i, __result + __cnt);
+                ++__cnt;
+            }
+        }
+    }
+}
+
 template <typename _InputIterator, typename _DifferenceType, typename _OutputIterator1, typename _OutputIterator2,
           typename _Assigner>
 void
 __simd_partition_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator1 __out_true,
diff --git a/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp b/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp
index 842d9038b52..d343d0d6ac3 100644
--- a/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp
+++ b/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp
@@ -23,17 +23,25 @@ main()
     namespace dpl_ranges = oneapi::dpl::ranges;
 
     auto copy_if_checker = [](std::ranges::random_access_range auto&& r_in,
-                              std::ranges::random_access_range auto&& r_out, auto&&... args)
+                              std::ranges::random_access_range auto&& r_out, auto pred, auto proj)
     {
-        auto res = std::ranges::copy_if(std::forward<decltype(r_in)>(r_in), std::ranges::begin(r_out),
-                                        std::forward<decltype(args)>(args)...);
         using ret_type = std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<decltype(r_in)>,
                                                      std::ranges::borrowed_iterator_t<decltype(r_out)>>;
-        return ret_type{res.in, res.out};
+
+        auto it_in = std::ranges::begin(r_in);
+        auto it_out = std::ranges::begin(r_out);
+        for (; it_in != std::ranges::end(r_in) && it_out != std::ranges::end(r_out); ++it_in)
+        {
+            if (std::invoke(pred, std::invoke(proj, *it_in)))
+            {
+                *it_out = *it_in;
+                ++it_out;
+            }
+        }
+        return ret_type{it_in, it_out};
     };
 
-    test_range_algo<0, int, data_in_out>{big_sz}(dpl_ranges::copy_if, copy_if_checker, pred);
+    test_range_algo<0, int, data_in_out>{big_sz}(dpl_ranges::copy_if, copy_if_checker, pred, std::identity{});
     test_range_algo<1, int, data_in_out>{}(dpl_ranges::copy_if, copy_if_checker, pred, proj);
     test_range_algo<2, P2, data_in_out>{}(dpl_ranges::copy_if, copy_if_checker, pred, &P2::x);
     test_range_algo<3, P2, data_in_out>{}(dpl_ranges::copy_if, copy_if_checker, pred, &P2::proj);
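Note: the updated checker intentionally stops at whichever range ends first, matching the semantics under test. A hedged usage sketch of the range algorithm with a short output; the header names and `par` policy are assumptions based on common oneDPL usage, not taken from this patch:

```cpp
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <vector>

int main()
{
    namespace dpl_ranges = oneapi::dpl::ranges;
    std::vector<int> in{1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> out(3); // fewer slots than matching elements

    auto res = dpl_ranges::copy_if(oneapi::dpl::execution::par, in, out,
                                   [](int x) { return x % 2 == 0; });
    // res.in points past the last element consumed, res.out == out.end():
    // copying stopped because the output range was full.
}
```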