diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h
index ae9094f721a..8bd2a1a3644 100644
--- a/include/oneapi/dpl/pstl/algorithm_impl.h
+++ b/include/oneapi/dpl/pstl/algorithm_impl.h
@@ -1197,6 +1197,25 @@ __brick_copy_if(_ForwardIterator __first, _ForwardIterator __last, _OutputIterat
     return ::std::copy_if(__first, __last, __result, __pred);
 }
 
+template <typename _ForwardIterator, typename _OutputIterator, typename _UnaryPredicate>
+std::pair<_ForwardIterator, _OutputIterator>
+__brick_copy_if(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result,
+                typename std::iterator_traits<_OutputIterator>::difference_type __m, _UnaryPredicate __pred,
+                /*vector=*/::std::false_type) noexcept
+{
+    for (; __first != __last && __m > 0; ++__first)
+    {
+        const auto& __v = *__first;
+        if (__pred(__v))
+        {
+            *__result = __v;
+            ++__result;
+            --__m;
+        }
+    }
+    return {__first, __result};
+}
+
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _UnaryPredicate>
 _RandomAccessIterator2
 __brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result,
@@ -1206,7 +1225,20 @@ __brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _
 #if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT)
     return __unseq_backend::__simd_copy_if(__first, __last - __first, __result, __pred);
 #else
-    return ::std::copy_if(__first, __last, __result, __pred);
+    return __brick_copy_if(__first, __last, __result, __pred, std::false_type{});
 #endif
 }
 
+template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _UnaryPredicate>
+std::pair<_RandomAccessIterator1, _RandomAccessIterator2>
+__brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result,
+                typename std::iterator_traits<_RandomAccessIterator2>::difference_type __m, _UnaryPredicate __pred,
+                /*vector=*/::std::true_type) noexcept
+{
+#if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT)
+    return __unseq_backend::__simd_copy_if(__first, __last - __first, __result, __m, __pred);
+#else
+    return __brick_copy_if(__first, __last, __result, __m, __pred, std::false_type{});
+#endif
+}
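Note: the new bounded overloads return how far *both* sequences advanced rather than just the output end. A minimal standalone sketch of the same semantics; the name `bounded_copy_if` is illustrative and not part of oneDPL:

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Copy elements satisfying pred until the input is exhausted or m outputs
// have been written; report where both sequences stopped.
template <typename It, typename OutIt, typename Pred>
std::pair<It, OutIt>
bounded_copy_if(It first, It last, OutIt result, std::ptrdiff_t m, Pred pred)
{
    for (; first != last && m > 0; ++first)
    {
        if (pred(*first))
        {
            *result = *first;
            ++result;
            --m;
        }
    }
    return {first, result}; // {next unread input, one past last written output}
}

int main()
{
    std::vector<int> in{1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> out(2); // room for only two results
    auto [it_in, it_out] = bounded_copy_if(in.begin(), in.end(), out.begin(), 2,
                                           [](int x) { return x % 2 == 0; });
    // out == {2, 4}; it_in points at 5, the first element never examined.
}
```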
@@ -1233,6 +1265,28 @@ __brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, bool* __r
     return ::std::make_pair(__count_true, __size - __count_true);
 }
 
+template <typename _DifferenceType, typename _ForwardIterator, typename _Bound, typename _UnaryPredicate>
+std::pair<_DifferenceType, _ForwardIterator>
+__brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, _Bound __m, bool* __restrict __mask,
+                    _UnaryPredicate __pred, /*vector=*/::std::false_type) noexcept
+{
+    auto __count_true = _DifferenceType(0);
+    auto __size = __last - __first;
+
+    static_assert(__is_random_access_iterator_v<_ForwardIterator>,
+                  "Pattern-brick error. Should be a random access iterator.");
+
+    for (; __first != __last && __count_true < __m; ++__first, ++__mask)
+    {
+        *__mask = __pred(*__first);
+        if (*__mask)
+        {
+            ++__count_true;
+        }
+    }
+    return {__count_true, __first};
+}
+
 template <typename _DifferenceType, typename _RandomAccessIterator, typename _UnaryPredicate>
 ::std::pair<_DifferenceType, _DifferenceType>
 __brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, bool* __mask, _UnaryPredicate __pred,
@@ -1242,6 +1296,15 @@ __brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last,
     return ::std::make_pair(__result, (__last - __first) - __result);
 }
 
+template <typename _DifferenceType, typename _RandomAccessIterator, typename _Bound, typename _UnaryPredicate>
+std::pair<_DifferenceType, _RandomAccessIterator>
+__brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, _Bound __m, bool* __mask,
+                    _UnaryPredicate __pred, /*vector=*/::std::true_type) noexcept
+{
+    auto __result = __unseq_backend::__simd_calc_mask_1(__first, __last - __first, __m, __mask, __pred);
+    return {__result.first, __first + __result.second};
+}
+
 template <typename _ForwardIterator, typename _OutputIterator, typename _Assigner>
 void
 __brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, bool* __mask,
@@ -1257,6 +1320,23 @@ __brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputI
     }
 }
 
+template <typename _ForwardIterator, typename _OutputIterator, typename _Bound, typename _Assigner>
+std::pair<_ForwardIterator, _OutputIterator>
+__brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, _Bound __m,
+                     bool* __mask, _Assigner __assigner, /*vector=*/::std::false_type) noexcept
+{
+    for (; __first != __last && __m > 0; ++__first, ++__mask)
+    {
+        if (*__mask)
+        {
+            __assigner(__first, __result);
+            ++__result;
+            --__m;
+        }
+    }
+    return {__first, __result};
+}
+
 template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Assigner>
 void
 __brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result,
@@ -1269,6 +1349,18 @@ __brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __la
 #endif
 }
 
+template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Bound, typename _Assigner>
+auto
+__brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result,
+                     _Bound __m, bool* __restrict __mask, _Assigner __assigner, /*vector=*/::std::true_type) noexcept
+{
+#if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT)
+    return __unseq_backend::__simd_copy_by_mask(__first, __last - __first, __result, __m, __mask, __assigner);
+#else
+    return __internal::__brick_copy_by_mask(__first, __last, __result, __m, __mask, __assigner, ::std::false_type());
+#endif
+}
+
 template <typename _ForwardIterator, typename _OutputIterator1, typename _OutputIterator2>
 void
 __brick_partition_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator1 __out_true,
@@ -1312,6 +1404,16 @@ __pattern_copy_if(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _ForwardIt
     return __internal::__brick_copy_if(__first, __last, __result, __pred, typename _Tag::__is_vector{});
 }
 
+template <typename _Tag, typename _ExecutionPolicy, typename _ForwardIterator, typename _OutputIterator,
+          typename _UnaryPredicate>
+std::pair<_ForwardIterator, _OutputIterator>
+__pattern_copy_if(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _ForwardIterator __last,
+                  _OutputIterator __result, _UnaryPredicate __pred,
+                  typename std::iterator_traits<_OutputIterator>::difference_type __n_out) noexcept
+{
+    static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);
+
+    return __internal::__brick_copy_if(__first, __last, __result, __n_out, __pred, typename _Tag::__is_vector{});
+}
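Note: the serial bricks above keep the classic two-pass structure: `__brick_calc_mask_1` records predicate results and stops counting at the output bound, and `__brick_copy_by_mask` replays the mask. A hedged sketch of that split; the helper names are invented for illustration:

```cpp
#include <cstddef>
#include <utility>

// Phase 1: fill mask[i] = pred(in[i]) until count_true reaches the bound m.
// Returns {number of true entries, number of inputs examined}.
template <typename T, typename Pred>
std::pair<std::ptrdiff_t, std::ptrdiff_t>
calc_mask_bounded(const T* in, std::ptrdiff_t n, std::ptrdiff_t m, bool* mask, Pred pred)
{
    std::ptrdiff_t count_true = 0, i = 0;
    for (; i < n && count_true < m; ++i)
    {
        mask[i] = pred(in[i]);
        if (mask[i])
            ++count_true;
    }
    return {count_true, i};
}

// Phase 2: copy the elements whose mask bit is set, at most m of them.
// Returns {inputs consumed, outputs written}.
template <typename T>
std::pair<std::ptrdiff_t, std::ptrdiff_t>
copy_by_mask_bounded(const T* in, std::ptrdiff_t n, T* out, std::ptrdiff_t m, const bool* mask)
{
    std::ptrdiff_t written = 0, i = 0;
    for (; i < n && written < m; ++i)
        if (mask[i])
            out[written++] = in[i];
    return {i, written};
}
```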
 
 template <typename _IsVector, typename _ExecutionPolicy, typename _RandomAccessIterator1,
           typename _RandomAccessIterator2, typename _UnaryPredicate>
 _RandomAccessIterator2
@@ -1322,6 +1424,7 @@ __pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA
     typedef typename ::std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType;
     const _DifferenceType __n = __last - __first;
+
     if (_DifferenceType(1) < __n)
     {
         __par_backend::__buffer<_ExecutionPolicy, bool> __mask_buf(__exec, __n);
@@ -1349,6 +1452,50 @@ __pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA
     return __internal::__brick_copy_if(__first, __last, __result, __pred, _IsVector{});
 }
 
+template <typename _IsVector, typename _ExecutionPolicy, typename _RandomAccessIterator1,
+          typename _RandomAccessIterator2, typename _UnaryPredicate>
+std::pair<_RandomAccessIterator1, _RandomAccessIterator2>
+__pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first,
+                  _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, _UnaryPredicate __pred,
+                  typename std::iterator_traits<_RandomAccessIterator2>::difference_type __n_out)
+{
+    using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
+
+    typedef typename ::std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType;
+    const _DifferenceType __n = __last - __first;
+
+    if (__n_out < 0)
+        __n_out = __n;
+
+    if (_DifferenceType(1) < __n)
+    {
+        __par_backend::__buffer<_ExecutionPolicy, bool> __mask_buf(__exec, __n);
+        return __internal::__except_handler([&__exec, __n, __first, __result, __pred, &__mask_buf, __n_out]() {
+            bool* __mask = __mask_buf.get();
+            _DifferenceType __res_in{}, __res_out{};
+            __par_backend::__parallel_strict_scan(
+                __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0),
+                [=](_DifferenceType __i, _DifferenceType __len) { // Reduce
+                    return __internal::__brick_calc_mask_1<_DifferenceType>(__first + __i, __first + (__i + __len),
                                                                             __n_out, __mask + __i, __pred, _IsVector{})
+                        .first;
+                },
+                ::std::plus<_DifferenceType>(), // Combine
+                [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial,
+                    _DifferenceType __len_out) { // Scan
+                    auto res = __internal::__brick_copy_by_mask(
+                        __first + __i, __first + (__i + __len), __result + __initial, __len_out, __mask + __i,
+                        [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{});
+                    return std::make_pair(res.first - (__first + __i), res.second - (__result + __initial));
+                },
+                [&__res_in, &__res_out](auto __total_in, auto __total_out) {
+                    __res_in = __total_in;
+                    __res_out = __total_out;
+                },
+                __n_out);
+            return std::make_pair(__first + __res_in, __result + __res_out);
+        });
+    }
+    // trivial sequence - use serial algorithm
+    return __internal::__brick_copy_if(__first, __last, __result, __n_out, __pred, _IsVector{});
+}
+
 //------------------------------------------------------------------------
 // count
 //------------------------------------------------------------------------
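Note: the parallel pattern delegates the bookkeeping to `__parallel_strict_scan` with the extra `__n_out` argument: per chunk, the *reduce* step counts matches (capped by `__n_out`), the exclusive scan of those counts gives each chunk its output offset, and the *scan* step copies at most the remaining output slots. A serial model of that decomposition; chunking and names are illustrative:

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Serial model of the reduce/scan protocol used above: each chunk copies
// into its own slot of the output, never past the n_out bound.
template <typename T, typename Pred>
std::pair<std::size_t, std::size_t>
chunked_copy_if(const std::vector<T>& in, std::vector<T>& out, std::size_t n_out, Pred pred,
                std::size_t chunk = 4)
{
    std::size_t consumed = 0, written = 0;
    for (std::size_t i = 0; i < in.size() && written < n_out; i += chunk)
    {
        std::size_t len = std::min(chunk, in.size() - i);
        // "Scan" step for this chunk: its initial offset is `written`,
        // and at most n_out - written slots remain.
        for (std::size_t j = 0; j < len && written < n_out; ++j, ++consumed)
            if (pred(in[i + j]))
                out[written++] = in[i + j];
    }
    return {consumed, written}; // what __apex receives as the grand totals
}
```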
diff --git a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h
index 55d29a56be8..540e58ecd55 100644
--- a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h
+++ b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h
@@ -420,14 +420,14 @@ __pattern_copy_if_ranges(_Tag __tag, _ExecutionPolicy&& __exec, _InRange&& __in_
     auto __pred_1 = [__pred, __proj](auto&& __val) {
         return std::invoke(__pred, std::invoke(__proj, std::forward<decltype(__val)>(__val)));};
 
-    auto __res_idx = oneapi::dpl::__internal::__pattern_copy_if(__tag, std::forward<_ExecutionPolicy>(__exec),
+    auto __res = oneapi::dpl::__internal::__pattern_copy_if(__tag, std::forward<_ExecutionPolicy>(__exec),
         std::ranges::begin(__in_r), std::ranges::begin(__in_r) + std::ranges::size(__in_r),
-        std::ranges::begin(__out_r), __pred_1) - std::ranges::begin(__out_r);
+        std::ranges::begin(__out_r), __pred_1, std::ranges::size(__out_r));
 
     using __return_type = std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<_InRange>,
                                                       std::ranges::borrowed_iterator_t<_OutRange>>;
-    return __return_type{std::ranges::begin(__in_r) + std::ranges::size(__in_r), std::ranges::begin(__out_r) + __res_idx};
+    return __return_type{__res.first, __res.second};
 }
 
 template <typename _ExecutionPolicy, typename _InRange, typename _OutRange, typename _Pred, typename _Proj>
@@ -435,7 +435,21 @@ auto
 __pattern_copy_if_ranges(__serial_tag</*_IsVector*/ std::false_type>, _ExecutionPolicy&& __exec, _InRange&& __in_r,
                          _OutRange&& __out_r, _Pred __pred, _Proj __proj)
 {
-    return std::ranges::copy_if(std::forward<_InRange>(__in_r), std::ranges::begin(__out_r), __pred, __proj);
+    using __return_type = std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<_InRange>,
+                                                      std::ranges::borrowed_iterator_t<_OutRange>>;
+
+    auto __it_in = std::ranges::begin(__in_r);
+    auto __it_out = std::ranges::begin(__out_r);
+    for (; __it_in != std::ranges::end(__in_r) && __it_out != std::ranges::end(__out_r); ++__it_in)
+    {
+        if (std::invoke(__pred, std::invoke(__proj, *__it_in)))
+        {
+            *__it_out = *__it_in;
+            ++__it_out;
+        }
+    }
+
+    return __return_type{__it_in, __it_out};
 }
 
 //---------------------------------------------------------------------------------------------------------------------
diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
index 65bf99c8777..a609867dee8 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
@@ -901,15 +901,14 @@ __pattern_copy_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato
     if (__first == __last)
         return __result_first;
 
-    _It1DifferenceType __n = __last - __first;
-
+    auto __n = __last - __first;
     auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>();
     auto __buf1 = __keep1(__first, __last);
     auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>();
     auto __buf2 = __keep2(__result_first, __result_first + __n);
 
     auto __res = __par_backend_hetero::__parallel_copy_if(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
-                                                          __buf1.all_view(), __buf2.all_view(), __n, __pred);
+                                                          __buf1.all_view(), __buf2.all_view(), __pred);
 
     ::std::size_t __num_copied = __res.get(); //is a blocking call
     return __result_first + __num_copied;
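Note: with these changes the serial and parallel ranges paths agree on the bounded-output semantics: copying stops at whichever range is exhausted first, and the result carries both stopping points. A usage sketch against the standard library only, mirroring the serial fallback above (not oneDPL-specific):

```cpp
#include <ranges>
#include <vector>

int main()
{
    std::vector<int> in{1, 2, 3, 4, 5, 6};
    std::vector<int> out(2); // smaller than the number of matches

    auto it_in = std::ranges::begin(in);
    auto it_out = std::ranges::begin(out);
    // Stop when either the input or the output range ends.
    for (; it_in != std::ranges::end(in) && it_out != std::ranges::end(out); ++it_in)
    {
        if (*it_in % 2 == 0)
            *it_out++ = *it_in;
    }
    // out == {2, 4}; it_in stopped just past the 4, not at in.end().
}
```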
diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
index da7820b91a2..b03b73c78e7 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
@@ -536,19 +536,22 @@ __pattern_count_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _
 
 template <typename _BackendTag, typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Predicate,
           typename _Assign>
-oneapi::dpl::__internal::__difference_t<_Range2>
+std::pair<oneapi::dpl::__internal::__difference_t<_Range1>, oneapi::dpl::__internal::__difference_t<_Range2>>
 __pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2,
                   _Predicate __pred, _Assign __assign)
 {
-    oneapi::dpl::__internal::__difference_t<_Range2> __n = __rng1.size();
-    if (__n == 0)
-        return 0;
+    using _Index = std::size_t; //TODO
+    _Index __n = __rng1.size();
+    if (__n == 0 || __rng2.empty())
+        return {0, 0};
 
-    auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if(
+    auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if_out_lim(
         _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
-        std::forward<_Range2>(__rng2), __n, __pred, __assign);
+        std::forward<_Range2>(__rng2), __pred, __assign);
 
-    return __res.get(); //is a blocking call
+    std::array<_Index, 2> __idx;
+    __res.get_values(__idx); //a blocking call
+    return {__idx[1], __idx[0]}; //__parallel_copy_if_out_lim returns {last index in output, last index in input}
 }
 
 #if _ONEDPL_CPP20_RANGES_PRESENT
@@ -561,7 +564,7 @@ __pattern_copy_if_ranges(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e
     auto __pred_1 = [__pred, __proj](auto&& __val) {
         return std::invoke(__pred, std::invoke(__proj, std::forward<decltype(__val)>(__val)));};
 
-    auto __res_idx = oneapi::dpl::__internal::__ranges::__pattern_copy_if(__tag,
+    auto __res = oneapi::dpl::__internal::__ranges::__pattern_copy_if(__tag,
         std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::views::all_read(__in_r),
         oneapi::dpl::__ranges::views::all_write(__out_r), __pred_1, oneapi::dpl::__internal::__pstl_assign());
 
     using __return_t = std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<_InRange>,
                                                    std::ranges::borrowed_iterator_t<_OutRange>>;
 
-    return __return_t{std::ranges::begin(__in_r) + std::ranges::size(__in_r), std::ranges::begin(__out_r) + __res_idx};
+    return __return_t{std::ranges::begin(__in_r) + __res.first, std::ranges::begin(__out_r) + __res.second};
 }
 
 #endif //_ONEDPL_CPP20_RANGES_PRESENT
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index c7b72625315..6f72994eca1 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -316,7 +316,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
         // Storage for the results of scan for each workgroup
         using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _Type>;
-        __result_and_scratch_storage_t __result_and_scratch{__exec, 1, __n_groups + 1};
+        __result_and_scratch_storage_t __result_and_scratch{__exec, 2, __n_groups + 1};
 
         _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu);
@@ -1228,13 +1228,14 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _
                                  /*_Inclusive=*/std::true_type{}, __is_unique_pattern);
 }
 
-template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _CreateMaskOp,
-          typename _CopyByMaskOp>
+template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _CreateMaskOp,
+          typename _CopyByMaskOp>
 auto
 __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
-                     _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op,
+                     _InRng&& __in_rng, _OutRng&& __out_rng, _CreateMaskOp __create_mask_op,
                      _CopyByMaskOp __copy_by_mask_op)
 {
+    using _Size = decltype(__out_rng.size());
     using _ReduceOp = std::plus<_Size>;
     using _Assigner = unseq_backend::__scan_assigner;
     using _NoAssign = unseq_backend::__scan_no_assign;
@@ -1248,7 +1249,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag
     _MaskAssigner __add_mask_op;
 
     // temporary buffer to store boolean mask
-    oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n);
+    oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __in_rng.size());
 
     return __parallel_transform_scan_base(
         __backend_tag, std::forward<_ExecutionPolicy>(__exec),
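Note: since `_Size` can no longer be passed in, it is now derived from the ranges themselves: the mask buffer is sized from the input while the scan's size type follows the output. A small sketch of that assumption (names invented, not oneDPL API):

```cpp
#include <utility>

// Toy illustration of where the sizes now come from: the former _Size
// template parameter is replaced by the output range's size type.
template <typename InRng, typename OutRng>
auto scan_copy_sizes(const InRng& in, const OutRng& out)
{
    using Size = decltype(out.size());
    Size mask_elems = static_cast<Size>(in.size()); // mask covers every input element
    Size out_limit = out.size();                    // the output bound
    return std::pair<Size, Size>{mask_elems, out_limit};
}
```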
@@ -1299,7 +1300,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
     using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
 
     return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n,
+                                std::forward<_Range2>(__result),
                                 _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
                                 _CopyOp{_ReduceOp{}, _Assign{}});
 }
@@ -1360,16 +1361,35 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen
     using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;
 
     return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+                                std::forward<_Range2>(__result), _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
 }
 
-template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
-          typename _Assign = oneapi::dpl::__internal::__pstl_assign>
+template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred,
+          typename _Assign = oneapi::dpl::__internal::__pstl_assign>
+auto
+__parallel_copy_if_out_lim(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
+                           _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign = _Assign{})
+{
+    using _Size = decltype(__out_rng.size());
+    using _ReduceOp = std::plus<_Size>;
+    using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
+    using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
+                                                  /*inclusive*/ std::true_type, 1>;
+
+    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
+                                _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+}
+
+template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred,
+          typename _Assign = oneapi::dpl::__internal::__pstl_assign>
 auto
 __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
-                   _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{})
+                   _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign = _Assign{})
 {
+    auto __n = __in_rng.size();
+    using _Size = decltype(__n);
     using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>;
 
     // Next power of 2 greater than or equal to __n
@@ -1413,7 +1433,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
                                        /*inclusive*/ std::true_type, 1>;
 
         return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
-                                    std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
+                                    std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
                                     _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
     }
 }
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 3d9d923d80d..c36032617fe 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -22,7 +22,7 @@
 #include
 #include
 
-#include "../../iterator_impl.h"
+#include "../../iterator_impl.h"
 
 #include "sycl_defs.h"
 #include "execution_sycl_defs.h"
@@ -665,6 +665,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
     _T __get_value(size_t idx = 0) const
     {
+        assert(__result_n > 0);
         assert(idx < __result_n);
         if (__use_USM_host && __supports_USM_device)
         {
@@ -682,6 +683,26 @@
         }
     }
 
+    template <std::size_t _N>
+    void
+    get_values(std::array<_T, _N>& __arr) const
+    {
+        assert(__result_n > 0);
+        assert(_N == __result_n);
+        if (__use_USM_host && __supports_USM_device)
+        {
+            std::copy_n(__result_buf.get(), __result_n, __arr.begin());
+        }
+        else if (__supports_USM_device)
+        {
+            __exec.queue().memcpy(__arr.begin(), __scratch_buf.get() + __scratch_n, __result_n * sizeof(_T)).wait();
+        }
+        else
+        {
+            auto _acc_h = __sycl_buf->get_host_access(sycl::read_only);
+            std::copy_n(_acc_h.begin() + __scratch_n, __result_n, __arr.begin());
+        }
+    }
+
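Note: `get_values` generalizes `__get_value` to read all `__result_n` stored results at once, with the `std::array` size checked against the stored count. A host-only mock of that contract (no SYCL; names invented):

```cpp
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <vector>

// Mock of the storage contract: result_n values live in a buffer, and
// get_values copies exactly that many into a caller-provided array.
struct result_storage_mock
{
    std::vector<int> buf;     // stand-in for the device/host result buffer
    std::size_t result_n = 2; // two results here: {output end, input end}

    template <std::size_t N>
    void get_values(std::array<int, N>& arr) const
    {
        assert(result_n > 0);
        assert(N == result_n); // array size must match the stored result count
        std::copy_n(buf.begin(), result_n, arr.begin());
    }
};

int main()
{
    result_storage_mock s{{42, 7}};
    std::array<int, 2> idx;
    s.get_values(idx); // idx == {42, 7}
}
```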
 template <typename _Event>
 _T
 __wait_and_get_value(_Event&& __event, size_t idx = 0) const
@@ -691,6 +712,49 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
         return __get_value(idx);
     }
 
+    template <typename _Event, std::size_t _N>
+    void
+    __wait_and_get_value(_Event&& __event, std::array<_T, _N>& __arr) const
+    {
+        if (is_USM())
+            __event.wait_and_throw();
+
+        get_values(__arr);
+    }
+};
+
+// The type specifies the polymorphic behaviour for different value types via the overloads
+struct __wait_and_get_value
+{
+    template <typename _T>
+    constexpr auto
+    operator()(auto&& /*__event*/, const sycl::buffer<_T>& __buf)
+    {
+        return __buf.get_host_access(sycl::read_only)[0];
+    }
+
+    template <typename _ExecutionPolicy, typename _T>
+    constexpr auto
+    operator()(auto&& __event, const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
+    {
+        return __storage.__wait_and_get_value(__event);
+    }
+
+    template <typename _ExecutionPolicy, typename _T, std::size_t _N>
+    constexpr void
+    operator()(auto&& __event, const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage,
+               std::array<_T, _N>& __arr)
+    {
+        __storage.__wait_and_get_value(__event, __arr);
+    }
+
+    template <typename _T>
+    constexpr auto
+    operator()(auto&& __event, const _T& __val)
+    {
+        __event.wait_and_throw();
+        return __val;
+    }
 };
 
 // Tag __async_mode describe a pattern call mode which should be executed asynchronously
@@ -714,29 +778,6 @@ class __future : private std::tuple<_Args...>
 {
     _Event __my_event;
 
-    template <typename _T>
-    constexpr auto
-    __wait_and_get_value(const sycl::buffer<_T>& __buf)
-    {
-        //according to a contract, returned value is one-element sycl::buffer
-        return __buf.get_host_access(sycl::read_only)[0];
-    }
-
-    template <typename _ExecutionPolicy, typename _T>
-    constexpr auto
-    __wait_and_get_value(const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage)
-    {
-        return __storage.__wait_and_get_value(__my_event);
-    }
-
-    template <typename _T>
-    constexpr auto
-    __wait_and_get_value(const _T& __val)
-    {
-        wait();
-        return __val;
-    }
-
   public:
     __future(_Event __e, _Args... __args) : std::tuple<_Args...>(__args...), __my_event(__e) {}
     __future(_Event __e, std::tuple<_Args...> __t) : std::tuple<_Args...>(__t), __my_event(__e) {}
@@ -770,13 +811,22 @@ class __future : private std::tuple<_Args...>
 #endif
     }
 
+    template <typename _T, std::size_t _N>
+    void
+    get_values(std::array<_T, _N>& __arr)
+    {
+        static_assert(sizeof...(_Args) > 0);
+        auto& __val = std::get<0>(*this);
+        __wait_and_get_value{}(event(), __val, __arr);
+    }
+
     auto
     get()
     {
         if constexpr (sizeof...(_Args) > 0)
         {
            auto& __val = std::get<0>(*this);
-            return __wait_and_get_value(__val);
+            return __wait_and_get_value{}(event(), __val);
         }
         else
            wait();
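Note: moving `__wait_and_get_value` out of `__future` into a standalone functor lets one overload set serve both the scalar `get()` and the new array-returning `get_values()`. A condensed sketch of the dispatch idea; all types here are stand-ins:

```cpp
#include <array>
#include <cstddef>

struct event_mock { void wait() {} };
template <typename T> struct storage_mock { T value; };

// One functor, several overloads: which one runs depends on what the
// future actually stores (result storage vs. plain value vs. array out-param).
struct wait_and_get_value
{
    template <typename T>
    T operator()(event_mock e, const storage_mock<T>& s) const
    {
        e.wait();
        return s.value;
    }

    template <typename T, std::size_t N>
    void operator()(event_mock e, const storage_mock<T>& s, std::array<T, N>& out) const
    {
        e.wait();
        out.fill(s.value); // stand-in for copying N stored results
    }

    template <typename T>
    T operator()(event_mock e, const T& v) const
    {
        e.wait();
        return v;
    }
};
```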
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
index e0b57260ee0..9755c6affc7 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -624,12 +624,24 @@ struct __copy_by_mask
             // NOTE: we only need this explicit conversion when we have internal::tuple and
             // ::std::tuple as operands, in all the other cases this is not necessary and no conversion
             // is performed(i.e. __typle_type is the same type as its operand).
-            __assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]);
+            if (__out_idx < __out_acc.size())
+            {
+                __assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]);
+                auto __last_out_idx = __wg_sums_ptr[(__n - 1) / __size_per_wg];
+                if (__out_idx + 1 == __last_out_idx)
+                {
+                    __ret_ptr[0] = __item_idx + 1, __ret_ptr[1] = __last_out_idx;
+                }
+            }
+            else if (__out_idx == __out_acc.size())
+            {
+                __ret_ptr[0] = __item_idx, __ret_ptr[1] = __out_idx;
+            }
         }
         if (__item_idx == 0)
         {
             //copy final result to output
-            *__ret_ptr = __wg_sums_ptr[(__n - 1) / __size_per_wg];
+            __ret_ptr[1] = __wg_sums_ptr[(__n - 1) / __size_per_wg];
         }
     }
 };
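Note: inside the kernel each work item now checks its scattered index against the output size before writing, and the item that crosses the boundary records the `{last input index, last output index}` pair. A serial mock of that guard; pointers and names are illustrative:

```cpp
#include <cstddef>

// Serial model of the guarded scatter: out_idx comes from the inclusive
// scan of the mask; ret[0]/ret[1] record where input/output stopped.
inline void guarded_write(const int* in, int* out, std::size_t out_size,
                          std::size_t item_idx, std::size_t out_idx,
                          std::size_t last_out_idx, std::size_t* ret)
{
    if (out_idx < out_size)
    {
        out[out_idx] = in[item_idx];
        if (out_idx + 1 == last_out_idx)
        {
            // This item produced the final copied element.
            ret[0] = item_idx + 1;
            ret[1] = last_out_idx;
        }
    }
    else if (out_idx == out_size)
    {
        // First item past the bound: remember where copying was cut off.
        ret[0] = item_idx;
        ret[1] = out_idx;
    }
}
```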
diff --git a/include/oneapi/dpl/pstl/omp/parallel_scan.h b/include/oneapi/dpl/pstl/omp/parallel_scan.h
index c3bc022cb2e..d4abe26aff0 100644
--- a/include/oneapi/dpl/pstl/omp/parallel_scan.h
+++ b/include/oneapi/dpl/pstl/omp/parallel_scan.h
@@ -79,6 +79,37 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi
     }
 }
 
+template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
+std::pair<_Index, _Index>
+__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
+            _Sp __scan, _Index __n_out)
+{
+    std::pair<_Index, _Index> __res{};
+    if (__m == 1)
+    {
+        if (__initial < __n_out)
+            __res = __scan(__i * __tilesize, __lastsize, __initial, __n_out - __initial);
+    }
+    else
+    {
+        const _Index __k = __split(__m);
+        std::pair<_Index, _Index> __res_1{}, __res_2{};
+        oneapi::dpl::__omp_backend::__parallel_invoke_body(
+            [=, &__res_1] {
+                __res_1 = oneapi::dpl::__omp_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial,
+                                                                  __combine, __scan, __n_out);
+            },
+            // Assumes that __combine never throws.
+            // TODO: Consider adding a requirement for user functors to be constant.
+            [=, &__combine, &__res_2] {
+                __res_2 = oneapi::dpl::__omp_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k,
+                                                                  __lastsize, __combine(__initial, __r[__k - 1]),
+                                                                  __combine, __scan, __n_out);
+            });
+        __res = std::make_pair(__res_1.first + __res_2.first, __res_1.second + __res_2.second);
+    }
+    return __res;
+}
+
 template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
           typename _Ap>
 void
@@ -107,6 +138,35 @@ __parallel_strict_scan_body(_ExecutionPolicy&& __exec, _Index __n, _Tp __initial
                                               __initial, __combine, __scan);
 }
 
+template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan_body(_ExecutionPolicy&& __exec, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine,
+                            _Sp __scan, _Ap __apex, _Index __n_out)
+{
+    _Index __p = omp_get_num_threads();
+    const _Index __slack = 4;
+    _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
+    _Index __m = (__n - 1) / __tilesize;
+    __buffer<_ExecutionPolicy, _Tp> __buf(::std::forward<_ExecutionPolicy>(__exec), __m + 1);
+    _Tp* __r = __buf.get();
+
+    oneapi::dpl::__omp_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize,
+                                          __reduce, __combine);
+
+    std::size_t __k = __m + 1;
+    _Tp __t = __r[__k - 1];
+    while ((__k &= __k - 1))
+    {
+        __t = __combine(__r[__k - 1], __t);
+    }
+
+    auto __res = oneapi::dpl::__omp_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r,
+                                                         __n - __m * __tilesize, __initial, __combine, __scan,
+                                                         __n_out);
+    __apex(__res.first, __res.second);
+}
+
 template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
           typename _Ap>
 void
 __parallel_strict_scan(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&& __exec, _Index __n, _Tp __initial,
@@ -143,6 +203,42 @@ __parallel_strict_scan(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPol
     }
 }
 
+template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&& __exec, _Index __n,
+                       _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, _Index __n_out)
+{
+    if (__n_out == 0)
+        return;
+    else if (__n_out < 0)
+        __n_out = __n;
+
+    if (__n <= __default_chunk_size)
+    {
+        if (__n)
+        {
+            auto __res = __scan(_Index(0), __n, __initial, __n_out);
+            __apex(__res.first, __res.second);
+        }
+        return;
+    }
+
+    if (omp_in_parallel())
+    {
+        oneapi::dpl::__omp_backend::__parallel_strict_scan_body(::std::forward<_ExecutionPolicy>(__exec), __n,
+                                                                __initial, __reduce, __combine, __scan, __apex,
+                                                                __n_out);
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single nowait)
+        {
+            oneapi::dpl::__omp_backend::__parallel_strict_scan_body(::std::forward<_ExecutionPolicy>(__exec), __n,
+                                                                    __initial, __reduce, __combine, __scan, __apex,
+                                                                    __n_out);
+        }
+    }
+}
+
 } // namespace __omp_backend
 } // namespace dpl
 } // namespace oneapi
diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h
index 05b91087078..668f596bb78 100644
--- a/include/oneapi/dpl/pstl/onedpl_config.h
+++ b/include/oneapi/dpl/pstl/onedpl_config.h
@@ -162,8 +162,10 @@
 #    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT _PSTL_PRAGMA_SIMD_EARLYEXIT
 #elif _ONEDPL_EARLYEXIT_PRESENT
 #    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT _ONEDPL_PRAGMA(omp simd early_exit)
+#    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(PRM) _ONEDPL_PRAGMA(omp simd early_exit reduction(PRM))
 #else
 #    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT
+#    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(PRM)
 #endif
 
 #define _ONEDPL_MONOTONIC_PRESENT (__INTEL_COMPILER >= 1800)
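Note: the extended backend entry points thread `__n_out` down to the leaves and report a pair of totals through `__apex` instead of returning a value. A serial stand-in for that callback contract, as the overloads above use it:

```cpp
#include <cstddef>

// scan(i, len, initial, len_out) must return {inputs consumed, outputs written}
// for its subrange; apex(res_in, res_out) receives the grand totals.
template <typename Scan, typename Apex>
void strict_scan_single_block(std::size_t n, std::ptrdiff_t n_out, Scan scan, Apex apex)
{
    if (n_out == 0)
        return; // nothing may be written
    if (n_out < 0)
        n_out = static_cast<std::ptrdiff_t>(n); // negative bound means "no limit" above
    if (n)
    {
        auto res = scan(std::size_t(0), n, std::size_t(0), n_out);
        apex(res.first, res.second);
    }
}
```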
diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h
index 6acd4b617f9..50ab0a2cd01 100644
--- a/include/oneapi/dpl/pstl/parallel_backend_serial.h
+++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h
@@ -86,6 +86,23 @@ __parallel_strict_scan(oneapi::dpl::__internal::__serial_backend_tag, _Execution
         __scan(_Index(0), __n, __initial);
 }
 
+template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __n, _Tp __initial,
+                       _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, _Index __n_out)
+{
+    if (__n_out == 0)
+        return;
+    else if (__n_out < 0)
+        __n_out = __n;
+
+    if (__n)
+    {
+        auto __res = __scan(_Index(0), __n, __initial, __n_out);
+        __apex(__res.first, __res.second);
+    }
+}
+
 template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _UnaryOp, typename _BinaryOp>
 _Tp
 __parallel_transform_scan(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __n, _UnaryOp,
diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h
index 59821e98156..e23b166bd6b 100644
--- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h
+++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h
@@ -337,6 +337,36 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi
     }
 }
 
+template <typename _Index, typename _Tp, typename _Cp, typename _Sp, typename _OutBound>
+std::pair<_Index, _Index>
+__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
+            _Sp __scan, _OutBound __n_out)
+{
+    std::pair<_Index, _Index> __res{};
+    if (__m == 1)
+    {
+        if (__initial < __n_out)
+            __res = __scan(__i * __tilesize, __lastsize, __initial, __n_out - __initial);
+    }
+    else
+    {
+        const _Index __k = __split(__m);
+        auto __start = __combine(__initial, __r[__k - 1]);
+
+        std::pair<_Index, _Index> __res_1{}, __res_2{};
+        tbb::parallel_invoke(
+            [=, &__res_1] {
+                __res_1 = __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine,
+                                                     __scan, __n_out);
+            },
+            // Assumes that __combine never throws.
+            //TODO: Consider adding a requirement for user functors to be constant.
+            [=, &__combine, &__res_2] {
+                __res_2 = __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
+                                                     __start, __combine, __scan, __n_out);
+            });
+        __res = std::make_pair(__res_1.first + __res_2.first, __res_1.second + __res_2.second);
+    }
+    return __res;
+}
+
 // Adapted from Intel(R) Cilk(TM) version from cilkpub.
 // Let i:len denote a counted interval of length n starting at i. s denotes a generalized-sum value.
 // Expected actions of the functors are:
@@ -344,6 +374,7 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi
 //     combine(s1,s2) -> s               -- return merged sum
 //     apex(s)                           -- do any processing necessary between reduce and scan.
 //     scan(i,len,initial)               -- perform scan over i:len starting with initial.
+//     [n_out -- limit for output range]
 // The initial range 0:n is partitioned into consecutive subranges.
 // reduce and scan are each called exactly once per subrange.
 // Thus callers can rely upon side effects in reduce.
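Note: the bounded `__downsweep` sums the `{consumed, written}` pairs of its two halves; the right half still receives its prefix `__combine(__initial, __r[__k - 1])`, and a half whose starting offset is already at or past `__n_out` contributes `{0, 0}`. A serial recursion mirroring that shape, assuming the same leaf contract as above:

```cpp
#include <cstddef>
#include <utility>

using result = std::pair<std::size_t, std::size_t>; // {consumed, written}

// Serial mirror of the bounded downsweep: leaves run only while their
// initial offset is below n_out; sibling results are summed.
template <typename Scan>
result downsweep_bounded(std::size_t i, std::size_t m, std::size_t tile,
                         const std::size_t* r, std::size_t last, std::size_t initial,
                         std::size_t n_out, Scan scan)
{
    if (m == 1)
        return initial < n_out ? scan(i * tile, last, initial, n_out - initial) : result{};
    std::size_t k = m / 2; // stand-in for __split(__m)
    result a = downsweep_bounded(i, k, tile, r, tile, initial, n_out, scan);
    result b = downsweep_bounded(i + k, m - k, tile, r + k, last,
                                 initial + r[k - 1], n_out, scan); // r[k-1]: left-half count
    return {a.first + b.first, a.second + b.second};
}
```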
@@ -391,6 +422,50 @@ __parallel_strict_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPol
     });
 }
 
+template <typename _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&& __exec, _Index __n,
+                       _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, _Index __n_out)
+{
+    if (__n_out == 0)
+        return;
+    else if (__n_out < 0)
+        __n_out = __n;
+    tbb::this_task_arena::isolate([=, &__combine]() {
+        if (__n > 1)
+        {
+            _Index __p = tbb::this_task_arena::max_concurrency();
+            const _Index __slack = 4;
+            _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
+            _Index __m = (__n - 1) / __tilesize;
+            __tbb_backend::__buffer<_ExecutionPolicy, _Tp> __buf(__exec, __m + 1);
+            _Tp* __r = __buf.get();
+            __tbb_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce,
+                                     __combine);
+
+            // When __apex is a no-op and __combine has no side effects, a good optimizer
+            // should be able to eliminate all code between here and __apex.
+            // Alternatively, provide a default value for __apex that can be
+            // recognized by metaprogramming that conditionally executes the following.
+            size_t __k = __m + 1;
+            _Tp __t = __r[__k - 1];
+            while ((__k &= __k - 1))
+                __t = __combine(__r[__k - 1], __t);
+
+            auto __res = __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r,
+                                                    __n - __m * __tilesize, __initial, __combine, __scan, __n_out);
+            __apex(__res.first, __res.second);
+            return;
+        }
+        // Fewer than 2 elements in sequence, or out of memory. Handle has single block.
+        if (__n)
+        {
+            auto __res = __scan(_Index(0), __n, __initial, __n_out);
+            __apex(__res.first, __res.second);
+        }
+    });
+}
+
 template <typename _ExecutionPolicy, typename _Index, typename _Up, typename _Tp, typename _Cp, typename _Rp,
           typename _Sp>
 _Tp
 __parallel_transform_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _Index __n, _Up __u,
diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h
index 7e454c80268..c5b1db856a8 100644
--- a/include/oneapi/dpl/pstl/unseq_backend_simd.h
+++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h
@@ -302,6 +302,52 @@ __simd_copy_if(_InputIterator __first, _DifferenceType __n, _OutputIterator __re
     return __result + __cnt;
 }
 
+//const _Size __block_size = __lane_size / sizeof(_Tp);
+template <typename _InputIt, typename _DiffTypeIn, typename _OutputIt, typename _DiffTypeOut,
+          typename _UnaryPredicate>
+std::pair<_InputIt, _OutputIt>
+__simd_copy_if(_InputIt __first, _DiffTypeIn __n, _OutputIt __result, _DiffTypeOut __m,
+               _UnaryPredicate __pred) noexcept
+{
+    _DiffTypeIn __i = 0;
+    _DiffTypeOut __cnt = 0;
+    if (__m >= __n)
+    {
+        _ONEDPL_PRAGMA_SIMD
+        for (__i = 0; __i < __n; ++__i)
+        {
+            _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
+            if (__pred(__first[__i]))
+            {
+                __result[__cnt] = __first[__i];
+                ++__cnt;
+            }
+        }
+    }
+    else // __m < __n
+    {
+        _ONEDPL_PRAGMA_SIMD
+        for (__i = 0; __i < __m; ++__i)
+        {
+            _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
+            if (__pred(__first[__i]))
+            {
+                __result[__cnt] = __first[__i];
+                ++__cnt;
+            }
+        }
+
+        //process the remaining (__n - __m) elements
+        for (__i = __m; __i < __n && __cnt < __m; ++__i)
+        {
+            if (__pred(__first[__i]))
+            {
+                __result[__cnt] = __first[__i];
+                ++__cnt;
+            }
+        }
+    }
+    return {__first + __i, __result + __cnt};
+}
+
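Note: the SIMD variant keeps the vectorized monotonic loop only where it cannot overshoot: when `__m >= __n` the whole input is safe, otherwise the first `__m` inputs are vectorized (they can produce at most `__m` outputs) and the tail falls back to a scalar loop that re-checks the bound. A plain-C++ sketch of that split:

```cpp
#include <cstddef>
#include <utility>

// Model of the two-phase bounded SIMD copy: the first phase is safe to
// vectorize because i < m implies cnt <= m; the second phase is scalar.
template <typename T, typename Pred>
std::pair<std::ptrdiff_t, std::ptrdiff_t>
two_phase_copy_if(const T* in, std::ptrdiff_t n, T* out, std::ptrdiff_t m, Pred pred)
{
    std::ptrdiff_t i = 0, cnt = 0;
    std::ptrdiff_t vec_end = m >= n ? n : m;
    for (; i < vec_end; ++i)      // vectorizable: cnt can never exceed m here
        if (pred(in[i]))
            out[cnt++] = in[i];
    for (; i < n && cnt < m; ++i) // scalar remainder with an explicit bound check
        if (pred(in[i]))
            out[cnt++] = in[i];
    return {i, cnt};
}
```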
 template <typename _InputIterator, typename _DifferenceType, typename _BinaryPredicate>
 _DifferenceType
 __simd_calc_mask_2(_InputIterator __first, _DifferenceType __n, bool* __mask, _BinaryPredicate __pred) noexcept
@@ -332,6 +378,25 @@ __simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, bool* __mask, _U
     return __count;
 }
 
+template <typename _InputIterator, typename _DifferenceType, typename _Bound, typename _UnaryPredicate>
+std::pair<_DifferenceType, _DifferenceType>
+__simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, _Bound __m, bool* __mask,
+                   _UnaryPredicate __pred) noexcept
+{
+    _DifferenceType __count = 0;
+    _DifferenceType __i = 0;
+
+    _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(+ : __count)
+    for (__i = 0; __i < __n; ++__i)
+    {
+        if (__count >= __m)
+            break;
+
+        __mask[__i] = __pred(__first[__i]);
+        __count += __mask[__i];
+    }
+    return {__count, __i};
+}
+
 template <typename _InputIterator, typename _DifferenceType, typename _OutputIterator, typename _Assigner>
 void
 __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, bool* __mask,
@@ -352,6 +417,55 @@ __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator
     }
 }
 
+template <typename _InputIterator, typename _DifferenceType, typename _OutputIterator, typename _Bound,
+          typename _Assigner>
+void
+__simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _Bound __m, bool* __mask,
+                    _Assigner __assigner) noexcept
+{
+    _DifferenceType __cnt = 0;
+    _DifferenceType __i = 0;
+    if (__m >= __n)
+    {
+        _ONEDPL_PRAGMA_SIMD
+        for (__i = 0; __i < __n; ++__i)
+        {
+            if (__mask[__i])
+            {
+                _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
+                {
+                    __assigner(__first + __i, __result + __cnt);
+                    ++__cnt;
+                }
+            }
+        }
+    }
+    else // __m < __n
+    {
+        _ONEDPL_PRAGMA_SIMD
+        for (__i = 0; __i < __m; ++__i)
+        {
+            if (__mask[__i])
+            {
+                _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
+                {
+                    __assigner(__first + __i, __result + __cnt);
+                    ++__cnt;
+                }
+            }
+        }
+
+        //process the remaining (__n - __m) elements
+        for (__i = __m; __i < __n && __cnt < __m; ++__i)
+        {
+            if (__mask[__i])
+            {
+                __assigner(__first + __i, __result + __cnt);
+                ++__cnt;
+            }
+        }
+    }
+}
+
 template <typename _InputIterator, typename _DifferenceType, typename _OutputIterator1, typename _OutputIterator2,
           typename _Assigner>
 void
 __simd_partition_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator1 __out_true,
diff --git a/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp b/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp
index 842d9038b52..d343d0d6ac3 100644
--- a/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp
+++ b/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp
@@ -23,17 +23,25 @@ main()
     namespace dpl_ranges = oneapi::dpl::ranges;
 
     auto copy_if_checker = [](std::ranges::random_access_range auto&& r_in,
-                              std::ranges::random_access_range auto&& r_out, auto&&... args)
+                              std::ranges::random_access_range auto&& r_out, auto pred, auto proj)
     {
-        auto res = std::ranges::copy_if(std::forward<decltype(r_in)>(r_in), std::ranges::begin(r_out),
-                                        std::forward<decltype(args)>(args)...);
         using ret_type = std::ranges::copy_if_result<std::ranges::borrowed_iterator_t<decltype(r_in)>,
                                                      std::ranges::borrowed_iterator_t<decltype(r_out)>>;
-        return ret_type{res.in, res.out};
+
+        auto it_in = std::ranges::begin(r_in);
+        auto it_out = std::ranges::begin(r_out);
+        for (; it_in != std::ranges::end(r_in) && it_out != std::ranges::end(r_out); ++it_in)
+        {
+            if (std::invoke(pred, std::invoke(proj, *it_in)))
+            {
+                *it_out = *it_in;
+                ++it_out;
+            }
+        }
+        return ret_type{it_in, it_out};
     };
 
-    test_range_algo<0, int, data_in_out>{big_sz}(dpl_ranges::copy_if, copy_if_checker, pred);
+    test_range_algo<0, int, data_in_out>{big_sz}(dpl_ranges::copy_if, copy_if_checker, pred, std::identity{});
     test_range_algo<1, int, data_in_out>{}(dpl_ranges::copy_if, copy_if_checker, pred, proj);
     test_range_algo<2, P2, data_in_out>{}(dpl_ranges::copy_if, copy_if_checker, pred, &P2::x);
     test_range_algo<3, P2, data_in_out>{}(dpl_ranges::copy_if, copy_if_checker, pred, &P2::proj);
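Note: the updated checker intentionally stops at whichever range ends first, matching the semantics under test. A hedged usage sketch of the range algorithm with a short output; the header names and `par` policy are assumptions based on common oneDPL usage, not taken from this patch:

```cpp
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <vector>

int main()
{
    namespace dpl_ranges = oneapi::dpl::ranges;
    std::vector<int> in{1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> out(3); // fewer slots than matching elements

    auto res = dpl_ranges::copy_if(oneapi::dpl::execution::par, in, out,
                                   [](int x) { return x % 2 == 0; });
    // res.in points past the last element consumed, res.out == out.end():
    // copying stopped because the output range was full.
}
```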