Skip to content

Commit 81af798

Browse files
committed
Fixing problems with scheduler fast-idle mode
Signed-off-by: Hartmut Kaiser <[email protected]>
1 parent afcbb5b commit 81af798

File tree

11 files changed: 425 additions and 163 deletions.

cmake/HPX_AddModule.cmake

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ function(add_hpx_module libname modulename)
6969
FORCE
7070
)
7171

72-
if(${modulename}_GLOBAL_HEADER_MODULE_GEN OR ${modulename}_MODULE_SOURCE)
72+
if(HPX_WITH_CXX_MODULES AND (${modulename}_GLOBAL_HEADER_MODULE_GEN
73+
OR ${modulename}_MODULE_SOURCE)
74+
)
7375
# Mark the module as exposing C++ modules
7476
set(cxx_modules ${HPX_ENABLED_CXX_MODULES})
7577
list(APPEND cxx_modules ${modulename})
@@ -193,7 +195,7 @@ function(add_hpx_module libname modulename)
193195
set(global_header
194196
"${CMAKE_CURRENT_BINARY_DIR}/include/hpx/modules/${modulename}.hpp"
195197
)
196-
if(${modulename}_GLOBAL_HEADER_MODULE_GEN)
198+
if(HPX_WITH_CXX_MODULES AND ${modulename}_GLOBAL_HEADER_MODULE_GEN)
197199
# generate list of macro headers to #include
198200
list(LENGTH ${modulename}_MACRO_HEADERS macro_headers)
199201
if(macro_headers GREATER 0)
@@ -236,7 +238,7 @@ function(add_hpx_module libname modulename)
236238
)
237239
set(generated_headers ${global_header})
238240

239-
if(${modulename}_GLOBAL_HEADER_MODULE_GEN)
241+
if(HPX_WITH_CXX_MODULES AND ${modulename}_GLOBAL_HEADER_MODULE_GEN)
240242
# collect all standard header files used by this module
241243
set(found_includes)
242244
hpx_collect_std_headers(

libs/core/algorithms/include/hpx/parallel/util/foreach_partitioner.hpp

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,18 @@ namespace hpx::parallel::util::detail {
6969
auto&& shape =
7070
detail::get_bulk_iteration_shape_idx(policy, first, count);
7171

72-
return execution::bulk_async_execute(policy.executor(),
73-
partitioner_iteration<Result, F>{HPX_FORWARD(F, f)},
74-
reshape(HPX_MOVE(shape)));
72+
if constexpr (hpx::is_async_execution_policy_v<ExPolicy>)
73+
{
74+
return execution::bulk_async_execute(policy.executor(),
75+
partitioner_iteration<Result, F>{HPX_FORWARD(F, f)},
76+
reshape(HPX_MOVE(shape)));
77+
}
78+
else
79+
{
80+
return execution::bulk_sync_execute(policy.executor(),
81+
partitioner_iteration<Result, F>{HPX_FORWARD(F, f)},
82+
reshape(HPX_MOVE(shape)));
83+
}
7584
}
7685
else
7786
{
@@ -101,8 +110,8 @@ namespace hpx::parallel::util::detail {
101110

102111
template <typename ExPolicy_, typename FwdIter, typename F1,
103112
typename F2, typename ReShape = hpx::identity>
104-
static decltype(auto) call(ExPolicy_&& policy, FwdIter first,
105-
std::size_t count, F1&& f1, F2&& f2, ReShape&& reshape = ReShape{})
113+
static auto call(ExPolicy_&& policy, FwdIter first, std::size_t count,
114+
F1&& f1, F2&& f2, ReShape&& reshape = ReShape{})
106115
{
107116
// inform parameter traits
108117
using scoped_executor_parameters =
@@ -115,14 +124,28 @@ namespace hpx::parallel::util::detail {
115124
FwdIter last = parallel::detail::next(first, count);
116125
try
117126
{
118-
auto&& items = detail::foreach_partition<Result>(
119-
HPX_FORWARD(ExPolicy_, policy), first, count,
120-
HPX_FORWARD(F1, f1), HPX_FORWARD(ReShape, reshape));
127+
if constexpr (std::is_void_v<decltype(foreach_partition<Result>(
128+
policy, first, count, f1, reshape))>)
129+
{
130+
detail::foreach_partition<Result>(
131+
HPX_FORWARD(ExPolicy_, policy), first, count,
132+
HPX_FORWARD(F1, f1), HPX_FORWARD(ReShape, reshape));
121133

122-
scoped_params.mark_end_of_scheduling();
134+
scoped_params.mark_end_of_scheduling();
123135

124-
return reduce(
125-
HPX_MOVE(items), HPX_FORWARD(F2, f2), HPX_MOVE(last));
136+
return HPX_INVOKE(f2, HPX_MOVE(last));
137+
}
138+
else
139+
{
140+
auto&& items = foreach_partition<Result>(
141+
HPX_FORWARD(ExPolicy_, policy), first, count,
142+
HPX_FORWARD(F1, f1), HPX_FORWARD(ReShape, reshape));
143+
144+
scoped_params.mark_end_of_scheduling();
145+
146+
return reduce(
147+
HPX_MOVE(items), HPX_FORWARD(F2, f2), HPX_MOVE(last));
148+
}
126149
}
127150
catch (...)
128151
{

libs/core/algorithms/include/hpx/parallel/util/result_types.hpp

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -416,13 +416,15 @@ namespace hpx::parallel::util {
416416
namespace detail {
417417

418418
template <typename ZipIter>
419-
in_out_result<typename hpx::tuple_element<0,
420-
typename ZipIter::iterator_tuple_type>::type,
419+
in_out_result<
420+
typename hpx::tuple_element<0,
421+
typename std::decay_t<ZipIter>::iterator_tuple_type>::type,
421422
typename hpx::tuple_element<1,
422-
typename ZipIter::iterator_tuple_type>::type>
423+
typename std::decay_t<ZipIter>::iterator_tuple_type>::type>
423424
get_in_out_result(ZipIter&& zipiter)
424425
{
425-
using iterator_tuple_type = typename ZipIter::iterator_tuple_type;
426+
using iterator_tuple_type =
427+
typename std::decay_t<ZipIter>::iterator_tuple_type;
426428

427429
using result_type = in_out_result<
428430
typename hpx::tuple_element<0, iterator_tuple_type>::type,
@@ -433,11 +435,7 @@ namespace hpx::parallel::util {
433435
}
434436

435437
template <typename ZipIterSender>
436-
// clang-format off
437-
requires (
438-
hpx::execution::experimental::is_sender_v<ZipIterSender>
439-
)
440-
// clang-format on
438+
requires(hpx::execution::experimental::is_sender_v<ZipIterSender>)
441439
decltype(auto) get_in_out_result(ZipIterSender&& zipiter_sender)
442440
{
443441
return hpx::execution::experimental::then(
@@ -498,15 +496,17 @@ namespace hpx::parallel::util {
498496
}
499497

500498
template <typename ZipIter>
501-
in_in_out_result<typename hpx::tuple_element<0,
502-
typename ZipIter::iterator_tuple_type>::type,
499+
in_in_out_result<
500+
typename hpx::tuple_element<0,
501+
typename std::decay_t<ZipIter>::iterator_tuple_type>::type,
503502
typename hpx::tuple_element<1,
504-
typename ZipIter::iterator_tuple_type>::type,
503+
typename std::decay_t<ZipIter>::iterator_tuple_type>::type,
505504
typename hpx::tuple_element<2,
506-
typename ZipIter::iterator_tuple_type>::type>
505+
typename std::decay_t<ZipIter>::iterator_tuple_type>::type>
507506
get_in_in_out_result(ZipIter&& zipiter)
508507
{
509-
using iterator_tuple_type = typename ZipIter::iterator_tuple_type;
508+
using iterator_tuple_type =
509+
typename std::decay_t<ZipIter>::iterator_tuple_type;
510510

511511
using result_type = in_in_out_result<
512512
typename hpx::tuple_element<0, iterator_tuple_type>::type,
@@ -518,11 +518,7 @@ namespace hpx::parallel::util {
518518
}
519519

520520
template <typename ZipIterSender>
521-
// clang-format off
522-
requires (
523-
hpx::execution::experimental::is_sender_v<ZipIterSender>
524-
)
525-
// clang-format on
521+
requires(hpx::execution::experimental::is_sender_v<ZipIterSender>)
526522
decltype(auto) get_in_in_out_result(ZipIterSender&& zipiter_sender)
527523
{
528524
return hpx::execution::experimental::then(

libs/core/algorithms/tests/performance/benchmark_merge.cpp

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,33 @@ struct hpx::execution::experimental::is_executor_parameters<compute_chunk_size>
133133
{
134134
};
135135

136+
// Benchmark helper type that toggles the scheduler's fast_idle_mode around an
// execution: the begin hook adds the mode, the end hook removes it.
// NOTE(review): the end hook removes fast_idle_mode unconditionally; no prior
// state is saved, so a mode that was already set before the begin hook is not
// restored - confirm this is acceptable for the benchmark.
struct enable_fast_idle_mode
137+
{
138+
// Overload for mark_begin_execution_t: the executor argument is unused;
// adds scheduler_mode::fast_idle_mode via hpx::threads::add_scheduler_mode.
template <typename Executor>
139+
friend void tag_override_invoke(
140+
hpx::execution::experimental::mark_begin_execution_t,
141+
enable_fast_idle_mode, Executor&&)
142+
{
143+
hpx::threads::add_scheduler_mode(
144+
hpx::threads::policies::scheduler_mode::fast_idle_mode);
145+
}
146+
147+
// Overload for mark_end_execution_t: the executor argument is unused;
// removes scheduler_mode::fast_idle_mode via
// hpx::threads::remove_scheduler_mode.
template <typename Executor>
148+
friend void tag_override_invoke(
149+
hpx::execution::experimental::mark_end_execution_t,
150+
enable_fast_idle_mode, Executor&&)
151+
{
152+
hpx::threads::remove_scheduler_mode(
153+
hpx::threads::policies::scheduler_mode::fast_idle_mode);
154+
}
155+
};
156+
157+
// Marks enable_fast_idle_mode as an executor-parameters type by deriving the
// trait specialization from std::true_type.
// NOTE(review): presumably this is what allows the object to be passed to
// policy.with(...) further down in this benchmark - confirm.
template <>
158+
struct hpx::execution::experimental::is_executor_parameters<
159+
enable_fast_idle_mode> : std::true_type
160+
{
161+
};
162+
136163
///////////////////////////////////////////////////////////////////////////////
137164
template <typename T>
138165
struct random_to_item_t
@@ -234,7 +261,7 @@ void run_benchmark(std::size_t vector_size1, std::size_t vector_size2,
234261
double const time_seq = run_merge_benchmark_hpx(
235262
test_count, seq, first1, last1, first2, last2, dest);
236263

237-
hpx::this_thread::sleep_for(std::chrono::seconds(1));
264+
hpx::this_thread::sleep_for(std::chrono::milliseconds(200));
238265

239266
std::cout << "--- run_merge_benchmark_par ---" << std::endl;
240267

@@ -251,11 +278,11 @@ void run_benchmark(std::size_t vector_size1, std::size_t vector_size2,
251278
double const time_par = run_merge_benchmark_hpx(
252279
test_count, policy.with(ccs), first1, last1, first2, last2, dest);
253280

254-
std::cout << "--- run_merge_benchmark_par_stackless ---" << std::endl;
255-
256281
HPX_ITT_PAUSE();
257282

258-
hpx::this_thread::sleep_for(std::chrono::seconds(1));
283+
std::cout << "--- run_merge_benchmark_par_stackless ---" << std::endl;
284+
285+
hpx::this_thread::sleep_for(std::chrono::milliseconds(200));
259286

260287
HPX_ITT_RESUME();
261288

@@ -270,6 +297,26 @@ void run_benchmark(std::size_t vector_size1, std::size_t vector_size2,
270297

271298
HPX_ITT_PAUSE();
272299

300+
std::cout << "--- run_merge_benchmark_par_stackless_fast_idle ---"
301+
<< std::endl;
302+
303+
hpx::this_thread::sleep_for(std::chrono::milliseconds(200));
304+
305+
HPX_ITT_RESUME();
306+
307+
double time_par_stackless_fast_idle = 0;
308+
{
309+
enable_fast_idle_mode efim;
310+
auto const stackless_policy =
311+
hpx::execution::experimental::with_stacksize(
312+
policy, hpx::threads::thread_stacksize::nostack);
313+
time_par_stackless_fast_idle = run_merge_benchmark_hpx(test_count,
314+
stackless_policy.with(ccs, efim), first1, last1, first2, last2,
315+
dest);
316+
}
317+
318+
HPX_ITT_PAUSE();
319+
273320
std::cout << "--- run_merge_benchmark_par_fork_join ---" << std::endl;
274321
double time_par_fork_join = 0;
275322
{
@@ -290,6 +337,9 @@ void run_benchmark(std::size_t vector_size1, std::size_t vector_size2,
290337
hpx::util::format_to(std::cout, fmt, "par", time_par) << std::endl;
291338
hpx::util::format_to(std::cout, fmt, "par_stackless", time_par_stackless)
292339
<< std::endl;
340+
hpx::util::format_to(
341+
std::cout, fmt, "par_stackless_fast_idle", time_par_stackless_fast_idle)
342+
<< std::endl;
293343
hpx::util::format_to(std::cout, fmt, "par_fork_join", time_par_fork_join)
294344
<< std::endl;
295345
hpx::util::format_to(std::cout, fmt, "par_unseq", time_par_unseq)

libs/core/algorithms/tests/performance/benchmark_merge_sweep.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,33 @@ struct hpx::execution::experimental::is_executor_parameters<adaptive_chunk_size>
164164
{
165165
};
166166

167+
// Benchmark helper type that toggles the scheduler's fast_idle_mode around an
// execution: the begin hook adds the mode, the end hook removes it.
// NOTE(review): the end hook removes fast_idle_mode unconditionally; no prior
// state is saved, so a mode that was already set before the begin hook is not
// restored - confirm this is acceptable for the benchmark.
struct enable_fast_idle_mode
168+
{
169+
// Overload for mark_begin_execution_t: the executor argument is unused;
// adds scheduler_mode::fast_idle_mode via hpx::threads::add_scheduler_mode.
template <typename Executor>
170+
friend void tag_override_invoke(
171+
hpx::execution::experimental::mark_begin_execution_t,
172+
enable_fast_idle_mode, Executor&&)
173+
{
174+
hpx::threads::add_scheduler_mode(
175+
hpx::threads::policies::scheduler_mode::fast_idle_mode);
176+
}
177+
178+
// Overload for mark_end_execution_t: the executor argument is unused;
// removes scheduler_mode::fast_idle_mode via
// hpx::threads::remove_scheduler_mode.
template <typename Executor>
179+
friend void tag_override_invoke(
180+
hpx::execution::experimental::mark_end_execution_t,
181+
enable_fast_idle_mode, Executor&&)
182+
{
183+
hpx::threads::remove_scheduler_mode(
184+
hpx::threads::policies::scheduler_mode::fast_idle_mode);
185+
}
186+
};
187+
188+
// Marks enable_fast_idle_mode as an executor-parameters type by deriving the
// trait specialization from std::true_type.
// NOTE(review): presumably this is what allows the object to be passed to
// stackless_policy.with(efim) in hpx_main - confirm.
template <>
189+
struct hpx::execution::experimental::is_executor_parameters<
190+
enable_fast_idle_mode> : std::true_type
191+
{
192+
};
193+
167194
///////////////////////////////////////////////////////////////////////////////
168195
template <typename T>
169196
struct random_to_item_t
@@ -461,6 +488,11 @@ int hpx_main(hpx::program_options::variables_map& vm)
461488
run_benchmark(stackless_policy, vector_size1, vector_size2, test_count,
462489
std::random_access_iterator_tag(), alloc, "std::vector (stackless)",
463490
entropy);
491+
492+
enable_fast_idle_mode efim;
493+
run_benchmark(stackless_policy.with(efim), vector_size1, vector_size2,
494+
test_count, std::random_access_iterator_tag(), alloc,
495+
"std::vector (stackless, fast-idle mode)", entropy);
464496
}
465497

466498
{
@@ -481,6 +513,11 @@ int hpx_main(hpx::program_options::variables_map& vm)
481513
run_benchmark(stackless_policy, vector_size1, vector_size2, test_count,
482514
std::random_access_iterator_tag(), alloc,
483515
"hpx::compute::vector (stackless)", entropy);
516+
517+
enable_fast_idle_mode efim;
518+
run_benchmark(stackless_policy.with(efim), vector_size1, vector_size2,
519+
test_count, std::random_access_iterator_tag(), alloc,
520+
"hpx::compute::vector (stackless, fast-idle mode)", entropy);
484521
}
485522

486523
return hpx::local::finalize();

libs/core/executors/include/hpx/executors/detail/hierarchical_spawning.hpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,42 @@ namespace hpx::parallel::execution::detail {
222222
policy, HPX_FORWARD(F, f), shape, HPX_FORWARD(Ts, ts)...);
223223
}
224224

225+
// Bulk execution that returns unwrapped results instead of futures: runs the
// corresponding asynchronous hierarchical helper and passes what it returns
// through hpx::unwrap.
//
// All arguments (desc, pool, first_thread, num_threads,
// hierarchical_threshold, policy, f, shape, ts...) are handed on unchanged to
// the asynchronous helper; f and ts are perfectly forwarded.
//
// Returns whatever hpx::unwrap yields for the helper's result.
// NOTE(review): presumably hpx::unwrap waits for the futures produced by the
// helper, which is what makes this call synchronous - confirm.
template <typename Launch, typename F, typename S, typename... Ts>
226+
decltype(auto) hierarchical_bulk_sync_execute(
227+
hpx::threads::thread_description const& desc,
228+
threads::thread_pool_base* pool, std::size_t first_thread,
229+
std::size_t num_threads, std::size_t hierarchical_threshold,
230+
Launch policy, F&& f, S const& shape, Ts&&... ts)
231+
{
232+
// Result type of invoking f for one shape element; selects the helper.
using result_type = detail::bulk_function_result_t<F, S, Ts...>;
233+
if constexpr (!std::is_void_v<result_type>)
234+
{
235+
// Non-void results: use the value-producing helper.
return hpx::unwrap(hierarchical_bulk_async_execute_helper(desc,
236+
pool, first_thread, num_threads, hierarchical_threshold, policy,
237+
HPX_FORWARD(F, f), shape, HPX_FORWARD(Ts, ts)...));
238+
}
239+
else
240+
{
241+
// Void results: use the dedicated void helper.
return hpx::unwrap(hierarchical_bulk_async_execute_void(desc, pool,
242+
first_thread, num_threads, hierarchical_threshold, policy,
243+
HPX_FORWARD(F, f), shape, HPX_FORWARD(Ts, ts)...));
244+
}
245+
}
246+
247+
// Convenience overload without an explicit thread description: builds one
// from f with the name "hierarchical_bulk_sync_execute" and delegates to the
// overload that takes a thread_description, forwarding all other arguments
// unchanged.
template <typename Launch, typename F, typename S, typename... Ts>
248+
decltype(auto) hierarchical_bulk_sync_execute(
249+
threads::thread_pool_base* pool, std::size_t first_thread,
250+
std::size_t num_threads, std::size_t hierarchical_threshold,
251+
Launch policy, F&& f, S const& shape, Ts&&... ts)
252+
{
253+
// The description is constructed from f before f is forwarded below.
hpx::threads::thread_description const desc(
254+
f, "hierarchical_bulk_sync_execute");
255+
256+
return hierarchical_bulk_sync_execute(desc, pool, first_thread,
257+
num_threads, hierarchical_threshold, policy, HPX_FORWARD(F, f),
258+
shape, HPX_FORWARD(Ts, ts)...);
259+
}
260+
225261
template <typename Launch, typename F, typename S, typename... Ts>
226262
decltype(auto) hierarchical_bulk_async_execute(
227263
hpx::threads::thread_description const& desc,

0 commit comments

Comments
 (0)