Skip to content

Commit a26412e

Browse files
authored
Reapply "Add EXECUTORCH_THREADPOOL_SIZE options, default to u… (#14307) (#14842)
This reverts commit 750cba7. Re-applying the better threadpool size defaults from #14090 with the fix from #14838. This gives a 2-4x speedup for many models and platforms (I measured a 4x speedup on M1 with MobileNet V3 + XNNPACK). On high-core-count server platforms (doing evals, for example), this can give a 100x speedup out of the box.
1 parent f32e9fc commit a26412e

File tree

6 files changed

+98
-3
lines changed

6 files changed

+98
-3
lines changed

extension/threadpool/CMakeLists.txt

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,16 @@ if(NOT CMAKE_CXX_STANDARD)
2020
set(CMAKE_CXX_STANDARD 17)
2121
endif()
2222

23+
# Threadpool size specifiers. Mutual exclusion is checked in default.cmake.
24+
# Default to using performance cores if
25+
# EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES isn't set.
26+
set(_threadpool_size_flag)
27+
if(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES)
28+
set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES")
29+
else()
30+
set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES")
31+
endif()
32+
2333
add_library(
2434
extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp
2535
cpuinfo_utils.cpp
@@ -36,7 +46,9 @@ target_include_directories(
3646
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include>
3747
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include>
3848
)
39-
target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL)
49+
target_compile_definitions(
50+
extension_threadpool PUBLIC ET_USE_THREADPOOL ${_threadpool_size_flag}
51+
)
4052
target_compile_options(extension_threadpool PUBLIC ${_common_compile_options})
4153

4254
# Install libraries

extension/threadpool/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def define_common_targets():
2222
name = "threadpool_lib",
2323
srcs = _THREADPOOL_SRCS,
2424
deps = [
25+
":cpuinfo_utils",
2526
"//executorch/runtime/core:core",
2627
"//executorch/runtime/core/portable_type/c10/c10:c10",
2728
],

extension/threadpool/test/threadpool_test.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*/
88

99
#include <executorch/extension/threadpool/threadpool.h>
10+
#include <executorch/runtime/platform/runtime.h>
1011

1112
#include <mutex>
1213
#include <numeric>
@@ -71,6 +72,8 @@ void run_lambda_with_size(
7172
} // namespace
7273

7374
TEST(ThreadPoolTest, ParallelAdd) {
75+
executorch::runtime::runtime_init();
76+
7477
std::vector<int32_t> a, b, c, c_ref;
7578
size_t vector_size = 100;
7679
size_t grain_size = 10;
@@ -111,6 +114,8 @@ TEST(ThreadPoolTest, ParallelAdd) {
111114

112115
// Test parallel reduction where we acquire lock within lambda
113116
TEST(ThreadPoolTest, ParallelReduce) {
117+
executorch::runtime::runtime_init();
118+
114119
std::vector<int32_t> a;
115120
int32_t c = 0, c_ref = 0;
116121
size_t vector_size = 100;
@@ -144,6 +149,8 @@ TEST(ThreadPoolTest, ParallelReduce) {
144149
// Copied from
145150
// caffe2/aten/src/ATen/test/test_thread_pool_guard.cpp
146151
TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
152+
executorch::runtime::runtime_init();
153+
147154
auto threadpool_ptr = ::executorch::extension::threadpool::get_pthreadpool();
148155

149156
ASSERT_NE(threadpool_ptr, nullptr);
@@ -173,6 +180,8 @@ TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
173180
}
174181

175182
TEST(TestNoThreadPoolGuard, TestRunWithGuard) {
183+
executorch::runtime::runtime_init();
184+
176185
const std::vector<int64_t> array = {1, 2, 3};
177186

178187
auto pool = ::executorch::extension::threadpool::get_threadpool();

extension/threadpool/threadpool.cpp

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,34 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <executorch/extension/threadpool/cpuinfo_utils.h>
910
#include <executorch/extension/threadpool/threadpool.h>
1011

1112
#include <algorithm>
1213
#include <memory>
1314

1415
#include <executorch/extension/threadpool/threadpool_guard.h>
1516
#include <executorch/runtime/platform/assert.h>
17+
#include <executorch/runtime/platform/runtime.h>
1618

1719
#include <cpuinfo.h>
1820

21+
// At most one mode should be set.
22+
#if ( \
23+
defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \
24+
defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES))
25+
#error Multiple \
26+
threadpool size specifiers are set.At most one of \
27+
EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES, \
28+
and EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES may be defined.
29+
#endif
30+
31+
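// Default to EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES if no mode is set.
// Note: the CMake build (extension/threadpool/CMakeLists.txt) always defines
// exactly one of the two flags, so this fallback only applies to builds that
// define neither.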
32+
#if !defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \
33+
!defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES)
34+
#define EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES 1
35+
#endif
36+
1937
namespace executorch::extension::threadpool {
2038

2139
#if !(defined(WIN32))
@@ -97,13 +115,21 @@ void ThreadPool::run(
97115
// get_threadpool is not thread safe due to leak_corrupted_threadpool
98116
// Make this part threadsafe: TODO(kimishpatel)
99117
ThreadPool* get_threadpool() {
118+
executorch::runtime::runtime_init();
119+
100120
if (!cpuinfo_initialize()) {
101121
ET_LOG(Error, "cpuinfo initialization failed");
102122
return nullptr; // NOLINT(facebook-hte-NullableReturn)
103123
}
104124

105125
static const int num_threads = ([]() {
106-
int result = cpuinfo_get_processors_count();
126+
#if defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES)
127+
// Use threads=cores.
128+
auto result = cpuinfo_get_processors_count();
129+
#else
130+
// Set threads equal to the number of performance cores.
131+
auto result = ::executorch::extension::cpuinfo::get_num_performant_cores();
132+
#endif
107133

108134
/*
109135
* For llvm-tsan, holding limit for the number of locks for a single thread
@@ -113,9 +139,10 @@ ThreadPool* get_threadpool() {
113139
* tricky to detect if we are running under tsan, for now capping the
114140
* default threadcount to the tsan limit unconditionally.
115141
*/
116-
constexpr int tsan_thread_limit = 63;
142+
constexpr unsigned int tsan_thread_limit = 63;
117143
return std::min(result, tsan_thread_limit);
118144
})();
145+
119146
static auto threadpool = std::make_unique<ThreadPool>(num_threads);
120147

121148
// Inheriting from old threadpool to get around segfault issue

extension/threadpool/threadpool.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,22 @@
1414

1515
#include <pthreadpool.h>
1616

17+
/*
18+
* Threadpool Options:
19+
*
20+
* Threadpool size has a sizable effect on performance. By default, the
21+
* threadpool will be sized according to the number of performance cores. This
22+
* behavior can be overridden with the following build-time options. Note that
23+
* these options are mutually exclusive.
24+
*
25+
* - EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES (flag) - Sizes the threadpool
26+
* equal to the number of performance cores on the system. This is the default
27+
* behavior.
28+
* - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES (flag) - Sizes the threadpool
29+
* equal to the number of logical cores on the system. This is the historical
30+
* behavior.
31+
*/
32+
1733
namespace executorch::extension::threadpool {
1834

1935
class ThreadPool final {

tools/cmake/preset/default.cmake

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,36 @@ define_overridable_option(
183183
${_default_executorch_build_cpuinfo}
184184
)
185185

186+
# Threadpool size options. At most one can be specified. Note that the default
187+
# is managed in threadpool.cpp to allow the user to specify an alternate mode
188+
# without needing to explicitly set the default to off.
189+
define_overridable_option(
190+
EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES
191+
"Set the number of threads used for CPU parallel computation equal to the number of performant CPU cores."
192+
BOOL
193+
OFF
194+
)
195+
define_overridable_option(
196+
EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
197+
"Set the number of threads used for CPU parallel computation equal to the number of logical CPU cores."
198+
BOOL
199+
OFF
200+
)
201+
202+
check_required_options_on(
203+
IF_ON EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES REQUIRES
204+
EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO
205+
)
206+
check_required_options_on(
207+
IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES REQUIRES
208+
EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO
209+
)
210+
211+
check_conflicting_options_on(
212+
IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES CONFLICTS_WITH
213+
EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
214+
)
215+
186216
# TODO(jathu): move this to platform specific presets when created
187217
set(_default_executorch_build_executor_runner ON)
188218
if(APPLE AND "${SDK_NAME}" STREQUAL "iphoneos")

0 commit comments

Comments
 (0)