Skip to content

Commit 9594623

Browse files
AntonRydahl authored and ldionne committed
Adding OpenMP Offloading Backend for C++ Parallel Algorithms
1 parent d4a0154 commit 9594623

31 files changed

+1532
-2
lines changed

.github/workflows/libcxx-build-and-test.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ jobs:
153153
'generic-no-wide-characters',
154154
'generic-no-rtti',
155155
'generic-optimized-speed',
156+
'generic-pstl-openmp',
156157
'generic-static',
157158
# TODO Find a better place for the benchmark and bootstrapping builds to live. They're either very expensive
158159
# or don't provide much value since the benchmark run results are too noisy on the bots.

libcxx/CMakeLists.txt

+10-2
Original file line numberDiff line numberDiff line change
@@ -301,10 +301,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
301301
This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
302302

303303
if (LIBCXX_ENABLE_THREADS)
304-
set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
304+
set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
305305
else()
306-
set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
306+
set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
307307
endif()
308+
set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std_thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
308309

309310
# Misc options ----------------------------------------------------------------
310311
# FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -571,6 +572,11 @@ function(cxx_add_basic_build_flags target)
571572
endif()
572573
endif()
573574
target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
575+
576+
# If the PSTL backend depends on OpenMP, we must enable the OpenMP tool chain
577+
if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
578+
target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
579+
endif()
574580
endfunction()
575581

576582
# Exception flags =============================================================
@@ -800,6 +806,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
800806
config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
801807
elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
802808
config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
809+
elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
810+
config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
803811
else()
804812
message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
805813
Valid backends are: serial, std_thread, libdispatch and openmp
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")

libcxx/docs/BuildingLibcxx.rst

+11
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,17 @@ libc++ Feature Options
424424
provided, this header will be included by the library, replacing the
425425
default assertion handler.
426426

427+
.. option:: LIBCXX_PSTL_BACKEND:STRING
428+
429+
**Default**:: ``"serial"``
430+
431+
**Values**:: ``serial``, ``std_thread``, ``libdispatch``, ``openmp``
432+
433+
Select the desired backend for C++ parallel algorithms. All four options can
434+
target multi-core CPU architectures, and ``openmp`` can additionally target
435+
GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
436+
later.
437+
427438

428439
libc++ ABI Feature Options
429440
--------------------------

libcxx/docs/UsingLibcxx.rst

+104
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,110 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
364364
* You are using allocator, which does not call destructor during deallocation.
365365
* You are aware that memory allocated with an allocator may be accessed, even when unused by container.
366366

367+
Offloading C++ Parallel Algorithms to GPUs
368+
------------------------------------------
369+
370+
Experimental support for GPU offloading has been added to ``libc++``. The
371+
implementation uses OpenMP target offloading to leverage GPU compute resources.
372+
The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
373+
However, the implementation only supports contiguous iterators, such as
374+
iterators for ``std::vector`` or ``std::array``.
375+
To enable the OpenMP offloading backend it must be selected with
376+
``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
377+
compiling a program, the user must specify the command line options
378+
``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
379+
enabled, please read
380+
`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_
381+
You may also want to visit
382+
`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_
383+
384+
Example
385+
~~~~~~~
386+
387+
The following is an example of offloading vector addition to a GPU using our
388+
standard library extension. It implements the classical vector addition from
389+
BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
390+
both used as an input and an output iterator in this example.
391+
392+
.. code-block:: cpp
393+
394+
#include <algorithm>
395+
#include <execution>
396+
397+
template <typename T1, typename T2, typename T3>
398+
void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
399+
std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
400+
y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
401+
}
402+
403+
The execution policy ``std::execution::par_unseq`` states that the algorithm's
404+
execution may be parallelized, vectorized, and migrated across threads. This is
405+
the only execution mode that is safe to offload to GPUs, and for all other
406+
execution modes the algorithms will execute on the CPU.
407+
Special attention must be paid to the lambda captures when enabling GPU
408+
offloading. If the lambda captures by reference, the user must manually map the
409+
variables to the device. If capturing by reference, the above example could
410+
be implemented in the following way.
411+
412+
.. code-block:: cpp
413+
414+
template <typename T1, typename T2, typename T3>
415+
void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
416+
#pragma omp target data map(to : a)
417+
std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
418+
y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
419+
}
420+
421+
However, if unified shared memory (USM) is enabled, no additional data mapping
422+
is necessary when capturing ``y`` by reference.
423+
424+
Compiling functions for GPUs with OpenMP
425+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
426+
427+
The C++ standard defines that all accesses to memory are inside a single address
428+
space. However, discrete GPU systems have distinct address spaces. A single
429+
address space can be emulated if your system supports unified shared memory.
430+
However, many discrete GPU systems do not, and in those cases it is important to
431+
pass device function pointers to the parallel algorithms. Below is an example of
432+
how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
433+
used to mark that a function should be compiled for both host and device.
434+
435+
.. code-block:: cpp
436+
437+
// This function computes the squared difference of two floating points
438+
float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
439+
440+
// Declare that the function must be compiled for both host and device
441+
#pragma omp declare target indirect to(squared)
442+
443+
int main() {
444+
std::vector<float> a(100, 1.0);
445+
std::vector<float> b(100, 1.25);
446+
447+
// Pass the host function pointer to the parallel algorithm and let OpenMP
448+
// translate it to the device function pointer internally
449+
float sum =
450+
std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
451+
b.begin(), 0.0f, std::plus{}, squared);
452+
453+
// Validate that the result is approximately 6.25
454+
assert(std::abs(sum - 6.25f) < 1e-10);
455+
return 0;
456+
}
457+
458+
Without unified shared memory, the above example will not work if the host
459+
function pointer ``squared`` is passed to the parallel algorithm.
460+
461+
Important notes about exception handling
462+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
463+
464+
GPU architectures do not support exception handling. If compiling a program
465+
containing parallel algorithms with current versions of Clang, a program with
466+
exceptions in offloaded code regions will compile, but the program will
467+
terminate if an exception is thrown on the device. This does not conform with
468+
the C++ standard and exception handling on GPUs will hopefully be better
469+
supported in future releases of LLVM.
470+
367471
Platform specific behavior
368472
==========================
369473

libcxx/include/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,7 @@ set(files
579579
__pstl/backend_fwd.h
580580
__pstl/backends/default.h
581581
__pstl/backends/libdispatch.h
582+
__pstl/backends/openmp.h
582583
__pstl/backends/serial.h
583584
__pstl/backends/std_thread.h
584585
__pstl/cpu_algos/any_of.h

libcxx/include/__config_site.in

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#cmakedefine _LIBCPP_PSTL_BACKEND_SERIAL
3636
#cmakedefine _LIBCPP_PSTL_BACKEND_STD_THREAD
3737
#cmakedefine _LIBCPP_PSTL_BACKEND_LIBDISPATCH
38+
#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP
3839

3940
// Hardening.
4041
#cmakedefine _LIBCPP_HARDENING_MODE_DEFAULT @_LIBCPP_HARDENING_MODE_DEFAULT@

libcxx/include/__pstl/backend.h

+4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ _LIBCPP_PUSH_MACROS
2828
#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
2929
# include <__pstl/backends/default.h>
3030
# include <__pstl/backends/libdispatch.h>
31+
#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
32+
# include <__pstl/backends/default.h>
33+
# include <__pstl/backends/openmp.h>
34+
# include <__pstl/backends/std_thread.h>
3135
#endif
3236

3337
_LIBCPP_POP_MACROS

libcxx/include/__pstl/backend_fwd.h

+4
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ struct __backend_configuration;
4747

4848
struct __default_backend_tag;
4949
struct __libdispatch_backend_tag;
50+
struct __openmp_backend_tag;
5051
struct __serial_backend_tag;
5152
struct __std_thread_backend_tag;
5253

@@ -56,6 +57,9 @@ using __current_configuration = __backend_configuration<__serial_backend_tag, __
5657
using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
5758
#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
5859
using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
60+
#elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
61+
using __current_configuration =
62+
__backend_configuration<__openmp_backend_tag, __std_thread_backend_tag, __default_backend_tag>;
5963
#else
6064

6165
// ...New vendors can add parallel backends here...

0 commit comments

Comments
 (0)