Skip to content

Commit 23c51bc

Browse files
committed
Refactor to support OpenMP in table generation and scaling algorithms and use it by default
1 parent 31f5c18 commit 23c51bc

8 files changed

+399
-23
lines changed

CMakeLists.txt

+14-1
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,12 @@ include(GNUInstallDirs)
3636

3737
## setting target API
3838
#
39-
set(ONE4ALL_TARGET_API "stl" CACHE
39+
set(ONE4ALL_TARGET_API "openmp" CACHE
4040
STRING "Choose target API ?")
4141
set_property(CACHE ONE4ALL_TARGET_API PROPERTY STRINGS
4242
"cuda"
4343
"oneapi"
44+
"openmp"
4445
"rocm"
4546
"stl"
4647
)
@@ -69,6 +70,8 @@ elseif(${ONE4ALL_TARGET_API} STREQUAL oneapi)
6970
find_package(IntelDPCPP QUIET REQUIRED)
7071
endif()
7172
find_package(oneDPL REQUIRED)
73+
elseif(${ONE4ALL_TARGET_API} STREQUAL openmp)
74+
find_package(OpenMP REQUIRED)
7275
elseif(${ONE4ALL_TARGET_API} STREQUAL rocm)
7376
include(CheckLanguage)
7477
check_language(HIP)
@@ -90,6 +93,7 @@ elseif(${ONE4ALL_TARGET_API} STREQUAL stl)
9093
find_package(OpenMP REQUIRED)
9194
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
9295
find_package(TBB REQUIRED) # required for C++17 parallel algorithms
96+
add_compile_definitions(USE_TBB=1)
9397
else()
9498
message(FATAL_ERROR "Wrong ONE4ALL_TARGET_API: ${ONE4ALL_TARGET_API}")
9599
endif()
@@ -141,6 +145,15 @@ target_link_libraries(oneapi INTERFACE
141145
oneDPL
142146
)
143147

148+
## defining one4all::openmp target
149+
#
150+
add_library(openmp INTERFACE)
151+
add_library(${PROJECT_NAME}::openmp ALIAS openmp)
152+
target_link_libraries(openmp INTERFACE
153+
${PROJECT_NAME}::${PROJECT_NAME}
154+
OpenMP::OpenMP_CXX
155+
)
156+
144157
## defining one4all::rocm target
145158
#
146159
add_library(rocm INTERFACE)

README.md

+13-3
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@ Updated slides of the above video with more accurate benchmark results are inclu
2626
- [NVIDIA A100 vs. AMD MI210](#nvidia-a100-sxm4-40gb-sm108-vs-amd-instinct-mi210-sm104-higher-is-better)
2727
- [Using *one4all* for new projects](#using-one4all-for-new-projects)
2828
## Features
29-
- Support four target APIs
29+
- Support five target APIs
3030
- CUDA
3131
- oneAPI
32+
- OpenMP
3233
- ROCm
3334
- STL Parallel Algorithms
3435
- All the configurations are automatically done by [CMake](https://cmake.org/)
@@ -52,14 +53,23 @@ On [the Alliance](https://alliancecan.ca/) clusters, you can activate the above
5253
module load cmake googlebenchmark catch2
5354
```
5455

55-
### Building C++17 parallel algorithm version
56-
Parallel STL requires a [TBB](https://github.com/oneapi-src/oneTBB) version between 2018 to 2020 to work.
56+
### Building OpenMP version
57+
By default, the CMake script builds the OpenMP version (i.e. `-DONE4ALL_TARGET_API=openmp`):
5758
```shell
5859
git clone https://github.com/arminms/one4all.git
5960
cd one4all
6061
cmake -S . -B build
6162
cmake --build build -j
6263
```
64+
65+
### Building C++17 parallel algorithm version
66+
Parallel STL requires a [TBB](https://github.com/oneapi-src/oneTBB) version between 2018 and 2020.
67+
```shell
68+
git clone https://github.com/arminms/one4all.git
69+
cd one4all
70+
cmake -S . -B build-stl -DONE4ALL_TARGET_API=stl
71+
cmake --build build-stl -j
72+
```
6373
### Building CUDA version
6474
Requires CUDA version 11 or higher.
6575
```shell

include/one4all/algorithm/generate_table.hpp

+7-5
Original file line numberDiff line numberDiff line change
@@ -224,11 +224,9 @@ inline void generate_table
224224

225225
#else
226226

227-
# include <algorithm>
228-
# include <thread>
229227
# include <omp.h>
230228

231-
namespace one4all {
229+
namespace one4all::openmp {
232230

233231
//----------------------------------------------------------------------------//
234232
// block splitting technique (OpenMP)
@@ -265,17 +263,21 @@ inline void generate_table_bs
265263
}
266264
}
267265

266+
} // end of one4all::openmp namespace
267+
268268
//----------------------------------------------------------------------------//
269269
// set block splitting as the default algorithm using a function alias
270270

271+
namespace one4all {
272+
271273
template
272274
< typename ...ExplicitArgs
273275
, typename... Args
274276
>
275277
inline void generate_table(Args&&... args)
276-
{ generate_table_bs<ExplicitArgs...>(std::forward<Args>(args)...); }
278+
{ openmp::generate_table_bs<ExplicitArgs...>(std::forward<Args>(args)...); }
277279

278-
} // end one4all namespace
280+
} // end of one4all namespace
279281

280282
#endif //__INTEL_LLVM_COMPILER && SYCL_LANGUAGE_VERSION
281283

include/one4all/algorithm/scale_table.hpp

+40-1
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ inline void scale_table
223223

224224
//----------------------------------------------------------------------------//
225225

226-
#else
226+
#elif defined(USE_TBB)
227227

228228
# include <algorithm>
229229
# include <tbb/iterators.h>
@@ -271,6 +271,45 @@ inline void scale_table
271271

272272
} // end of one4all namespace
273273

274+
#else
275+
276+
namespace one4all::openmp {
277+
278+
//----------------------------------------------------------------------------//
279+
280+
template
281+
< typename T
282+
, typename Input1T
283+
, typename Input2T
284+
, typename OutputT
285+
, typename RSizeT
286+
, typename CSizeT
287+
>
288+
inline void scale_table
289+
( Input1T range
290+
, Input2T in
291+
, OutputT out
292+
, RSizeT nr
293+
, CSizeT nc
294+
, T tmin
295+
, T tmax
296+
)
297+
{ auto min = range;
298+
auto max = range + nc;
299+
#pragma omp parallel for
300+
for (size_t i = 0; i < nr * nc; ++i)
301+
{ CSizeT idx = i % nc;
302+
* ( out + i )
303+
= ( *(in + i) - *(min + idx) )
304+
/ ( *(max + idx) - *(min + idx) )
305+
* ( tmax - tmin )
306+
+ tmin
307+
;
308+
}
309+
}
310+
311+
} // end of one4all::openmp namespace
312+
274313
#endif //__INTEL_LLVM_COMPILER && SYCL_LANGUAGE_VERSION
275314

276315
#endif // _ONE4ALL_ALGORITHM_SCALE_TABLE_HPP_

perf/benchmarks_openmp.cpp

+181
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
#include <execution>
2+
#include <vector>
3+
#include <algorithm>
4+
5+
#include <benchmark/benchmark.h>
6+
7+
#include <one4all/pcg/pcg_random.hpp>
8+
#include <one4all/algorithm/generate_table.hpp>
9+
#include <one4all/algorithm/scale_table.hpp>
10+
11+
const unsigned long seed_pi{3141592654};
12+
13+
//----------------------------------------------------------------------------//
14+
// generate_table() algorithm
15+
16+
template <class T>
17+
void generate_table_rs_seq_x8(benchmark::State& st)
18+
{ size_t nr = size_t(st.range());
19+
size_t nc = 8;
20+
std::vector<T> b(nr * nc), bs(nr * nc), r
21+
{ T(-10), T(-5), T(-1), T(0), T(1), T( 5), T(10), T(15) // mins
22+
, T( -5), T(-1), T( 0), T(1), T(5), T(10), T(15), T(20) // maxs
23+
};
24+
25+
for (auto _ : st)
26+
one4all::generate_table_rs<pcg32>
27+
( r.begin()
28+
, b.begin()
29+
, nr
30+
, nc
31+
, seed_pi
32+
);
33+
34+
st.counters["BW (GB/s)"] = benchmark::Counter
35+
( (nr * nc * sizeof(T)) / 1e9
36+
, benchmark::Counter::kIsIterationInvariantRate
37+
);
38+
}
39+
40+
BENCHMARK_TEMPLATE(generate_table_rs_seq_x8, float)
41+
-> RangeMultiplier(2)
42+
-> Range(1<<20, 1<<24)
43+
-> Unit(benchmark::kMillisecond);
44+
45+
BENCHMARK_TEMPLATE(generate_table_rs_seq_x8, double)
46+
-> RangeMultiplier(2)
47+
-> Range(1<<20, 1<<24)
48+
-> Unit(benchmark::kMillisecond);
49+
50+
template <class T>
51+
void generate_table_bs_x8(benchmark::State& st)
52+
{ size_t nr = size_t(st.range());
53+
size_t nc = 8;
54+
std::vector<T> b(nr * nc), bs(nr * nc), r
55+
{ T(-10), T(-5), T(-1), T(0), T(1), T( 5), T(10), T(15) // mins
56+
, T( -5), T(-1), T( 0), T(1), T(5), T(10), T(15), T(20) // maxs
57+
};
58+
59+
for (auto _ : st)
60+
one4all::openmp::generate_table_bs<pcg32>
61+
( r.begin()
62+
, b.begin()
63+
, nr
64+
, nc
65+
, seed_pi
66+
);
67+
68+
st.counters["BW (GB/s)"] = benchmark::Counter
69+
( (nr * nc * sizeof(T)) / 1e9
70+
, benchmark::Counter::kIsIterationInvariantRate
71+
);
72+
}
73+
74+
BENCHMARK_TEMPLATE(generate_table_bs_x8, float)
75+
-> RangeMultiplier(2)
76+
-> Range(1<<20, 1<<24)
77+
-> UseRealTime()
78+
-> Unit(benchmark::kMillisecond);
79+
80+
BENCHMARK_TEMPLATE(generate_table_bs_x8, double)
81+
-> RangeMultiplier(2)
82+
-> Range(1<<20, 1<<24)
83+
-> UseRealTime()
84+
-> Unit(benchmark::kMillisecond);
85+
86+
//----------------------------------------------------------------------------//
87+
// scale_table() algorithm
88+
89+
template <class T>
90+
void scale_table_seq_x8(benchmark::State& st)
91+
{ size_t nr = size_t(st.range());
92+
size_t nc = 8;
93+
std::vector<T> b(nr * nc), bs(nr * nc), r
94+
{ T(-10), T(-5), T(-1), T(0), T(1), T( 5), T(10), T(15) // mins
95+
, T( -5), T(-1), T( 0), T(1), T(5), T(10), T(15), T(20) // maxs
96+
};
97+
98+
one4all::generate_table<pcg32>
99+
( r.begin()
100+
, b.begin()
101+
, nr
102+
, nc
103+
, seed_pi
104+
);
105+
106+
for (auto _ : st)
107+
one4all::scale_table
108+
( r.begin()
109+
, b.begin()
110+
, bs.begin()
111+
, nr
112+
, nc
113+
, T(-1.0), T(1.0)
114+
);
115+
116+
st.counters["BW (GB/s)"] = benchmark::Counter
117+
( (nr * nc * sizeof(T) * 2) / 1e9
118+
, benchmark::Counter::kIsIterationInvariantRate
119+
);
120+
}
121+
122+
BENCHMARK_TEMPLATE(scale_table_seq_x8, float)
123+
-> RangeMultiplier(2)
124+
-> Range(1<<20, 1<<24)
125+
-> Unit(benchmark::kMillisecond);
126+
127+
BENCHMARK_TEMPLATE(scale_table_seq_x8, double)
128+
-> RangeMultiplier(2)
129+
-> Range(1<<20, 1<<24)
130+
-> Unit(benchmark::kMillisecond);
131+
132+
template <class T>
133+
void scale_table_openmp_x8(benchmark::State& st)
134+
{ size_t nr = size_t(st.range());
135+
size_t nc = 8;
136+
std::vector<T> b(nr * nc), bs(nr * nc), r
137+
{ T(-10), T(-5), T(-1), T(0), T(1), T( 5), T(10), T(15) // mins
138+
, T( -5), T(-1), T( 0), T(1), T(5), T(10), T(15), T(20) // maxs
139+
};
140+
141+
one4all::generate_table<pcg32>
142+
( r.begin()
143+
, b.begin()
144+
, nr
145+
, nc
146+
, seed_pi
147+
);
148+
149+
for (auto _ : st)
150+
one4all::openmp::scale_table
151+
( r.begin()
152+
, b.begin()
153+
, bs.begin()
154+
, nr
155+
, nc
156+
, T(-1.0), T(1.0)
157+
);
158+
159+
st.counters["BW (GB/s)"] = benchmark::Counter
160+
( (nr * nc * sizeof(T) * 2) / 1e9
161+
, benchmark::Counter::kIsIterationInvariantRate
162+
| benchmark::Counter::kAvgThreadsRate
163+
);
164+
}
165+
166+
BENCHMARK_TEMPLATE(scale_table_openmp_x8, float)
167+
-> RangeMultiplier(2)
168+
-> Range(1<<20, 1<<24)
169+
-> UseRealTime()
170+
-> Unit(benchmark::kMillisecond);
171+
172+
BENCHMARK_TEMPLATE(scale_table_openmp_x8, double)
173+
-> RangeMultiplier(2)
174+
-> Range(1<<20, 1<<24)
175+
-> UseRealTime()
176+
-> Unit(benchmark::kMillisecond);
177+
178+
//----------------------------------------------------------------------------//
179+
// main()
180+
181+
BENCHMARK_MAIN();

perf/benchmarks_stl.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ void generate_table_bs_x8(benchmark::State& st)
5757
};
5858

5959
for (auto _ : st)
60-
one4all::generate_table_bs<pcg32>
60+
one4all::generate_table<pcg32>
6161
( r.begin()
6262
, b.begin()
6363
, nr

0 commit comments

Comments
 (0)