Skip to content

Commit 816c28e

Browse files
committed
Add hipSYCL support
Fix bad new/free pairs
1 parent ad2f093 commit 816c28e

14 files changed

+430
-221
lines changed

CMakeLists.txt

+9-3
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,13 @@ if (NOT MODEL)
2929
# set(CUDA_ARCH sm_60)
3030

3131
# set(MODEL std-indices)
32-
set(MODEL std-indices)
32+
33+
set(MODEL sycl-usm)
34+
set(SYCL_COMPILER ONEAPI-ICPX)
35+
36+
set(ENV{HIPSYCL_TARGETS} omp.accelerated)
37+
set(SYCL_COMPILER HIPSYCL)
38+
set(SYCL_COMPILER_DIR /opt/hipsycl/485ea80/)
3339
# set(KOKKOS_IN_TREE /home/tom/Downloads/kokkos-4.0.01/)
3440
# set(Kokkos_ENABLE_OPENMP ON)
3541

@@ -95,8 +101,8 @@ endif ()
95101
## Flags for debugging only, enable for development (ASan only works on few models)
96102
set(SANITIZE OFF)
97103
if (SANITIZE)
98-
set(CXX_EXTRA_FLAGS -fsanitize=address -fsanitize=undefined)
99-
set(CXX_EXTRA_LINK_FLAGS -fsanitize=address -fsanitize=undefined)
104+
set(CXX_EXTRA_FLAGS -fsanitize=address -fsanitize=undefined -g )
105+
set(CXX_EXTRA_LINK_FLAGS -fsanitize=address -fsanitize=undefined -g)
100106
endif ()
101107

102108

src/sycl-acc/cg.cpp

+20-19
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,9 @@ void cg_init_others(const int x, //
8686
auto p = pBuff.get_access<access::mode::read_write>(h);
8787
auto kx = kxBuff.get_access<access::mode::read>(h);
8888
auto ky = kyBuff.get_access<access::mode::read>(h);
89-
h.parallel_for<class cg_init_others>(
90-
range<1>(x * y), //
91-
sycl::reduction(rro_temp, h, {}, sycl::plus<>(), sycl::property::reduction::initialize_to_identity()), //
89+
h.parallel_for<class cg_init_others>( //
90+
range<1>(x * y), //
91+
reduction_shim(rro_temp, h, {}, sycl::plus<double>()), //
9292
[=](item<1> item, auto &acc) {
9393
const auto kk = item[0] % x;
9494
const auto jj = item[0] / x;
@@ -124,19 +124,20 @@ void cg_calc_w(const int x, //
124124
auto p = pBuff.get_access<access::mode::read>(h);
125125
auto kx = kxBuff.get_access<access::mode::read>(h);
126126
auto ky = kyBuff.get_access<access::mode::read>(h);
127-
h.parallel_for<class cg_calc_w>(range<1>(x * y), //
128-
sycl::reduction(pw_temp, h, {}, sycl::plus<>(), sycl::property::reduction::initialize_to_identity()), //
129-
[=](item<1> item, auto &acc) {
130-
const auto kk = item[0] % x;
131-
const auto jj = item[0] / x;
132-
if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) {
133-
// smvp uses kx and ky and index
134-
int index = item[0];
135-
const double smvp = tealeaf_SMVP(p);
136-
w[item[0]] = smvp;
137-
acc += w[item[0]] * p[item[0]];
138-
}
139-
});
127+
h.parallel_for<class cg_calc_w>( //
128+
range<1>(x * y), //
129+
reduction_shim(pw_temp, h, {}, sycl::plus<double>()), //
130+
[=](item<1> item, auto &acc) {
131+
const auto kk = item[0] % x;
132+
const auto jj = item[0] / x;
133+
if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) {
134+
// smvp uses kx and ky and index
135+
int index = item[0];
136+
const double smvp = tealeaf_SMVP(p);
137+
w[item[0]] = smvp;
138+
acc += w[item[0]] * p[item[0]];
139+
}
140+
});
140141
});
141142
#ifdef ENABLE_PROFILING
142143
device_queue.wait_and_throw();
@@ -162,9 +163,9 @@ void cg_calc_ur(const int x, //
162163
auto p = pBuff.get_access<access::mode::read>(h);
163164
auto u = uBuff.get_access<access::mode::read_write>(h);
164165
auto r = rBuff.get_access<access::mode::read_write>(h);
165-
h.parallel_for<class cg_calc_ur>(
166-
range<1>(x * y), //
167-
sycl::reduction(rrn_temp, h, {}, sycl::plus<>(), sycl::property::reduction::initialize_to_identity()), //
166+
h.parallel_for<class cg_calc_ur>( //
167+
range<1>(x * y), //
168+
reduction_shim(rrn_temp, h, {}, sycl::plus<double>()), //
168169
[=](item<1> item, auto &acc) {
169170
const auto kk = item[0] % x;
170171
const auto jj = item[0] / x;

src/sycl-acc/jacobi.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ void jacobi_iterate(const int x, //
6666
auto kx = kxBuff.get_access<access::mode::read>(h);
6767
auto ky = kyBuff.get_access<access::mode::read>(h);
6868

69-
h.parallel_for<class jacobi_iterate>(
70-
range<1>(x * y), //
71-
sycl::reduction(error_temp, h, {}, sycl::plus<>(), sycl::property::reduction::initialize_to_identity()), //
69+
h.parallel_for<class jacobi_iterate>( //
70+
range<1>(x * y), //
71+
reduction_shim(error_temp, h, {}, sycl::plus<double>()), //
7272
[=](item<1> item, auto &acc) {
7373
const auto kk = item[0] % x;
7474
const auto jj = item[0] / x;

src/sycl-acc/kernel_initialise.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -283,11 +283,11 @@ void run_kernel_initialise(Chunk *chunk, Settings &settings, int comms_lr_len, i
283283
allocate_buffer(&(chunk->cheby_betas), settings.max_iters, 1);
284284
}
285285

286-
void run_kernel_finalise(Chunk *chunk, Settings &settings) {
287-
delete chunk->cg_alphas;
288-
delete chunk->cg_betas;
289-
delete chunk->cheby_alphas;
290-
delete chunk->cheby_betas;
286+
void run_kernel_finalise(Chunk *chunk, Settings &) {
287+
delete[] chunk->cg_alphas;
288+
delete[] chunk->cg_betas;
289+
delete[] chunk->cheby_alphas;
290+
delete[] chunk->cheby_betas;
291291

292292
delete chunk->density0;
293293
delete chunk->density;

src/sycl-acc/pack_halos.cpp

+56-11
Original file line numberDiff line numberDiff line change
@@ -206,23 +206,26 @@ void run_pack_or_unpack(Chunk *chunk, Settings &settings, int depth, int face, b
206206
STOP_PROFILING(settings.kernel_profile, __func__);
207207
}
208208

209+
#if !(defined(__HIPSYCL__) || defined(__OPENSYCL__))
210+
209211
template <typename A> decltype(auto) get_native_ptr_or_throw(sycl::interop_handle &ih, A accessor) {
210212
using sycl::backend;
211213
using T = std::remove_cv_t<typename decltype(accessor)::value_type>;
212214
switch (ih.get_backend()) {
213215
case backend::ext_oneapi_level_zero: return reinterpret_cast<T *>(ih.get_native_mem<backend::ext_oneapi_level_zero>(accessor));
214-
#ifdef SYCL_EXT_ONEAPI_BACKEND_cuda
216+
#ifdef SYCL_EXT_ONEAPI_BACKEND_CUDA
215217
case backend::ext_oneapi_cuda: return reinterpret_cast<T *>(ih.get_native_mem<backend::ext_oneapi_cuda>(accessor));
216-
#endif
217-
#ifdef SYCL_EXT_ONEAPI_BACKEND_HIP
218+
#endif
219+
#ifdef SYCL_EXT_ONEAPI_BACKEND_HIP
218220
case backend::ext_oneapi_hip: return reinterpret_cast<T *>(ih.get_native_mem<backend::ext_oneapi_hip>(accessor));
219-
#endif
221+
#endif
220222
default:
221223
std::stringstream ss;
222224
ss << "backend " << ih.get_backend() << " does not support a pointer-based sycl::interop_handle::get_native_mem";
223225
throw std::logic_error(ss.str());
224226
}
225227
}
228+
#endif
226229

227230
void run_send_recv_halo(Chunk *chunk, Settings &settings, //
228231
FieldBufferType src_send_buffer, FieldBufferType src_recv_buffer, //
@@ -247,7 +250,7 @@ void run_send_recv_halo(Chunk *chunk, Settings &settings,
247250
chunk->ext->device_queue->submit([&](sycl::handler &h) {
248251
auto snd_buffer_acc = src_send_buffer->get_access<access_mode::read>(h);
249252
auto rcv_buffer_acc = src_recv_buffer->get_access<access_mode::write>(h);
250-
h.host_task([=, &settings](sycl::interop_handle ih) { // XXX pass handle arg here as copy, not ref!
253+
h.host_task([=, &settings](sycl::interop_handle ih) { // XXX pass handle arg here as copy, not ref!
251254
send_recv_message(settings, //
252255
get_native_ptr_or_throw(ih, snd_buffer_acc), //
253256
get_native_ptr_or_throw(ih, rcv_buffer_acc), //
@@ -256,13 +259,55 @@ void run_send_recv_halo(Chunk *chunk, Settings &settings,
256259
});
257260
}
258261
#else
259-
chunk->ext->device_queue->wait_and_throw();
260-
send_recv_message(settings, //
261-
host_accessor<double, 1, access_mode::read>{*src_send_buffer, buffer_len}.get_pointer(),
262-
host_accessor<double, 1, access_mode::read>{*src_recv_buffer, buffer_len}.get_pointer(), buffer_len, neighbour,
263-
send_tag, recv_tag, send_request, recv_request);
262+
if (settings.staging_buffer) {
263+
chunk->ext->device_queue->wait_and_throw();
264+
send_recv_message(settings, //
265+
host_accessor<double, 1, access_mode::read_write>{*src_send_buffer, buffer_len}.get_pointer(),
266+
host_accessor<double, 1, access_mode::read_write>{*src_recv_buffer, buffer_len}.get_pointer(), buffer_len, neighbour,
267+
send_tag, recv_tag, send_request, recv_request);
268+
} else {
269+
#if defined(__HIPSYCL__) || defined(__OPENSYCL__)
270+
// chunk->ext->device_queue->wait_and_throw();
271+
auto d = chunk->ext->device_queue->get_device();
272+
// Construct the buffers so that get_pointer is not nullptr, only happens once per rank for the lifetime of the program
273+
if (!src_recv_buffer->get_pointer(d))
274+
chunk->ext->device_queue->submit([&](sycl::handler &h) { h.update(sycl::accessor{*src_recv_buffer, h}); }).wait_and_throw();
275+
if (!src_send_buffer->get_pointer(d))
276+
chunk->ext->device_queue->submit([&](sycl::handler &h) { h.update(sycl::accessor{*src_send_buffer, h}); }).wait_and_throw();
277+
// We can't use host_task here, but since we can pull out the pointers directly, if we synchronise before MPI_Waitall
278+
// the desired concurrency should still be there
279+
chunk->ext->device_queue->submit([&](sycl::handler &h) {
280+
h.update(sycl::accessor{*src_send_buffer, h, sycl::read_only});
281+
})
282+
.wait_and_throw();
283+
chunk->ext->device_queue->submit([&](sycl::handler &h) {
284+
h.update(sycl::accessor{*src_recv_buffer, h, sycl::write_only});
285+
})
286+
.wait_and_throw();
287+
send_recv_message(settings, //
288+
src_send_buffer->get_pointer(d), //
289+
src_recv_buffer->get_pointer(d), //
290+
buffer_len, neighbour, send_tag, recv_tag, send_request, recv_request);
291+
#else
292+
throw std::logic_error("host_task is disabled and staging is also disabled, this won't work");
293+
#endif
294+
}
264295
#endif
265296
}
266297

267-
void run_before_waitall_halo(Chunk *chunk, Settings &) { chunk->ext->device_queue->wait_and_throw(); }
298+
void run_before_waitall_halo(Chunk *chunk, Settings &settings) {
299+
#ifdef USE_HOSTTASK
300+
chunk->ext->device_queue->wait_and_throw();
301+
#else
302+
if (settings.staging_buffer) {
303+
// drop-through to waitall directly
304+
} else {
305+
#if defined(__HIPSYCL__) || defined(__OPENSYCL__)
306+
chunk->ext->device_queue->wait_and_throw();
307+
#else
308+
throw std::logic_error("host_task is disabled and staging is also disabled, this won't work");
309+
#endif
310+
}
311+
#endif
312+
}
268313
void run_restore_recv_halo(Chunk *, Settings &, FieldBufferType, StagingBufferType, int) {}

src/sycl-acc/solver_methods.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ void field_summary_func(const int x, //
3232
auto density = densityBuff.get_access<access::mode::read>(h);
3333
auto energy0 = energy0Buff.get_access<access::mode::read>(h);
3434
auto volume = volumeBuff.get_access<access::mode::read>(h);
35-
h.parallel_for<class field_summary_func>(
36-
range<1>(x * y), //
37-
sycl::reduction(summary_temp, h, {}, sycl::plus<>(), sycl::property::reduction::initialize_to_identity()), //
35+
h.parallel_for<class field_summary_func>( //
36+
range<1>(x * y), //
37+
reduction_shim(summary_temp, h, {}, sycl::plus<Summary>()), //
3838
[=](item<1> item, auto &acc) {
3939
const auto kk = item[0] % x;
4040
const auto jj = item[0] / x;
@@ -141,8 +141,8 @@ void calculate_2norm(const int x, //
141141
buffer<double, 1> norm_temp{range<1>{1}};
142142
device_queue.submit([&](handler &h) {
143143
auto buffer = bufferBuff.get_access<access::mode::read>(h);
144-
h.parallel_for<class calculate_2norm>(
145-
range<1>(x * y), sycl::reduction(norm_temp, h, {}, sycl::plus<>(), sycl::property::reduction::initialize_to_identity()), //
144+
h.parallel_for<class calculate_2norm>( //
145+
range<1>(x * y), reduction_shim(norm_temp, h, {}, sycl::plus<double>()), //
146146
[=](item<1> item, auto &acc) {
147147
const auto kk = item[0] % x;
148148
const auto jj = item[0] / x;

src/sycl-acc/sycl_shared.hpp

+8
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,11 @@
55
using namespace cl::sycl;
66

77
using SyclBuffer = buffer<double, 1>;
8+
9+
template <typename T, int N, typename BinaryOp> inline auto reduction_shim(buffer<T, N> &b, sycl::handler &h, T init, BinaryOp f) {
10+
#if defined(__HIPSYCL__) || defined(__OPENSYCL__)
11+
return sycl::reduction(b. template get_access<access_mode::read_write>(h), init, f);
12+
#else
13+
return sycl::reduction(b, h, init, f, sycl::property::reduction::initialize_to_identity());
14+
#endif
15+
}

0 commit comments

Comments
 (0)