@@ -206,23 +206,26 @@ void run_pack_or_unpack(Chunk *chunk, Settings &settings, int depth, int face, b
206
206
STOP_PROFILING (settings.kernel_profile , __func__);
207
207
}
208
208
209
+ #if !(defined(__HIPSYCL__) || defined(__OPENSYCL__))
210
+
209
211
// Returns the native memory that `ih` exposes for `accessor`, reinterpreted as a pointer to the
// accessor's value type with const/volatile removed (the alias T below).
// The Level Zero case is always compiled in; the CUDA and HIP cases exist only when the matching
// SYCL_EXT_ONEAPI_BACKEND_* macro is defined. Every other backend ends in std::logic_error.
// NOTE(review): this span is a rendered diff - lines starting with `-`/`+` are the removed/added
// versions of a line and the bare numbers are old/new line numbers, not code.
template <typename A> decltype (auto ) get_native_ptr_or_throw(sycl::interop_handle &ih, A accessor) {
210
212
using sycl::backend;
211
213
using T = std::remove_cv_t <typename decltype (accessor)::value_type>;
212
214
switch (ih.get_backend ()) {
213
215
case backend::ext_oneapi_level_zero: return reinterpret_cast <T *>(ih.get_native_mem <backend::ext_oneapi_level_zero>(accessor));
214
// NOTE(review): the removed guard spelled the macro with a lowercase "cuda"; the added guard uses "CUDA".
- #ifdef SYCL_EXT_ONEAPI_BACKEND_cuda
216
+ #ifdef SYCL_EXT_ONEAPI_BACKEND_CUDA
215
217
case backend::ext_oneapi_cuda: return reinterpret_cast <T *>(ih.get_native_mem <backend::ext_oneapi_cuda>(accessor));
216
- #endif
217
- #ifdef SYCL_EXT_ONEAPI_BACKEND_HIP
218
+ #endif
219
+ #ifdef SYCL_EXT_ONEAPI_BACKEND_HIP
218
220
case backend::ext_oneapi_hip: return reinterpret_cast <T *>(ih.get_native_mem <backend::ext_oneapi_hip>(accessor));
219
- #endif
221
+ #endif
220
222
// Any backend without a case above: name it in the message and fail.
default :
221
223
std::stringstream ss;
222
224
ss << " backend " << ih.get_backend () << " does not support a pointer-based sycl::interop_handle::get_native_mem" ;
223
225
throw std::logic_error (ss.str ());
224
226
}
225
227
}
228
+ #endif
226
229
227
230
void run_send_recv_halo (Chunk *chunk, Settings &settings, //
228
231
FieldBufferType src_send_buffer, FieldBufferType src_recv_buffer, //
@@ -247,7 +250,7 @@ void run_send_recv_halo(Chunk *chunk, Settings &settings,
247
250
chunk->ext ->device_queue ->submit ([&](sycl::handler &h) {
248
251
auto snd_buffer_acc = src_send_buffer->get_access <access_mode::read >(h);
249
252
auto rcv_buffer_acc = src_recv_buffer->get_access <access_mode::write >(h);
250
- h.host_task ([=, &settings](sycl::interop_handle ih) { // XXX pass handle arg here as copy, not ref!
253
+ h.host_task ([=, &settings](sycl::interop_handle ih) { // XXX pass handle arg here as copy, not ref!
251
254
send_recv_message (settings, //
252
255
get_native_ptr_or_throw (ih, snd_buffer_acc), //
253
256
get_native_ptr_or_throw (ih, rcv_buffer_acc), //
@@ -256,13 +259,55 @@ void run_send_recv_halo(Chunk *chunk, Settings &settings,
256
259
});
257
260
}
258
261
#else
259
- chunk->ext ->device_queue ->wait_and_throw ();
260
- send_recv_message (settings, //
261
- host_accessor<double , 1 , access_mode::read >{*src_send_buffer, buffer_len}.get_pointer (),
262
- host_accessor<double , 1 , access_mode::read >{*src_recv_buffer, buffer_len}.get_pointer (), buffer_len, neighbour,
263
- send_tag, recv_tag, send_request, recv_request);
262
+ if (settings.staging_buffer ) {
263
+ chunk->ext ->device_queue ->wait_and_throw ();
264
+ send_recv_message (settings, //
265
+ host_accessor<double , 1 , access_mode::read_write>{*src_send_buffer, buffer_len}.get_pointer (),
266
+ host_accessor<double , 1 , access_mode::read_write>{*src_recv_buffer, buffer_len}.get_pointer (), buffer_len, neighbour,
267
+ send_tag, recv_tag, send_request, recv_request);
268
+ } else {
269
+ #if defined(__HIPSYCL__) || defined(__OPENSYCL__)
270
+ // chunk->ext->device_queue->wait_and_throw();
271
+ auto d = chunk->ext ->device_queue ->get_device ();
272
+ // Construct the buffers so that get_pointer is not nullptr, only happens once per rank for the lifetime of the program
273
+ if (!src_recv_buffer->get_pointer (d))
274
+ chunk->ext ->device_queue ->submit ([&](sycl::handler &h) { h.update (sycl::accessor{*src_recv_buffer, h}); }).wait_and_throw ();
275
+ if (!src_send_buffer->get_pointer (d))
276
+ chunk->ext ->device_queue ->submit ([&](sycl::handler &h) { h.update (sycl::accessor{*src_send_buffer, h}); }).wait_and_throw ();
277
+ // We can't use host_task here, but since we can pull out the pointers directly, if we synchronise before MPI_Waitall
278
+ // the desired concurrency should still be there
279
+ chunk->ext ->device_queue ->submit ([&](sycl::handler &h) {
280
+ h.update (sycl::accessor{*src_send_buffer, h, sycl::read_only});
281
+ })
282
+ .wait_and_throw ();
283
+ chunk->ext ->device_queue ->submit ([&](sycl::handler &h) {
284
+ h.update (sycl::accessor{*src_recv_buffer, h, sycl::write_only});
285
+ })
286
+ .wait_and_throw ();
287
+ send_recv_message (settings, //
288
+ src_send_buffer->get_pointer (d), //
289
+ src_recv_buffer->get_pointer (d), //
290
+ buffer_len, neighbour, send_tag, recv_tag, send_request, recv_request);
291
+ #else
292
+ throw std::logic_error (" host_task is disabled and staging is also disabled, this won't work" );
293
+ #endif
294
+ }
264
295
#endif
265
296
}
266
297
267
- void run_before_waitall_halo (Chunk *chunk, Settings &) { chunk->ext ->device_queue ->wait_and_throw (); }
298
// Synchronisation step for the halo exchange (by its name, intended to run before the wait-all).
// Behaviour depends on build flags and on settings.staging_buffer:
//   - USE_HOSTTASK defined:                 wait_and_throw() on the chunk's device queue.
//   - no USE_HOSTTASK, staging_buffer set:  nothing is done here.
//   - no USE_HOSTTASK, no staging, hipSYCL/OpenSYCL build: wait_and_throw() on the device queue.
//   - no USE_HOSTTASK, no staging, any other build:        throws std::logic_error.
// NOTE(review): the staging path of run_send_recv_halo already calls wait_and_throw() before it
// posts the messages, which is presumably why no wait is issued in the staging case - confirm.
+ void run_before_waitall_halo (Chunk *chunk, Settings &settings) {
299
+ #ifdef USE_HOSTTASK
300
+ chunk->ext ->device_queue ->wait_and_throw ();
301
+ #else
302
+ if (settings.staging_buffer ) {
303
+ // drop-through to waitall directly
304
+ } else {
305
// Direct-pointer path: only the hipSYCL/OpenSYCL build has an implementation for it.
+ #if defined(__HIPSYCL__) || defined(__OPENSYCL__)
306
+ chunk->ext ->device_queue ->wait_and_throw ();
307
+ #else
308
+ throw std::logic_error (" host_task is disabled and staging is also disabled, this won't work" );
309
+ #endif
310
+ }
311
+ #endif
312
+ }
268
313
// Intentionally a no-op: all five arguments are ignored and the body performs no work.
void run_restore_recv_halo(Chunk *, Settings &, FieldBufferType, StagingBufferType, int) {
  // nothing to do
}
0 commit comments