From 62b4ca9c37a93350a554d04057e5c577dd89a921 Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Sun, 1 Jun 2025 03:11:41 +0900 Subject: [PATCH 01/26] initial BDA work --- lib/cl/device.jl | 5 + lib/cl/libopencl.jl | 684 +++++++--------------------------------- lib/cl/memory/bda.jl | 146 +++++++++ lib/cl/memory/memory.jl | 3 +- 4 files changed, 262 insertions(+), 576 deletions(-) create mode 100644 lib/cl/memory/bda.jl diff --git a/lib/cl/device.jl b/lib/cl/device.jl index e7afc1c6..0651b32c 100644 --- a/lib/cl/device.jl +++ b/lib/cl/device.jl @@ -190,6 +190,11 @@ function exec_capabilities(d::Device) ) end +function bda_supported(d::Device) + "cl_ext_buffer_device_address" in d.extensions || return false + return true +end + function usm_supported(d::Device) "cl_intel_unified_shared_memory" in d.extensions || return false return true diff --git a/lib/cl/libopencl.jl b/lib/cl/libopencl.jl index 7814f8fa..f34b1bb6 100644 --- a/lib/cl/libopencl.jl +++ b/lib/cl/libopencl.jl @@ -8,8 +8,8 @@ end function check(f) res = retry_reclaim(err -> err == CL_OUT_OF_RESOURCES || - err == CL_MEM_OBJECT_ALLOCATION_FAILURE || - err == CL_OUT_OF_HOST_MEMORY) do + err == CL_MEM_OBJECT_ALLOCATION_FAILURE || + err == CL_OUT_OF_HOST_MEMORY) do return f() end @@ -21,7 +21,7 @@ function check(f) end macro CL_MAKE_VERSION(major, minor, patch) - quote + return quote VersionNumber($major, $minor, $patch) end end @@ -1263,424 +1263,6 @@ end const cl_device_partition_property_ext = cl_ulong -const cl_device_command_buffer_capabilities_khr = cl_bitfield - -mutable struct _cl_command_buffer_khr end - -const cl_command_buffer_khr = Ptr{_cl_command_buffer_khr} - -const cl_sync_point_khr = cl_uint - -const cl_command_buffer_info_khr = cl_uint - -const cl_command_buffer_state_khr = cl_uint - -const cl_command_buffer_properties_khr = cl_properties - -const cl_command_buffer_flags_khr = cl_bitfield - -const cl_command_properties_khr = cl_properties - -mutable struct _cl_mutable_command_khr end - -const cl_mutable_command_khr = Ptr{_cl_mutable_command_khr} - -# typedef cl_command_buffer_khr CL_API_CALL clCreateCommandBufferKHR_t ( cl_uint num_queues , const cl_command_queue * queues , const cl_command_buffer_properties_khr * properties , cl_int * errcode_ret ) -const clCreateCommandBufferKHR_t = Cvoid - -const clCreateCommandBufferKHR_fn = Ptr{clCreateCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clFinalizeCommandBufferKHR_t ( cl_command_buffer_khr command_buffer ) -const clFinalizeCommandBufferKHR_t = Cvoid - -const clFinalizeCommandBufferKHR_fn = Ptr{clFinalizeCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clRetainCommandBufferKHR_t ( cl_command_buffer_khr command_buffer ) -const clRetainCommandBufferKHR_t = Cvoid - -const clRetainCommandBufferKHR_fn = Ptr{clRetainCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clReleaseCommandBufferKHR_t ( cl_command_buffer_khr command_buffer ) -const clReleaseCommandBufferKHR_t = Cvoid - -const clReleaseCommandBufferKHR_fn = Ptr{clReleaseCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clEnqueueCommandBufferKHR_t ( cl_uint num_queues , cl_command_queue * queues , cl_command_buffer_khr command_buffer , cl_uint num_events_in_wait_list , const cl_event * event_wait_list , cl_event * event ) -const clEnqueueCommandBufferKHR_t = Cvoid - -const clEnqueueCommandBufferKHR_fn = Ptr{clEnqueueCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clCommandBarrierWithWaitListKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandBarrierWithWaitListKHR_t = Cvoid - -const clCommandBarrierWithWaitListKHR_fn = Ptr{clCommandBarrierWithWaitListKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyBufferKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_buffer , cl_mem dst_buffer , size_t src_offset , size_t dst_offset , size_t size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandCopyBufferKHR_t = Cvoid - -const clCommandCopyBufferKHR_fn = Ptr{clCommandCopyBufferKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyBufferRectKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_buffer , cl_mem dst_buffer , const size_t * src_origin , const size_t * dst_origin , const size_t * region , size_t src_row_pitch , size_t src_slice_pitch , size_t dst_row_pitch , size_t dst_slice_pitch , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandCopyBufferRectKHR_t = Cvoid - -const clCommandCopyBufferRectKHR_fn = Ptr{clCommandCopyBufferRectKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyBufferToImageKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_buffer , cl_mem dst_image , size_t src_offset , const size_t * dst_origin , const size_t * region , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandCopyBufferToImageKHR_t = Cvoid - -const clCommandCopyBufferToImageKHR_fn = Ptr{clCommandCopyBufferToImageKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyImageKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_image , cl_mem dst_image , const size_t * src_origin , const size_t * dst_origin , const size_t * region , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandCopyImageKHR_t = Cvoid - -const clCommandCopyImageKHR_fn = Ptr{clCommandCopyImageKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyImageToBufferKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_image , cl_mem dst_buffer , const size_t * src_origin , const size_t * region , size_t dst_offset , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandCopyImageToBufferKHR_t = Cvoid - -const clCommandCopyImageToBufferKHR_fn = Ptr{clCommandCopyImageToBufferKHR_t} - -# typedef cl_int CL_API_CALL clCommandFillBufferKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem buffer , const void * pattern , size_t pattern_size , size_t offset , size_t size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandFillBufferKHR_t = Cvoid - -const clCommandFillBufferKHR_fn = Ptr{clCommandFillBufferKHR_t} - -# typedef cl_int CL_API_CALL clCommandFillImageKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem image , const void * fill_color , const size_t * origin , const size_t * region , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandFillImageKHR_t = Cvoid - -const clCommandFillImageKHR_fn = Ptr{clCommandFillImageKHR_t} - -# typedef cl_int CL_API_CALL clCommandNDRangeKernelKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_kernel kernel , cl_uint work_dim , const size_t * global_work_offset , const size_t * global_work_size , const size_t * local_work_size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandNDRangeKernelKHR_t = Cvoid - -const clCommandNDRangeKernelKHR_fn = Ptr{clCommandNDRangeKernelKHR_t} - -# typedef cl_int CL_API_CALL clGetCommandBufferInfoKHR_t ( cl_command_buffer_khr command_buffer , cl_command_buffer_info_khr param_name , size_t param_value_size , void * param_value , size_t * param_value_size_ret ) -const clGetCommandBufferInfoKHR_t = Cvoid - -const clGetCommandBufferInfoKHR_fn = Ptr{clGetCommandBufferInfoKHR_t} - -function clCreateCommandBufferKHR(num_queues, queues, properties, errcode_ret) - @ext_ccall libopencl.clCreateCommandBufferKHR(num_queues::cl_uint, - queues::Ptr{cl_command_queue}, - properties::Ptr{cl_command_buffer_properties_khr}, - errcode_ret::Ptr{cl_int})::cl_command_buffer_khr -end - -@checked function clFinalizeCommandBufferKHR(command_buffer) - @ext_ccall libopencl.clFinalizeCommandBufferKHR(command_buffer::cl_command_buffer_khr)::cl_int -end - -@checked function clRetainCommandBufferKHR(command_buffer) - @ext_ccall libopencl.clRetainCommandBufferKHR(command_buffer::cl_command_buffer_khr)::cl_int -end - -@checked function clReleaseCommandBufferKHR(command_buffer) - @ext_ccall libopencl.clReleaseCommandBufferKHR(command_buffer::cl_command_buffer_khr)::cl_int -end - -@checked function clEnqueueCommandBufferKHR(num_queues, queues, command_buffer, - num_events_in_wait_list, event_wait_list, event) - @ext_ccall libopencl.clEnqueueCommandBufferKHR(num_queues::cl_uint, - queues::Ptr{cl_command_queue}, - command_buffer::cl_command_buffer_khr, - num_events_in_wait_list::cl_uint, - event_wait_list::Ptr{cl_event}, - event::Ptr{cl_event})::cl_int -end - -@checked function clCommandBarrierWithWaitListKHR(command_buffer, command_queue, properties, - num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandBarrierWithWaitListKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyBufferKHR(command_buffer, command_queue, properties, - src_buffer, dst_buffer, src_offset, dst_offset, - size, num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, mutable_handle) - @ext_ccall libopencl.clCommandCopyBufferKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_buffer::cl_mem, dst_buffer::cl_mem, - src_offset::Csize_t, dst_offset::Csize_t, - size::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyBufferRectKHR(command_buffer, command_queue, properties, - src_buffer, dst_buffer, src_origin, dst_origin, - region, src_row_pitch, src_slice_pitch, - dst_row_pitch, dst_slice_pitch, - num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandCopyBufferRectKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_buffer::cl_mem, dst_buffer::cl_mem, - src_origin::Ptr{Csize_t}, - dst_origin::Ptr{Csize_t}, - region::Ptr{Csize_t}, - src_row_pitch::Csize_t, - src_slice_pitch::Csize_t, - dst_row_pitch::Csize_t, - dst_slice_pitch::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyBufferToImageKHR(command_buffer, command_queue, properties, - src_buffer, dst_image, src_offset, - dst_origin, region, - num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandCopyBufferToImageKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_buffer::cl_mem, - dst_image::cl_mem, - src_offset::Csize_t, - dst_origin::Ptr{Csize_t}, - region::Ptr{Csize_t}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyImageKHR(command_buffer, command_queue, properties, - src_image, dst_image, src_origin, dst_origin, - region, num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, mutable_handle) - @ext_ccall libopencl.clCommandCopyImageKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_image::cl_mem, dst_image::cl_mem, - src_origin::Ptr{Csize_t}, - dst_origin::Ptr{Csize_t}, - region::Ptr{Csize_t}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyImageToBufferKHR(command_buffer, command_queue, properties, - src_image, dst_buffer, src_origin, region, - dst_offset, num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandCopyImageToBufferKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_image::cl_mem, - dst_buffer::cl_mem, - src_origin::Ptr{Csize_t}, - region::Ptr{Csize_t}, - dst_offset::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandFillBufferKHR(command_buffer, command_queue, properties, buffer, - pattern, pattern_size, offset, size, - num_sync_points_in_wait_list, sync_point_wait_list, - sync_point, mutable_handle) - @ext_ccall libopencl.clCommandFillBufferKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - buffer::cl_mem, pattern::Ptr{Cvoid}, - pattern_size::Csize_t, offset::Csize_t, - size::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandFillImageKHR(command_buffer, command_queue, properties, image, - fill_color, origin, region, - num_sync_points_in_wait_list, sync_point_wait_list, - sync_point, mutable_handle) - @ext_ccall libopencl.clCommandFillImageKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - image::cl_mem, fill_color::Ptr{Cvoid}, - origin::Ptr{Csize_t}, region::Ptr{Csize_t}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandNDRangeKernelKHR(command_buffer, command_queue, properties, - kernel, work_dim, global_work_offset, - global_work_size, local_work_size, - num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandNDRangeKernelKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - kernel::cl_kernel, work_dim::cl_uint, - global_work_offset::Ptr{Csize_t}, - global_work_size::Ptr{Csize_t}, - local_work_size::Ptr{Csize_t}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clGetCommandBufferInfoKHR(command_buffer, param_name, param_value_size, - param_value, param_value_size_ret) - @ext_ccall libopencl.clGetCommandBufferInfoKHR(command_buffer::cl_command_buffer_khr, - param_name::cl_command_buffer_info_khr, - param_value_size::Csize_t, - param_value::Ptr{Cvoid}, - param_value_size_ret::Ptr{Csize_t})::cl_int -end - -# typedef cl_int CL_API_CALL clCommandSVMMemcpyKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , void * dst_ptr , const void * src_ptr , size_t size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandSVMMemcpyKHR_t = Cvoid - -const clCommandSVMMemcpyKHR_fn = Ptr{clCommandSVMMemcpyKHR_t} - -# typedef cl_int CL_API_CALL clCommandSVMMemFillKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , void * svm_ptr , const void * pattern , size_t pattern_size , size_t size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandSVMMemFillKHR_t = Cvoid - -const clCommandSVMMemFillKHR_fn = Ptr{clCommandSVMMemFillKHR_t} - -@checked function clCommandSVMMemcpyKHR(command_buffer, command_queue, properties, dst_ptr, - src_ptr, size, num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, mutable_handle) - @ext_ccall libopencl.clCommandSVMMemcpyKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - dst_ptr::Ptr{Cvoid}, src_ptr::Ptr{Cvoid}, - size::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandSVMMemFillKHR(command_buffer, command_queue, properties, svm_ptr, - pattern, pattern_size, size, - num_sync_points_in_wait_list, sync_point_wait_list, - sync_point, mutable_handle) - @ext_ccall libopencl.clCommandSVMMemFillKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - svm_ptr::Ptr{Cvoid}, pattern::Ptr{Cvoid}, - pattern_size::Csize_t, size::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -const cl_platform_command_buffer_capabilities_khr = cl_bitfield - -# typedef cl_command_buffer_khr CL_API_CALL clRemapCommandBufferKHR_t ( cl_command_buffer_khr command_buffer , cl_bool automatic , cl_uint num_queues , const cl_command_queue * queues , cl_uint num_handles , const cl_mutable_command_khr * handles , cl_mutable_command_khr * handles_ret , cl_int * errcode_ret ) -const clRemapCommandBufferKHR_t = Cvoid - -const clRemapCommandBufferKHR_fn = Ptr{clRemapCommandBufferKHR_t} - -function clRemapCommandBufferKHR(command_buffer, automatic, num_queues, queues, num_handles, - handles, handles_ret, errcode_ret) - @ext_ccall libopencl.clRemapCommandBufferKHR(command_buffer::cl_command_buffer_khr, - automatic::cl_bool, num_queues::cl_uint, - queues::Ptr{cl_command_queue}, - num_handles::cl_uint, - handles::Ptr{cl_mutable_command_khr}, - handles_ret::Ptr{cl_mutable_command_khr}, - errcode_ret::Ptr{cl_int})::cl_command_buffer_khr -end - -const cl_command_buffer_update_type_khr = cl_uint - -const cl_mutable_dispatch_fields_khr = cl_bitfield - -const cl_mutable_command_info_khr = cl_uint - -struct _cl_mutable_dispatch_arg_khr - arg_index::cl_uint - arg_size::Csize_t - arg_value::Ptr{Cvoid} -end - -const cl_mutable_dispatch_arg_khr = _cl_mutable_dispatch_arg_khr - -struct _cl_mutable_dispatch_exec_info_khr - param_name::cl_uint - param_value_size::Csize_t - param_value::Ptr{Cvoid} -end - -const cl_mutable_dispatch_exec_info_khr = _cl_mutable_dispatch_exec_info_khr - -struct _cl_mutable_dispatch_config_khr - command::cl_mutable_command_khr - num_args::cl_uint - num_svm_args::cl_uint - num_exec_infos::cl_uint - work_dim::cl_uint - arg_list::Ptr{cl_mutable_dispatch_arg_khr} - arg_svm_list::Ptr{cl_mutable_dispatch_arg_khr} - exec_info_list::Ptr{cl_mutable_dispatch_exec_info_khr} - global_work_offset::Ptr{Csize_t} - global_work_size::Ptr{Csize_t} - local_work_size::Ptr{Csize_t} -end - -const cl_mutable_dispatch_config_khr = _cl_mutable_dispatch_config_khr - -const cl_mutable_dispatch_asserts_khr = cl_bitfield - -# typedef cl_int CL_API_CALL clUpdateMutableCommandsKHR_t ( cl_command_buffer_khr command_buffer , cl_uint num_configs , const cl_command_buffer_update_type_khr * config_types , const void * * configs ) -const clUpdateMutableCommandsKHR_t = Cvoid - -const clUpdateMutableCommandsKHR_fn = Ptr{clUpdateMutableCommandsKHR_t} - -# typedef cl_int CL_API_CALL clGetMutableCommandInfoKHR_t ( cl_mutable_command_khr command , cl_mutable_command_info_khr param_name , size_t param_value_size , void * param_value , size_t * param_value_size_ret ) -const clGetMutableCommandInfoKHR_t = Cvoid - -const clGetMutableCommandInfoKHR_fn = Ptr{clGetMutableCommandInfoKHR_t} - -@checked function clUpdateMutableCommandsKHR(command_buffer, num_configs, config_types, - configs) - @ext_ccall libopencl.clUpdateMutableCommandsKHR(command_buffer::cl_command_buffer_khr, - num_configs::cl_uint, - config_types::Ptr{cl_command_buffer_update_type_khr}, - configs::Ptr{Ptr{Cvoid}})::cl_int -end - -@checked function clGetMutableCommandInfoKHR(command, param_name, param_value_size, - param_value, param_value_size_ret) - @ext_ccall libopencl.clGetMutableCommandInfoKHR(command::cl_mutable_command_khr, - param_name::cl_mutable_command_info_khr, - param_value_size::Csize_t, - param_value::Ptr{Cvoid}, - param_value_size_ret::Ptr{Csize_t})::cl_int -end - # typedef cl_int CL_API_CALL clSetMemObjectDestructorAPPLE_t ( cl_mem memobj , void ( CL_CALLBACK * pfn_notify ) ( cl_mem memobj , void * user_data ) , void * user_data ) const clSetMemObjectDestructorAPPLE_t = Cvoid @@ -1731,12 +1313,32 @@ const clIcdGetPlatformIDsKHR_t = Cvoid const clIcdGetPlatformIDsKHR_fn = Ptr{clIcdGetPlatformIDsKHR_t} +# typedef void * CL_API_CALL clIcdGetFunctionAddressForPlatformKHR_t ( cl_platform_id platform , const char * func_name ) +const clIcdGetFunctionAddressForPlatformKHR_t = Cvoid + +const clIcdGetFunctionAddressForPlatformKHR_fn = Ptr{clIcdGetFunctionAddressForPlatformKHR_t} + +# typedef cl_int CL_API_CALL clIcdSetPlatformDispatchDataKHR_t ( cl_platform_id platform , void * dispatch_data ) +const clIcdSetPlatformDispatchDataKHR_t = Cvoid + +const clIcdSetPlatformDispatchDataKHR_fn = Ptr{clIcdSetPlatformDispatchDataKHR_t} + @checked function clIcdGetPlatformIDsKHR(num_entries, platforms, num_platforms) @ext_ccall libopencl.clIcdGetPlatformIDsKHR(num_entries::cl_uint, platforms::Ptr{cl_platform_id}, num_platforms::Ptr{cl_uint})::cl_int end +function clIcdGetFunctionAddressForPlatformKHR(platform, func_name) + @ext_ccall libopencl.clIcdGetFunctionAddressForPlatformKHR(platform::cl_platform_id, + func_name::Ptr{Cchar})::Ptr{Cvoid} +end + +@checked function clIcdSetPlatformDispatchDataKHR(platform, dispatch_data) + @ext_ccall libopencl.clIcdSetPlatformDispatchDataKHR(platform::cl_platform_id, + dispatch_data::Ptr{Cvoid})::cl_int +end + # typedef cl_program CL_API_CALL clCreateProgramWithILKHR_t ( cl_context context , const void * il , size_t length , cl_int * errcode_ret ) const clCreateProgramWithILKHR_t = Cvoid @@ -2452,7 +2054,7 @@ end @checked function clGetMemAllocInfoINTEL(context, ptr, param_name, param_value_size, param_value, param_value_size_ret) - @ext_ccall libopencl.clGetMemAllocInfoINTEL(context::cl_context, ptr::CLPtr{Cvoid}, + @ext_ccall libopencl.clGetMemAllocInfoINTEL(context::cl_context, ptr::PtrOrCLPtr{Cvoid}, param_name::cl_mem_info_intel, param_value_size::Csize_t, param_value::Ptr{Cvoid}, @@ -2462,14 +2064,15 @@ end @checked function clSetKernelArgMemPointerINTEL(kernel, arg_index, arg_value) @ext_ccall libopencl.clSetKernelArgMemPointerINTEL(kernel::cl_kernel, arg_index::cl_uint, - arg_value::CLPtr{Cvoid})::cl_int + arg_value::PtrOrCLPtr{Cvoid})::cl_int end @checked function clEnqueueMemFillINTEL(command_queue, dst_ptr, pattern, pattern_size, size, num_events_in_wait_list, event_wait_list, event) @ext_ccall libopencl.clEnqueueMemFillINTEL(command_queue::cl_command_queue, - dst_ptr::CLPtr{Cvoid}, pattern::Ptr{Cvoid}, - pattern_size::Csize_t, size::Csize_t, + dst_ptr::PtrOrCLPtr{Cvoid}, + pattern::Ptr{Cvoid}, pattern_size::Csize_t, + size::Csize_t, num_events_in_wait_list::cl_uint, event_wait_list::Ptr{cl_event}, event::Ptr{cl_event})::cl_int @@ -2488,7 +2091,7 @@ end @checked function clEnqueueMemAdviseINTEL(command_queue, ptr, size, advice, num_events_in_wait_list, event_wait_list, event) @ext_ccall libopencl.clEnqueueMemAdviseINTEL(command_queue::cl_command_queue, - ptr::CLPtr{Cvoid}, size::Csize_t, + ptr::PtrOrCLPtr{Cvoid}, size::Csize_t, advice::cl_mem_advice_intel, num_events_in_wait_list::cl_uint, event_wait_list::Ptr{cl_event}, @@ -2503,7 +2106,7 @@ const clEnqueueMigrateMemINTEL_fn = Ptr{clEnqueueMigrateMemINTEL_t} @checked function clEnqueueMigrateMemINTEL(command_queue, ptr, size, flags, num_events_in_wait_list, event_wait_list, event) @ext_ccall libopencl.clEnqueueMigrateMemINTEL(command_queue::cl_command_queue, - ptr::CLPtr{Cvoid}, size::Csize_t, + ptr::PtrOrCLPtr{Cvoid}, size::Csize_t, flags::cl_mem_migration_flags, num_events_in_wait_list::cl_uint, event_wait_list::Ptr{cl_event}, @@ -2637,7 +2240,18 @@ const clSetContentSizeBufferPoCL_fn = Ptr{clSetContentSizeBufferPoCL_t} content_size_buffer::cl_mem)::cl_int end -const cl_device_kernel_clock_capabilities_khr = cl_bitfield +const cl_mem_device_address_ext = cl_ulong + +# typedef cl_int CL_API_CALL clSetKernelArgDevicePointerEXT_t ( cl_kernel kernel , cl_uint arg_index , cl_mem_device_address_ext arg_value ) +const clSetKernelArgDevicePointerEXT_t = Cvoid + +const clSetKernelArgDevicePointerEXT_fn = Ptr{clSetKernelArgDevicePointerEXT_t} + +@checked function clSetKernelArgDevicePointerEXT(kernel, arg_index, arg_value) + @ext_ccall libopencl.clSetKernelArgDevicePointerEXT(kernel::cl_kernel, + arg_index::cl_uint, + arg_value::cl_mem_device_address_ext)::cl_int +end # typedef cl_int CL_API_CALL clCancelCommandsIMG_t ( const cl_event * event_list , size_t num_events_in_list ) const clCancelCommandsIMG_t = Cvoid @@ -2649,6 +2263,18 @@ const clCancelCommandsIMG_fn = Ptr{clCancelCommandsIMG_t} num_events_in_list::Csize_t)::cl_int end +const cl_perf_hint_qcom = cl_uint + +# typedef cl_int CL_API_CALL clSetPerfHintQCOM_t ( cl_context context , cl_perf_hint_qcom perf_hint ) +const clSetPerfHintQCOM_t = Cvoid + +const clSetPerfHintQCOM_fn = Ptr{clSetPerfHintQCOM_t} + +@checked function clSetPerfHintQCOM(context, perf_hint) + @ext_ccall libopencl.clSetPerfHintQCOM(context::cl_context, + perf_hint::cl_perf_hint_qcom)::cl_int +end + const CL_NAME_VERSION_MAX_NAME_SIZE = 64 const CL_SUCCESS = 0 @@ -3647,126 +3273,6 @@ const CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_NAME = "cl_intel_sharing_format const CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 0, 0) -const cl_khr_command_buffer = 1 - -const CL_KHR_COMMAND_BUFFER_EXTENSION_NAME = "cl_khr_command_buffer" - -const CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 5) - -const CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR = 0x12a9 - -const CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR = 0x12aa - -const CL_COMMAND_BUFFER_CAPABILITY_KERNEL_PRINTF_KHR = 1 << 0 - -const CL_COMMAND_BUFFER_CAPABILITY_DEVICE_SIDE_ENQUEUE_KHR = 1 << 1 - -const CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR = 1 << 2 - -const CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR = 1 << 3 - -const CL_COMMAND_BUFFER_FLAGS_KHR = 0x1293 - -const CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR = 1 << 0 - -const CL_INVALID_COMMAND_BUFFER_KHR = -1138 - -const CL_INVALID_SYNC_POINT_WAIT_LIST_KHR = -1139 - -const CL_INCOMPATIBLE_COMMAND_QUEUE_KHR = -1140 - -const CL_COMMAND_BUFFER_QUEUES_KHR = 0x1294 - -const CL_COMMAND_BUFFER_NUM_QUEUES_KHR = 0x1295 - -const CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR = 0x1296 - -const CL_COMMAND_BUFFER_STATE_KHR = 0x1297 - -const CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR = 0x1298 - -const CL_COMMAND_BUFFER_CONTEXT_KHR = 0x1299 - -const CL_COMMAND_BUFFER_STATE_RECORDING_KHR = 0 - -const CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR = 1 - -const CL_COMMAND_BUFFER_STATE_PENDING_KHR = 2 - -const CL_COMMAND_COMMAND_BUFFER_KHR = 0x12a8 - -const cl_khr_command_buffer_multi_device = 1 - -const CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_NAME = "cl_khr_command_buffer_multi_device" - -const CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 1) - -const CL_PLATFORM_COMMAND_BUFFER_CAPABILITIES_KHR = 0x0908 - -const CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR = 1 << 0 - -const CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR = 1 << 1 - -const CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR = 1 << 2 - -const CL_DEVICE_COMMAND_BUFFER_NUM_SYNC_DEVICES_KHR = 0x12ab - -const CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR = 0x12ac - -const CL_COMMAND_BUFFER_CAPABILITY_MULTIPLE_QUEUE_KHR = 1 << 4 - -const CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR = 1 << 2 - -const cl_khr_command_buffer_mutable_dispatch = 1 - -const CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME = "cl_khr_command_buffer_mutable_dispatch" - -const CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 3) - -const CL_COMMAND_BUFFER_MUTABLE_KHR = 1 << 1 - -const CL_INVALID_MUTABLE_COMMAND_KHR = -1141 - -const CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR = 0x12b0 - -const CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR = 0x12b1 - -const CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR = 1 << 0 - -const CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR = 1 << 1 - -const CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR = 1 << 2 - -const CL_MUTABLE_DISPATCH_ARGUMENTS_KHR = 1 << 3 - -const CL_MUTABLE_DISPATCH_EXEC_INFO_KHR = 1 << 4 - -const CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR = 0x12a0 - -const CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR = 0x12a1 - -const CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR = 0x12ad - -const CL_MUTABLE_COMMAND_PROPERTIES_ARRAY_KHR = 0x12a2 - -const CL_MUTABLE_DISPATCH_KERNEL_KHR = 0x12a3 - -const CL_MUTABLE_DISPATCH_DIMENSIONS_KHR = 0x12a4 - -const CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR = 0x12a5 - -const CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR = 0x12a6 - -const CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR = 0x12a7 - -const CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR = 0 - -const CL_COMMAND_BUFFER_MUTABLE_DISPATCH_ASSERTS_KHR = 0x12b7 - -const CL_MUTABLE_DISPATCH_ASSERTS_KHR = 0x12b8 - -const CL_MUTABLE_DISPATCH_ASSERT_NO_ADDITIONAL_WORK_GROUPS_KHR = 1 << 0 - const cl_khr_fp64 = 1 const CL_KHR_FP64_EXTENSION_NAME = "cl_khr_fp64" @@ -3797,12 +3303,14 @@ const cl_khr_icd = 1 const CL_KHR_ICD_EXTENSION_NAME = "cl_khr_icd" -const CL_KHR_ICD_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) +const CL_KHR_ICD_EXTENSION_VERSION = @CL_MAKE_VERSION(2, 0, 0) const CL_PLATFORM_ICD_SUFFIX_KHR = 0x0920 const CL_PLATFORM_NOT_FOUND_KHR = -1001 +const CL_ICD2_TAG_KHR = intptr_t(0x4f50454e434c3331) + const cl_khr_il_program = 1 const CL_KHR_IL_PROGRAM_EXTENSION_NAME = "cl_khr_il_program" @@ -4351,18 +3859,6 @@ const CL_KHR_EXTERNAL_SEMAPHORE_SYNC_FD_EXTENSION_VERSION = @CL_MAKE_VERSION(1, const CL_SEMAPHORE_HANDLE_SYNC_FD_KHR = 0x2058 -const cl_khr_external_semaphore_win32 = 1 - -const CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME = "cl_khr_external_semaphore_win32" - -const CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 1) - -const CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR = 0x2056 - -const CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR = 0x2057 - -const CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_NAME_KHR = 0x2068 - const cl_khr_semaphore = 1 const CL_KHR_SEMAPHORE_EXTENSION_NAME = "cl_khr_semaphore" @@ -5350,20 +4846,6 @@ const CL_KHR_INT64_EXTENDED_ATOMICS_EXTENSION_NAME = "cl_khr_int64_extended_atom const CL_KHR_INT64_EXTENDED_ATOMICS_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) -const cl_khr_kernel_clock = 1 - -const CL_KHR_KERNEL_CLOCK_EXTENSION_NAME = "cl_khr_kernel_clock" - -const CL_KHR_KERNEL_CLOCK_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 0) - -const CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR = 0x1076 - -const CL_DEVICE_KERNEL_CLOCK_SCOPE_DEVICE_KHR = 1 << 0 - -const CL_DEVICE_KERNEL_CLOCK_SCOPE_WORK_GROUP_KHR = 1 << 1 - -const CL_DEVICE_KERNEL_CLOCK_SCOPE_SUB_GROUP_KHR = 1 << 2 - const cl_khr_local_int32_base_atomics = 1 const CL_KHR_LOCAL_INT32_BASE_ATOMICS_EXTENSION_NAME = "cl_khr_local_int32_base_atomics" @@ -5466,6 +4948,18 @@ const CL_KHR_WORK_GROUP_UNIFORM_ARITHMETIC_EXTENSION_NAME = "cl_khr_work_group_u const CL_KHR_WORK_GROUP_UNIFORM_ARITHMETIC_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) +const cl_ext_buffer_device_address = 1 + +const CL_EXT_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME = "cl_ext_buffer_device_address" + +const CL_EXT_BUFFER_DEVICE_ADDRESS_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 2) + +const CL_MEM_DEVICE_PRIVATE_ADDRESS_EXT = 0x5000 + +const CL_MEM_DEVICE_ADDRESS_EXT = 0x5001 + +const CL_KERNEL_EXEC_INFO_DEVICE_PTRS_EXT = 0x5002 + const cl_ext_image_unorm_int_2_101010 = 1 const CL_EXT_IMAGE_UNORM_INT_2_101010_EXTENSION_NAME = "cl_ext_image_unorm_int_2_101010" @@ -5474,6 +4968,32 @@ const CL_EXT_IMAGE_UNORM_INT_2_101010_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, const CL_UNORM_INT_2_101010_EXT = 0x10e5 +const cl_ext_image_unsigned_10x6_12x4_14x2 = 1 + +const CL_EXT_IMAGE_UNSIGNED_10X6_12X4_14X2_EXTENSION_NAME = "cl_ext_image_unsigned_10x6_12x4_14x2" + +const CL_EXT_IMAGE_UNSIGNED_10X6_12X4_14X2_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) + +const CL_UNSIGNED_INT10X6_EXT = 0x10e6 + +const CL_UNSIGNED_INT12X4_EXT = 0x10e7 + +const CL_UNSIGNED_INT14X2_EXT = 0x10e8 + +const CL_UNORM_INT10X6_EXT = 0x10e1 + +const CL_UNORM_INT12X4_EXT = 0x10e9 + +const CL_UNORM_INT14X2_EXT = 0x10ea + +const cl_ext_immutable_memory_objects = 1 + +const CL_EXT_IMMUTABLE_MEMORY_OBJECTS_EXTENSION_NAME = "cl_ext_immutable_memory_objects" + +const CL_EXT_IMMUTABLE_MEMORY_OBJECTS_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) + +const CL_MEM_IMMUTABLE_EXT = 1 << 6 + const cl_img_cancel_command = 1 const CL_IMG_CANCEL_COMMAND_EXTENSION_NAME = "cl_img_cancel_command" @@ -5481,3 +5001,17 @@ const CL_IMG_CANCEL_COMMAND_EXTENSION_NAME = "cl_img_cancel_command" const CL_IMG_CANCEL_COMMAND_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 0, 0) const CL_CANCELLED_IMG = -1126 + +const cl_qcom_perf_hint = 1 + +const CL_QCOM_PERF_HINT_EXTENSION_NAME = "cl_qcom_perf_hint" + +const CL_QCOM_PERF_HINT_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 5) + +const CL_PERF_HINT_HIGH_QCOM = 0x40c3 + +const CL_PERF_HINT_NORMAL_QCOM = 0x40c4 + +const CL_PERF_HINT_LOW_QCOM = 0x40c5 + +const CL_CONTEXT_PERF_HINT_QCOM = 0x40c2 diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl new file mode 100644 index 00000000..57e7eb38 --- /dev/null +++ b/lib/cl/memory/bda.jl @@ -0,0 +1,146 @@ +struct BufferDeviceMemory <: AbstractMemory + id::cl_mem + ptr::CLPtr{Cvoid} + bytesize::Int + context::Context +end + +BufferDeviceMemory() = BufferDeviceMemory(C_NULL, CL_NULL, 0, context()) + +function bda_alloc(bytesize::Integer; + alignment::Integer = 0, device_access::Symbol = :rw, host_access::Symbol = :rw + ) + bytesize == 0 && return BufferDeviceMemory() + + flags = if device_access == :rw + CL_MEM_READ_WRITE + elseif device_access == :r + CL_MEM_READ_ONLY + elseif device_access == :w + CL_MEM_WRITE_ONLY + else + throw(ArgumentError("Invalid access type")) + end + + if host_access == :rw + # nothing to do + elseif host_access == :r + flags |= CL_MEM_HOST_READ_ONLY + elseif host_access == :w + flags |= CL_MEM_HOST_WRITE_ONLY + elseif host_access == :none + flags |= CL_MEM_HOST_NO_ACCESS + else + throw(ArgumentError("Host access flag must be one of :rw, :r, or :w")) + end + + + err_code = Ref{Cint}() + properties = cl_mem_properties[CL_MEM_DEVICE_PRIVATE_ADDRESS_EXT, CL_TRUE, 0] + mem_id = clCreateBufferWithProperties(context(), properties, flags, bytesize, C_NULL, err_code) + addr = Ref{cl_mem_device_address_ext}() + clGetMemObjectInfo(mem_id, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), addr, C_NULL) + ptr = CLPtr{Cvoid}(addr[]) + @assert ptr != C_NULL + if err_code[] != CL_SUCCESS + throw(CLError(err_code[])) + end + return BufferDeviceMemory(mem_id, ptr, bytesize, context()) +end + +function bda_free(buf::BufferDeviceMemory) + if sizeof(buf) != 0 + clReleaseMemObject(buf.id) + end + return +end + +Base.pointer(buf::BufferDeviceMemory) = buf.ptr +Base.sizeof(buf::BufferDeviceMemory) = buf.bytesize +context(buf::BufferDeviceMemory) = buf.context + +Base.show(io::IO, buf::BufferDeviceMemory) = + @printf(io, "BufferDeviceMemory(%s at %p)", Base.format_bytes(sizeof(buf)), Int(pointer(buf))) + +Base.convert(::Type{Ptr{T}}, buf::BufferDeviceMemory) where {T} = + convert(Ptr{T}, pointer(buf)) + +Base.convert(::Type{CLPtr{T}}, buf::BufferDeviceMemory) where {T} = + reinterpret(CLPtr{T}, pointer(buf)) + +#= +## memory operations + +# these generally only make sense for coarse-grained SVM buffers; +# fine-grained buffers can just be used directly. + +# copy from and to SVM buffers +function enqueue_svm_copy( + + dst::Union{Ptr, CLPtr}, src::Union{Ptr, CLPtr}, nbytes::Integer; queue::CmdQueue = queue(), bloc, C_NULL)ing::Bool = false, + wait_for::Vector{Event} = Event[] + ) + n_evts = length(wait_for) + evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] + return GC.@preserve wait_for begin + ret_evt = Ref{cl_event}() + clEnqueueSVMMemcpy(queue, blocking, dst, src, nbytes, n_evts, evt_ids, ret_evt) + @return_event ret_evt[] + end +end + +# map an SVM buffer into the host address space, returning an event +function enqueue_svm_map( + ptr::Union{Ptr, CLPtr}, nbytes::Integer, flags = :rw; queue::CmdQueue = queue(), blocking::Bool = false, + wait_for::Vector{Event} = Event[] + ) + flags = if flags == :rw + CL_MAP_READ | CL_MAP_WRITE + elseif flags == :r + CL_MAP_READ + elseif flags == :w + CL_MAP_WRITE + else + throw(ArgumentError("enqueue_unmap can have flags of :r, :w, or :rw, got :$flags")) + end + n_evts = length(wait_for) + evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] + GC.@preserve wait_for begin + ret_evt = Ref{cl_event}() + clEnqueueSVMMap( + queue, blocking, flags, ptr, nbytes, + n_evts, evt_ids, ret_evt + ) + + return Event(ret_evt[]) + end +end + +# unmap a buffer, returning an event +function enqueue_svm_unmap(ptr::Union{Ptr, CLPtr}; queue::CmdQueue = queue(), wait_for::Vector{Event} = Event[]) + n_evts = length(wait_for) + evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] + GC.@preserve wait_for begin + ret_evt = Ref{cl_event}() + clEnqueueSVMUnmap(queue, ptr, n_evts, evt_ids, ret_evt) + return Event(ret_evt[]) + end +end + +# fill a buffer with a pattern, returning an event +function enqueue_svm_fill(ptr::Union{Ptr, CLPtr}, pattern::T, N::Integer; + wait_for::Vector{Event}=Event[]) where {T} + nbytes = N * sizeof(T) + nbytes == 0 && return + pattern_size = sizeof(T) + n_evts = length(wait_for) + evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] + GC.@preserve wait_for begin + ret_evt = Ref{cl_event}() + clEnqueueSVMMemFill(queue(), ptr, Ref(pattern), + pattern_size, nbytes, + n_evts, evt_ids, ret_evt) + @return_event ret_evt[] + end +end +=# diff --git a/lib/cl/memory/memory.jl b/lib/cl/memory/memory.jl index 2e128c0f..8f8f65ea 100644 --- a/lib/cl/memory/memory.jl +++ b/lib/cl/memory/memory.jl @@ -1,6 +1,6 @@ # Raw memory management -export device_alloc, host_alloc, shared_alloc, svm_alloc, free +export device_alloc, host_alloc, shared_alloc, svm_alloc, free, bda_alloc # # untyped buffers @@ -17,5 +17,6 @@ Base.convert(T::Type{<:Union{Ptr, CLPtr}}, buf::AbstractMemory) = # and not the pointer of the buffer object itself. Base.unsafe_convert(P::Type{<:Union{Ptr, CLPtr}}, buf::AbstractMemory) = convert(P, buf) +include("bda.jl") include("usm.jl") include("svm.jl") From 458a35dab8eec86fa9a6b0569ec256c22e1414cb Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Mon, 16 Jun 2025 01:54:43 +0900 Subject: [PATCH 02/26] get bda working --- lib/cl/buffer.jl | 16 ++++---- lib/cl/kernel.jl | 8 ++++ lib/cl/memory/bda.jl | 87 +++++++------------------------------------- lib/cl/state.jl | 9 +++-- src/array.jl | 14 ++++++- src/memory.jl | 7 ++++ src/util.jl | 2 + 7 files changed, 58 insertions(+), 85 deletions(-) diff --git a/lib/cl/buffer.jl b/lib/cl/buffer.jl index 045d1218..0d4774f8 100644 --- a/lib/cl/buffer.jl +++ b/lib/cl/buffer.jl @@ -144,7 +144,7 @@ end ## memory operations # reading from buffer to host array, return an event -function enqueue_read(dst::Ptr, src::Buffer, src_off::Int, nbytes::Int; +function enqueue_read(dst::Ptr, src::Union{Buffer, cl_mem}, src_off::Int, nbytes::Int; blocking::Bool=false, wait_for::Vector{Event}=Event[]) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] @@ -155,11 +155,11 @@ function enqueue_read(dst::Ptr, src::Buffer, src_off::Int, nbytes::Int; @return_nanny_event(ret_evt[], dst) end end -enqueue_read(dst::Ptr, src::Buffer, nbytes; kwargs...) = +enqueue_read(dst::Ptr, src::Union{Buffer, cl_mem}, nbytes; kwargs...) = enqueue_read(dst, src, 0, nbytes; kwargs...) # writing from host array to buffer, return an event -function enqueue_write(dst::Buffer, dst_off::Int, src::Ptr, nbytes::Int; +function enqueue_write(dst::Union{Buffer, cl_mem}, dst_off::Int, src::Ptr, nbytes::Int; blocking::Bool=false, wait_for::Vector{Event}=Event[]) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] @@ -170,11 +170,11 @@ function enqueue_write(dst::Buffer, dst_off::Int, src::Ptr, nbytes::Int; @return_nanny_event(ret_evt[], dst) end end -enqueue_write(dst::Buffer, src::Ptr, nbytes; kwargs...) = +enqueue_write(dst::Union{Buffer, cl_mem}, src::Ptr, nbytes; kwargs...) = enqueue_write(dst, 0, src, nbytes; kwargs...) # copying between two buffers, return an event -function enqueue_copy(dst::Buffer, dst_off::Int, src::Buffer, src_off::Int, +function enqueue_copy(dst::Union{Buffer, cl_mem}, dst_off::Int, src::Union{Buffer, cl_mem}, src_off::Int, nbytes::Int; blocking::Bool=false, wait_for::Vector{Event}=Event[]) n_evts = length(wait_for) @@ -186,7 +186,7 @@ function enqueue_copy(dst::Buffer, dst_off::Int, src::Buffer, src_off::Int, @return_event ret_evt[] end end -enqueue_copy(dst::Buffer, src::Buffer, N; kwargs...) = +enqueue_copy(dst::Union{Buffer, cl_mem}, src::Union{Buffer, cl_mem}, N; kwargs...) = enqueue_copy(dst, 0, src, 0, N; kwargs...) # map a buffer into the host address space, returning a pointer and an event @@ -231,7 +231,7 @@ function enqueue_unmap(b::Buffer, ptr::Ptr; wait_for::Vector{Event}=Event[]) end # fill a buffer with a pattern, returning an event -function enqueue_fill(b::Buffer, offset::Integer, pattern::T, N::Integer; +function enqueue_fill(b::Union{Buffer, cl_mem}, offset::Integer, pattern::T, N::Integer; wait_for::Vector{Event}=Event[]) where {T} nbytes = N * sizeof(T) nbytes_pattern = sizeof(T) @@ -246,4 +246,4 @@ function enqueue_fill(b::Buffer, offset::Integer, pattern::T, N::Integer; @return_event ret_evt[] end end -enqueue_fill(b::Buffer, pattern, N::Integer) = enqueue_fill(b, 0, pattern, N) +enqueue_fill(b::Union{Buffer, cl_mem}, pattern, N::Integer) = enqueue_fill(b, 0, pattern, N) diff --git a/lib/cl/kernel.jl b/lib/cl/kernel.jl index 6d78972e..60f63cf9 100644 --- a/lib/cl/kernel.jl +++ b/lib/cl/kernel.jl @@ -79,6 +79,8 @@ function set_arg!(k::Kernel, idx::Integer, arg::AbstractMemory) clSetKernelArgSVMPointer(k, idx - 1, pointer(arg)) elseif arg isa UnifiedMemory clSetKernelArgMemPointerINTEL(k, idx - 1, pointer(arg)) + elseif arg isa BufferDeviceMemory + clSetKernelArgDevicePointerEXT(k, idx - 1, pointer(arg)) else error("Unknown memory type") end @@ -191,6 +193,7 @@ function call( if !isempty(indirect_memory) svm_pointers = CLPtr{Cvoid}[] usm_pointers = CLPtr{Cvoid}[] + bda_pointers = CLPtr{Cvoid}[] device_access = host_access = shared_access = false for memory in indirect_memory ptr = pointer(memory) @@ -200,6 +203,8 @@ function call( if memory isa SharedVirtualMemory push!(svm_pointers, ptr) + elseif memory isa BufferDeviceMemory + push!(bda_pointers, ptr) elseif memory isa UnifiedDeviceMemory device_access = true push!(usm_pointers, ptr) @@ -229,6 +234,9 @@ function call( if !isempty(svm_pointers) clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS, sizeof(svm_pointers), svm_pointers) end + if !isempty(bda_pointers) + clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_DEVICE_PTRS_EXT, sizeof(bda_pointers), bda_pointers) + end if !isempty(usm_pointers) clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(usm_pointers), usm_pointers) end diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl index 57e7eb38..000f0e0f 100644 --- a/lib/cl/memory/bda.jl +++ b/lib/cl/memory/bda.jl @@ -68,79 +68,20 @@ Base.convert(::Type{Ptr{T}}, buf::BufferDeviceMemory) where {T} = Base.convert(::Type{CLPtr{T}}, buf::BufferDeviceMemory) where {T} = reinterpret(CLPtr{T}, pointer(buf)) -#= -## memory operations +enqueue_bda_copy(dst::Ptr, src::cl_mem, nbytes; kwargs...) = + enqueue_read(dst, src, nbytes; kwargs...) -# these generally only make sense for coarse-grained SVM buffers; -# fine-grained buffers can just be used directly. +enqueue_bda_copy(dst::cl_mem, src::Ptr, nbytes; kwargs...) = + enqueue_write(dst, src, nbytes; kwargs...) -# copy from and to SVM buffers -function enqueue_svm_copy( - - dst::Union{Ptr, CLPtr}, src::Union{Ptr, CLPtr}, nbytes::Integer; queue::CmdQueue = queue(), bloc, C_NULL)ing::Bool = false, - wait_for::Vector{Event} = Event[] - ) - n_evts = length(wait_for) - evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] - return GC.@preserve wait_for begin - ret_evt = Ref{cl_event}() - clEnqueueSVMMemcpy(queue, blocking, dst, src, nbytes, n_evts, evt_ids, ret_evt) - @return_event ret_evt[] - end -end - -# map an SVM buffer into the host address space, returning an event -function enqueue_svm_map( - ptr::Union{Ptr, CLPtr}, nbytes::Integer, flags = :rw; queue::CmdQueue = queue(), blocking::Bool = false, - wait_for::Vector{Event} = Event[] - ) - flags = if flags == :rw - CL_MAP_READ | CL_MAP_WRITE - elseif flags == :r - CL_MAP_READ - elseif flags == :w - CL_MAP_WRITE - else - throw(ArgumentError("enqueue_unmap can have flags of :r, :w, or :rw, got :$flags")) - end - n_evts = length(wait_for) - evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] - GC.@preserve wait_for begin - ret_evt = Ref{cl_event}() - clEnqueueSVMMap( - queue, blocking, flags, ptr, nbytes, - n_evts, evt_ids, ret_evt - ) - - return Event(ret_evt[]) - end -end - -# unmap a buffer, returning an event -function enqueue_svm_unmap(ptr::Union{Ptr, CLPtr}; queue::CmdQueue = queue(), wait_for::Vector{Event} = Event[]) - n_evts = length(wait_for) - evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] - GC.@preserve wait_for begin - ret_evt = Ref{cl_event}() - clEnqueueSVMUnmap(queue, ptr, n_evts, evt_ids, ret_evt) - return Event(ret_evt[]) - end -end +enqueue_bda_copy(dst::cl_mem, src::cl_mem, nbytes; kwargs...) = + enqueue_copy(dst, src, nbytes; kwargs...) + +enqueue_bda_copy(dst::Ptr, dst_off::Int, src::cl_mem, src_off::Int, nbytes; kwargs...) = + enqueue_read(dst, src, src_off, nbytes; kwargs...) + +enqueue_bda_copy(dst::cl_mem, dst_off::Int, src::Ptr, src_off::Int, nbytes; kwargs...) = + enqueue_write(dst, dst_off, src, nbytes; kwargs...) -# fill a buffer with a pattern, returning an event -function enqueue_svm_fill(ptr::Union{Ptr, CLPtr}, pattern::T, N::Integer; - wait_for::Vector{Event}=Event[]) where {T} - nbytes = N * sizeof(T) - nbytes == 0 && return - pattern_size = sizeof(T) - n_evts = length(wait_for) - evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] - GC.@preserve wait_for begin - ret_evt = Ref{cl_event}() - clEnqueueSVMMemFill(queue(), ptr, Ref(pattern), - pattern_size, nbytes, - n_evts, evt_ids, ret_evt) - @return_event ret_evt[] - end -end -=# +enqueue_bda_copy(dst::cl_mem, dst_off::Int, src::cl_mem, src_off::Int, nbytes; kwargs...) = + enqueue_copy(dst, dst_off, src, src_off, nbytes; kwargs...) diff --git a/lib/cl/state.jl b/lib/cl/state.jl index a3628d49..489664c9 100644 --- a/lib/cl/state.jl +++ b/lib/cl/state.jl @@ -161,6 +161,7 @@ end abstract type AbstractMemoryBackend end struct SVMBackend <: AbstractMemoryBackend end struct USMBackend <: AbstractMemoryBackend end +struct BDABackend <: AbstractMemoryBackend end function default_memory_backend(dev::Device) # determine if USM is supported @@ -171,15 +172,17 @@ function default_memory_backend(dev::Device) false end + bda = bda_supported(dev) + # determine if SVM is available (if needed) - if !usm + if !usm && !bda caps = svm_capabilities(dev) if !caps.coarse_grain_buffer - error("Device $dev does not support USM or coarse-grained SVM, either of which is required by OpenCL.jl") + error("Device $dev does not support USM, coarse-grained SVM, or Buffer Device Address, one of which is required by OpenCL.jl") end end - usm ? USMBackend() : SVMBackend() + usm ? USMBackend() : (bda ? BDABackend : SVMBackend()) end function memory_backend() diff --git a/src/array.jl b/src/array.jl index fa8d1572..ba2cbd6f 100644 --- a/src/array.jl +++ b/src/array.jl @@ -96,8 +96,10 @@ const CLVecOrMat{T} = Union{CLVector{T}, CLMatrix{T}} function memory_type() if cl.memory_backend() == cl.USMBackend() return cl.UnifiedDeviceMemory - else + elseif cl.memory_backend() == cl.SVMBackend() return cl.SharedVirtualMemory + else + return cl.BufferDeviceMemory end end CLArray{T, N}(::UndefInitializer, dims::Dims{N}) where {T, N} = @@ -177,6 +179,7 @@ is_device(a::CLArray) = memtype(a) == cl.UnifiedDeviceMemory is_shared(a::CLArray) = memtype(a) == cl.UnifiedSharedMemory is_host(a::CLArray) = memtype(a) == cl.UnifiedHostMemory is_svm(a::CLArray) = memtype(a) == cl.SharedVirtualMemory +is_bda(a::CLArray) = memtype(a) == cl.BufferDeviceMemory ## derived types @@ -379,6 +382,10 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr cl.context!(context(device_array)) do if memtype(device_array) == cl.SharedVirtualMemory cl.enqueue_svm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) + elseif memtype(device_array) == cl.BufferDeviceMemory + dstptr = dst isa Array ? pointer(dst, dst_off) : dst.data[].mem.id + srcptr = src isa Array ? pointer(src, src_off) : src.data[].mem.id + cl.enqueue_bda_copy(dstptr, 0, srcptr, 0, nbytes; blocking) else cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) end @@ -421,6 +428,8 @@ function Base.fill!(A::DenseCLArray{T}, val) where {T} GC.@preserve A begin if memtype(A) == cl.SharedVirtualMemory cl.enqueue_svm_fill(pointer(A), convert(T, val), length(A)) + elseif memtype(A) == cl.BufferDeviceMemory + cl.enqueue_fill(A.data[].mem.id, convert(T, val), length(A)) else cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A)) end @@ -500,6 +509,9 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} GC.@preserve a begin if memtype(a) == cl.SharedVirtualMemory cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) + elseif memtype(a) == cl.BufferDeviceMemory + @warn "resizing" + cl.enqueue_bda_copy(mem.id, a.data[].mem.id, nbytes; blocking) else cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) end diff --git a/src/memory.jl b/src/memory.jl index b975d142..d695beb1 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -134,6 +134,11 @@ function alloc(::Type{cl.SharedVirtualMemory}, bytes::Int; alignment::Int = 0) return Managed(mem) end +function alloc(::Type{cl.BufferDeviceMemory}, bytes::Int; alignment::Int = 0) + mem = cl.bda_alloc(bytes; alignment) + return Managed(mem) +end + function free(managed::Managed{<:cl.AbstractMemory}) mem = managed.mem cl.context!(cl.context(mem)) do @@ -148,6 +153,8 @@ function free(managed::Managed{<:cl.AbstractMemory}) if mem isa cl.SharedVirtualMemory cl.svm_free(mem) + elseif mem isa cl.BufferDeviceMemory + cl.bda_free(mem) else cl.usm_free(mem) end diff --git a/src/util.jl b/src/util.jl index 85baf052..fec47695 100644 --- a/src/util.jl +++ b/src/util.jl @@ -71,6 +71,8 @@ function versioninfo(io::IO=stdout) backend = cl.default_memory_backend(device) if backend == cl.SVMBackend() push!(tags, "svm") + elseif backend == cl.BDABackend() + push!(tags, "bda") elseif backend == cl.USMBackend() push!(tags, "usm") end From 8fc14624ff84afd47ffcda1f1f0180f7823aa632 Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Mon, 16 Jun 2025 02:36:01 +0900 Subject: [PATCH 03/26] remove debug --- src/array.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index ba2cbd6f..9a222539 100644 --- a/src/array.jl +++ b/src/array.jl @@ -510,7 +510,6 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} if memtype(a) == cl.SharedVirtualMemory cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) elseif memtype(a) == cl.BufferDeviceMemory - @warn "resizing" cl.enqueue_bda_copy(mem.id, a.data[].mem.id, nbytes; blocking) else cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) From 5053b77c650c97ae5f050bc04e42adc032e07fcb Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Mon, 16 Jun 2025 02:56:55 +0900 Subject: [PATCH 04/26] fix resize --- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 9a222539..1bf10b63 100644 --- a/src/array.jl +++ b/src/array.jl @@ -510,7 +510,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} if memtype(a) == cl.SharedVirtualMemory cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) elseif memtype(a) == cl.BufferDeviceMemory - cl.enqueue_bda_copy(mem.id, a.data[].mem.id, nbytes; blocking) + cl.enqueue_bda_copy(mem.mem.id, a.data[].mem.id, m*sizeof(T); blocking=false) else cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) end From dfa773a9580f66ffd05606e4046bcd8ee800ea2d Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Mon, 16 Jun 2025 20:08:40 +0900 Subject: [PATCH 05/26] get indexing working --- src/array.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/array.jl b/src/array.jl index 1bf10b63..b551d4df 100644 --- a/src/array.jl +++ b/src/array.jl @@ -383,9 +383,9 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr if memtype(device_array) == cl.SharedVirtualMemory cl.enqueue_svm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) elseif memtype(device_array) == cl.BufferDeviceMemory - dstptr = dst isa Array ? pointer(dst, dst_off) : dst.data[].mem.id - srcptr = src isa Array ? pointer(src, src_off) : src.data[].mem.id - cl.enqueue_bda_copy(dstptr, 0, srcptr, 0, nbytes; blocking) + dstptr = dst isa Array ? pointer(dst) : dst.data[].mem.id + srcptr = src isa Array ? pointer(src) : src.data[].mem.id + cl.enqueue_bda_copy(dstptr, (dst_off - 1) * sizeof(T), srcptr, (src_off - 1) * sizeof(T), nbytes; blocking) else cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) end From 5a5559bbe44c72853ef814ffa359885caed60e0f Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Tue, 17 Jun 2025 18:33:34 +0900 Subject: [PATCH 06/26] get bda to use Managed properly --- lib/cl/memory/bda.jl | 22 ++-------------------- lib/cl/state.jl | 16 ++++++++++++---- src/array.jl | 16 ++++++++++------ src/memory.jl | 4 ++-- test/setup.jl | 2 +- 5 files changed, 27 insertions(+), 33 deletions(-) diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl index 000f0e0f..0bb2525f 100644 --- a/lib/cl/memory/bda.jl +++ b/lib/cl/memory/bda.jl @@ -5,12 +5,10 @@ struct BufferDeviceMemory <: AbstractMemory context::Context end -BufferDeviceMemory() = BufferDeviceMemory(C_NULL, CL_NULL, 0, context()) - function bda_alloc(bytesize::Integer; alignment::Integer = 0, device_access::Symbol = :rw, host_access::Symbol = :rw ) - bytesize == 0 && return BufferDeviceMemory() + bytesize == 0 && error("size 0 is not supported for BufferDeviceMemory.") flags = if device_access == :rw CL_MEM_READ_WRITE @@ -68,20 +66,4 @@ Base.convert(::Type{Ptr{T}}, buf::BufferDeviceMemory) where {T} = Base.convert(::Type{CLPtr{T}}, buf::BufferDeviceMemory) where {T} = reinterpret(CLPtr{T}, pointer(buf)) -enqueue_bda_copy(dst::Ptr, src::cl_mem, nbytes; kwargs...) = - enqueue_read(dst, src, nbytes; kwargs...) - -enqueue_bda_copy(dst::cl_mem, src::Ptr, nbytes; kwargs...) = - enqueue_write(dst, src, nbytes; kwargs...) - -enqueue_bda_copy(dst::cl_mem, src::cl_mem, nbytes; kwargs...) = - enqueue_copy(dst, src, nbytes; kwargs...) - -enqueue_bda_copy(dst::Ptr, dst_off::Int, src::cl_mem, src_off::Int, nbytes; kwargs...) = - enqueue_read(dst, src, src_off, nbytes; kwargs...) - -enqueue_bda_copy(dst::cl_mem, dst_off::Int, src::Ptr, src_off::Int, nbytes; kwargs...) = - enqueue_write(dst, dst_off, src, nbytes; kwargs...) - -enqueue_bda_copy(dst::cl_mem, dst_off::Int, src::cl_mem, src_off::Int, nbytes; kwargs...) = - enqueue_copy(dst, dst_off, src, src_off, nbytes; kwargs...) +Base.convert(::Type{cl_mem}, buf::BufferDeviceMemory) = buf.id diff --git a/lib/cl/state.jl b/lib/cl/state.jl index 489664c9..4cc9a680 100644 --- a/lib/cl/state.jl +++ b/lib/cl/state.jl @@ -175,14 +175,22 @@ function default_memory_backend(dev::Device) bda = bda_supported(dev) # determine if SVM is available (if needed) - if !usm && !bda + svm = let caps = svm_capabilities(dev) - if !caps.coarse_grain_buffer + caps.coarse_grain_buffer + end + + if usm + USMBackend() + else + if svm + SVMBackend() + elseif bda + BDABackend() + else error("Device $dev does not support USM, coarse-grained SVM, or Buffer Device Address, one of which is required by OpenCL.jl") end end - - usm ? USMBackend() : (bda ? BDABackend : SVMBackend()) end function memory_backend() diff --git a/src/array.jl b/src/array.jl index b551d4df..dc471cba 100644 --- a/src/array.jl +++ b/src/array.jl @@ -98,7 +98,7 @@ function memory_type() return cl.UnifiedDeviceMemory elseif cl.memory_backend() == cl.SVMBackend() return cl.SharedVirtualMemory - else + elseif cl.memory_backend() == cl.BDABackend() return cl.BufferDeviceMemory end end @@ -383,9 +383,13 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr if memtype(device_array) == cl.SharedVirtualMemory cl.enqueue_svm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) elseif memtype(device_array) == cl.BufferDeviceMemory - dstptr = dst isa Array ? pointer(dst) : dst.data[].mem.id - srcptr = src isa Array ? pointer(src) : src.data[].mem.id - cl.enqueue_bda_copy(dstptr, (dst_off - 1) * sizeof(T), srcptr, (src_off - 1) * sizeof(T), nbytes; blocking) + if src isa CLArray && dst isa CLArray + cl.enqueue_copy(convert(cl.cl_mem, dst.data[]), (dst_off - 1) * sizeof(T), convert(cl.cl_mem, src.data[]), (src_off - 1) * sizeof(T), nbytes; blocking) + elseif dst isa CLArray + cl.enqueue_write(convert(cl.cl_mem, dst.data[]), (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) + elseif src isa CLArray + cl.enqueue_read(pointer(dst, dst_off), convert(cl.cl_mem, src.data[]), (src_off - 1) * sizeof(T), nbytes; blocking) + end else cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) end @@ -429,7 +433,7 @@ function Base.fill!(A::DenseCLArray{T}, val) where {T} if memtype(A) == cl.SharedVirtualMemory cl.enqueue_svm_fill(pointer(A), convert(T, val), length(A)) elseif memtype(A) == cl.BufferDeviceMemory - cl.enqueue_fill(A.data[].mem.id, convert(T, val), length(A)) + cl.enqueue_fill(convert(cl.cl_mem, A.data[]), convert(T, val), length(A)) else cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A)) end @@ -510,7 +514,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} if memtype(a) == cl.SharedVirtualMemory cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) elseif memtype(a) == cl.BufferDeviceMemory - cl.enqueue_bda_copy(mem.mem.id, a.data[].mem.id, m*sizeof(T); blocking=false) + cl.enqueue_copy(convert(cl.cl_mem, mem), convert(cl.cl_mem, a.data[]), m*sizeof(T); blocking=false) else cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) end diff --git a/src/memory.jl b/src/memory.jl index d695beb1..693e419f 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -38,9 +38,9 @@ function maybe_synchronize(managed::Managed) return nothing end -function Base.convert(::Type{CLPtr{T}}, managed::Managed{M}) where {T, M} +function Base.convert(t::Union{Type{CLPtr{T}}, Type{cl.cl_mem}}, managed::Managed{M}) where {T, M} # let null pointers pass through as-is - ptr = convert(CLPtr{T}, managed.mem) + ptr = convert(t, managed.mem) if ptr == cl.CL_NULL return ptr end diff --git a/test/setup.jl b/test/setup.jl index b0e0c4cd..994150c4 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,5 +1,5 @@ using Distributed, Test -using OpenCL, pocl_jll +using OpenCL using IOCapture # KernelAbstractions has a testsuite that isn't part of the main package. From 558c9b356caf1d1126e5484133d59f4adb227ca0 Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Tue, 17 Jun 2025 18:34:40 +0900 Subject: [PATCH 07/26] typo: bring back pocl_jll for tests --- test/setup.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/setup.jl b/test/setup.jl index 994150c4..b0e0c4cd 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,5 +1,5 @@ using Distributed, Test -using OpenCL +using OpenCL, pocl_jll using IOCapture # KernelAbstractions has a testsuite that isn't part of the main package. From 9d1f5dc3d41f091eaef8e8442d360b9511cfcc01 Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Tue, 17 Jun 2025 22:43:52 +0900 Subject: [PATCH 08/26] spoof 0 sized buffer device arrays --- lib/cl/memory/bda.jl | 36 +++++++++++++++++++++++++++++++++++- src/array.jl | 10 +++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl index 0bb2525f..ce61d52b 100644 --- a/lib/cl/memory/bda.jl +++ b/lib/cl/memory/bda.jl @@ -5,10 +5,12 @@ struct BufferDeviceMemory <: AbstractMemory context::Context end +BufferDeviceMemory() = BufferDeviceMemory(C_NULL, CL_NULL, 0, context()) + function bda_alloc(bytesize::Integer; alignment::Integer = 0, device_access::Symbol = :rw, host_access::Symbol = :rw ) - bytesize == 0 && error("size 0 is not supported for BufferDeviceMemory.") + bytesize == 0 && return BufferDeviceMemory() flags = if device_access == :rw CL_MEM_READ_WRITE @@ -67,3 +69,35 @@ Base.convert(::Type{CLPtr{T}}, buf::BufferDeviceMemory) where {T} = reinterpret(CLPtr{T}, pointer(buf)) Base.convert(::Type{cl_mem}, buf::BufferDeviceMemory) = buf.id + +function enqueue_bda_copy(dst, dst_off, src, src_off, nbytes; kwargs...) + if nbytes == 0 || dst == C_NULL || src == C_NULL + return nothing + else + enqueue_copy(dst, dst_off, src, src_off, nbytes; kwargs...) + end +end + +function enqueue_bda_read(dst, src, src_off, nbytes; kwargs...) + if nbytes == 0 || src == C_NULL + return nothing + else + enqueue_read(dst, src, src_off, nbytes; kwargs...) + end +end + +function enqueue_bda_write(dst, dst_off, src, nbytes; kwargs...) + if nbytes == 0 || dst == C_NULL + return nothing + else + enqueue_write(dst, dst_off, src, nbytes; kwargs...) + end +end + +function enqueue_bda_fill(b, offset, pattern, N; kwargs...) + if b == C_NULL + return nothing + else + enqueue_fill(b, offset, pattern, N; kwargs...) + end +end diff --git a/src/array.jl b/src/array.jl index dc471cba..9d84901b 100644 --- a/src/array.jl +++ b/src/array.jl @@ -384,11 +384,11 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr cl.enqueue_svm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) elseif memtype(device_array) == cl.BufferDeviceMemory if src isa CLArray && dst isa CLArray - cl.enqueue_copy(convert(cl.cl_mem, dst.data[]), (dst_off - 1) * sizeof(T), convert(cl.cl_mem, src.data[]), (src_off - 1) * sizeof(T), nbytes; blocking) + cl.enqueue_bda_copy(convert(cl.cl_mem, dst.data[]), (dst_off - 1) * sizeof(T), convert(cl.cl_mem, src.data[]), (src_off - 1) * sizeof(T), nbytes; blocking) elseif dst isa CLArray - cl.enqueue_write(convert(cl.cl_mem, dst.data[]), (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) + cl.enqueue_bda_write(convert(cl.cl_mem, dst.data[]), (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) elseif src isa CLArray - cl.enqueue_read(pointer(dst, dst_off), convert(cl.cl_mem, src.data[]), (src_off - 1) * sizeof(T), nbytes; blocking) + cl.enqueue_bda_read(pointer(dst, dst_off), convert(cl.cl_mem, src.data[]), (src_off - 1) * sizeof(T), nbytes; blocking) end else cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) @@ -433,7 +433,7 @@ function Base.fill!(A::DenseCLArray{T}, val) where {T} if memtype(A) == cl.SharedVirtualMemory cl.enqueue_svm_fill(pointer(A), convert(T, val), length(A)) elseif memtype(A) == cl.BufferDeviceMemory - cl.enqueue_fill(convert(cl.cl_mem, A.data[]), convert(T, val), length(A)) + cl.enqueue_bda_fill(convert(cl.cl_mem, A.data[]), 0, convert(T, val), length(A)) else cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A)) end @@ -514,7 +514,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} if memtype(a) == cl.SharedVirtualMemory cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) elseif memtype(a) == cl.BufferDeviceMemory - cl.enqueue_copy(convert(cl.cl_mem, mem), convert(cl.cl_mem, a.data[]), m*sizeof(T); blocking=false) + cl.enqueue_bda_copy(convert(cl.cl_mem, mem), 0, convert(cl.cl_mem, a.data[]), 0, m*sizeof(T); blocking=false) else cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) end From 19b4061a7fc995edc9b01d8556b2ba1096d84952 Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Tue, 17 Jun 2025 23:32:17 +0900 Subject: [PATCH 09/26] cleaner 0 sized buffers --- lib/cl/memory/bda.jl | 1 - lib/cl/memory/svm.jl | 1 - lib/cl/memory/usm.jl | 3 --- src/array.jl | 4 ++-- src/memory.jl | 7 +++++++ 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl index ce61d52b..6cbd0bbc 100644 --- a/lib/cl/memory/bda.jl +++ b/lib/cl/memory/bda.jl @@ -10,7 +10,6 @@ BufferDeviceMemory() = BufferDeviceMemory(C_NULL, CL_NULL, 0, context()) function bda_alloc(bytesize::Integer; alignment::Integer = 0, device_access::Symbol = :rw, host_access::Symbol = :rw ) - bytesize == 0 && return BufferDeviceMemory() flags = if device_access == :rw CL_MEM_READ_WRITE diff --git a/lib/cl/memory/svm.jl b/lib/cl/memory/svm.jl index cbe6f487..a00ba83a 100644 --- a/lib/cl/memory/svm.jl +++ b/lib/cl/memory/svm.jl @@ -9,7 +9,6 @@ SharedVirtualMemory() = SharedVirtualMemory(CL_NULL, 0, context()) function svm_alloc(bytesize::Integer; alignment::Integer = 0, access::Symbol = :rw, fine_grained = false ) - bytesize == 0 && return SharedVirtualMemory() flags = if access == :rw CL_MEM_READ_WRITE diff --git a/lib/cl/memory/usm.jl b/lib/cl/memory/usm.jl index 86f56455..720efa45 100644 --- a/lib/cl/memory/usm.jl +++ b/lib/cl/memory/usm.jl @@ -31,7 +31,6 @@ UnifiedDeviceMemory() = UnifiedDeviceMemory(CL_NULL, 0, context()) function device_alloc(bytesize::Integer; alignment::Integer = 0, write_combined::Bool = false ) - bytesize == 0 && return UnifiedDeviceMemory() flags = 0 if write_combined @@ -78,7 +77,6 @@ UnifiedHostMemory() = UnifiedHostMemory(C_NULL, 0, context()) function host_alloc(bytesize::Integer; alignment::Integer = 0, write_combined::Bool = false ) - bytesize == 0 && return UnifiedHostMemory() flags = 0 if write_combined @@ -124,7 +122,6 @@ UnifiedSharedMemory() = UnifiedSharedMemory(CL_NULL, 0, context()) function shared_alloc(bytesize::Integer; alignment::Integer = 0, write_combined = false, placement = nothing ) - bytesize == 0 && return UnifiedSharedMemory() flags = 0 if write_combined diff --git a/src/array.jl b/src/array.jl index 9d84901b..5e17bfa2 100644 --- a/src/array.jl +++ b/src/array.jl @@ -52,7 +52,7 @@ mutable struct CLArray{T, N, M} <: AbstractGPUArray{T, N} maxsize end data = GPUArrays.cached_alloc((CLArray, cl.context(), M, bufsize)) do - buf = alloc(M, bufsize; alignment=Base.datatype_alignment(T)) + buf = managed_alloc(M, bufsize; alignment=Base.datatype_alignment(T)) DataRef(free, buf) end obj = new{T, N, M}(data, maxsize, 0, dims) @@ -506,7 +506,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} # replace the data with a new CL. this 'unshares' the array. # as a result, we can safely support resizing unowned buffers. new_data = cl.context!(context(a)) do - mem = alloc(memtype(a), bufsize; alignment=Base.datatype_alignment(T)) + mem = managed_alloc(memtype(a), bufsize; alignment=Base.datatype_alignment(T)) ptr = convert(CLPtr{T}, mem) m = min(length(a), n) if m > 0 diff --git a/src/memory.jl b/src/memory.jl index 693e419f..8fc20873 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -112,6 +112,13 @@ end ## public interface +function managed_alloc(t::Type{T}, bytes::Int; kwargs...) where T <: cl.AbstractMemory + if bytes == 0 + return Managed(T()) + else + alloc(t, bytes; kwargs...) + end +end function alloc(::Type{cl.UnifiedDeviceMemory}, bytes::Int; alignment::Int = 0) mem = cl.device_alloc(bytes; alignment) From f0db23d4ff2055ca49a4408df525aa1ceb0dcada Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Thu, 19 Jun 2025 14:15:44 +0900 Subject: [PATCH 10/26] fixed offsets and views --- lib/cl/state.jl | 11 +++++++++++ src/array.jl | 10 +++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/lib/cl/state.jl b/lib/cl/state.jl index 4cc9a680..3a9cfb4d 100644 --- a/lib/cl/state.jl +++ b/lib/cl/state.jl @@ -180,6 +180,17 @@ function default_memory_backend(dev::Device) caps.coarse_grain_buffer end + if haskey(ENV, "JULIA_OPENCL_BACKEND") && ENV["JULIA_OPENCL_BACKEND"] in ["usm", "bda", "svm"] + user_backend = ENV["JULIA_OPENCL_BACKEND"] + if user_backend == "usm" && usm + return USMBackend() + elseif user_backend == "bda" && bda + return BDABackend() + elseif user_backend == "svm" && svm + return SVMBackend() + end + end + if usm USMBackend() else diff --git a/src/array.jl b/src/array.jl index 5e17bfa2..f7390b6d 100644 --- a/src/array.jl +++ b/src/array.jl @@ -384,11 +384,11 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr cl.enqueue_svm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) elseif memtype(device_array) == cl.BufferDeviceMemory if src isa CLArray && dst isa CLArray - cl.enqueue_bda_copy(convert(cl.cl_mem, dst.data[]), (dst_off - 1) * sizeof(T), convert(cl.cl_mem, src.data[]), (src_off - 1) * sizeof(T), nbytes; blocking) + cl.enqueue_bda_copy(convert(cl.cl_mem, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), convert(cl.cl_mem, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) elseif dst isa CLArray - cl.enqueue_bda_write(convert(cl.cl_mem, dst.data[]), (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) + cl.enqueue_bda_write(convert(cl.cl_mem, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) elseif src isa CLArray - cl.enqueue_bda_read(pointer(dst, dst_off), convert(cl.cl_mem, src.data[]), (src_off - 1) * sizeof(T), nbytes; blocking) + cl.enqueue_bda_read(pointer(dst, dst_off), convert(cl.cl_mem, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) end else cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) @@ -433,7 +433,7 @@ function Base.fill!(A::DenseCLArray{T}, val) where {T} if memtype(A) == cl.SharedVirtualMemory cl.enqueue_svm_fill(pointer(A), convert(T, val), length(A)) elseif memtype(A) == cl.BufferDeviceMemory - cl.enqueue_bda_fill(convert(cl.cl_mem, A.data[]), 0, convert(T, val), length(A)) + cl.enqueue_bda_fill(convert(cl.cl_mem, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A)) else cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A)) end @@ -514,7 +514,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} if memtype(a) == cl.SharedVirtualMemory cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) elseif memtype(a) == cl.BufferDeviceMemory - cl.enqueue_bda_copy(convert(cl.cl_mem, mem), 0, convert(cl.cl_mem, a.data[]), 0, m*sizeof(T); blocking=false) + cl.enqueue_bda_copy(convert(cl.cl_mem, mem), 0, convert(cl.cl_mem, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false) else cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) end From c98d60bfbfeacb0025444f81377ff665f5fb4b46 Mon Sep 17 00:00:00 2001 From: Chetan Vardhan Date: Thu, 19 Jun 2025 17:11:46 +0900 Subject: [PATCH 11/26] update libopencl according to latest headers --- lib/cl/libopencl.jl | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/lib/cl/libopencl.jl b/lib/cl/libopencl.jl index f34b1bb6..1886fcd6 100644 --- a/lib/cl/libopencl.jl +++ b/lib/cl/libopencl.jl @@ -2240,6 +2240,8 @@ const clSetContentSizeBufferPoCL_fn = Ptr{clSetContentSizeBufferPoCL_t} content_size_buffer::cl_mem)::cl_int end +const cl_device_kernel_clock_capabilities_khr = cl_bitfield + const cl_mem_device_address_ext = cl_ulong # typedef cl_int CL_API_CALL clSetKernelArgDevicePointerEXT_t ( cl_kernel kernel , cl_uint arg_index , cl_mem_device_address_ext arg_value ) @@ -4846,6 +4848,20 @@ const CL_KHR_INT64_EXTENDED_ATOMICS_EXTENSION_NAME = "cl_khr_int64_extended_atom const CL_KHR_INT64_EXTENDED_ATOMICS_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) +const cl_khr_kernel_clock = 1 + +const CL_KHR_KERNEL_CLOCK_EXTENSION_NAME = "cl_khr_kernel_clock" + +const CL_KHR_KERNEL_CLOCK_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) + +const CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR = 0x1076 + +const CL_DEVICE_KERNEL_CLOCK_SCOPE_DEVICE_KHR = 1 << 0 + +const CL_DEVICE_KERNEL_CLOCK_SCOPE_WORK_GROUP_KHR = 1 << 1 + +const CL_DEVICE_KERNEL_CLOCK_SCOPE_SUB_GROUP_KHR = 1 << 2 + const cl_khr_local_int32_base_atomics = 1 const CL_KHR_LOCAL_INT32_BASE_ATOMICS_EXTENSION_NAME = "cl_khr_local_int32_base_atomics" @@ -4888,6 +4904,18 @@ const CL_KHR_SPIRV_NO_INTEGER_WRAP_DECORATION_EXTENSION_NAME = "cl_khr_spirv_no_ const CL_KHR_SPIRV_NO_INTEGER_WRAP_DECORATION_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) +const cl_khr_spirv_queries = 1 + +const CL_KHR_SPIRV_QUERIES_EXTENSION_NAME = "cl_khr_spirv_queries" + +const CL_KHR_SPIRV_QUERIES_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) + +const CL_DEVICE_SPIRV_EXTENDED_INSTRUCTION_SETS_KHR = 0x12b9 + +const CL_DEVICE_SPIRV_EXTENSIONS_KHR = 0x12ba + +const CL_DEVICE_SPIRV_CAPABILITIES_KHR = 0x12bb + const cl_khr_srgb_image_writes = 1 const CL_KHR_SRGB_IMAGE_WRITES_EXTENSION_NAME = "cl_khr_srgb_image_writes" From 7eea327482e3151bf4e72beeedbd6f555970986a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 10:12:42 +0200 Subject: [PATCH 12/26] Switch to preferences. --- .github/workflows/Test.yml | 4 ++++ .gitignore | 1 - LocalPreferences.toml | 7 +++++++ Project.toml | 2 ++ lib/cl/CL.jl | 2 ++ lib/cl/state.jl | 41 ++++++++++++++++++++------------------ src/OpenCL.jl | 1 + src/util.jl | 13 ++++++++++++ 8 files changed, 51 insertions(+), 20 deletions(-) create mode 100644 LocalPreferences.toml diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index d8d9eb2c..ec454e4c 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -26,7 +26,9 @@ jobs: os: [ubuntu-24.04, ubuntu-24.04-arm, macOS-13, macOS-15, windows-2025] arch: [x64, arm64] pocl: [jll, local] + memory_backend: [usm, bda, svm] exclude: + # unsupported combinations - os: ubuntu-24.04 arch: arm64 - os: windows-2025 @@ -130,6 +132,8 @@ jobs: - name: Setup OpenCL.jl run: | + echo '[OpenCL]' > test/LocalPreferences.toml + echo 'memory_backend="${{ matrix.memory_backend }}"' >> test/LocalPreferences.toml julia --project -e ' using Pkg Pkg.develop(path="lib/intrinsics")' diff --git a/.gitignore b/.gitignore index 3819a7de..ba39cc53 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1 @@ Manifest.toml -LocalPreferences.toml diff --git a/LocalPreferences.toml b/LocalPreferences.toml new file mode 100644 index 00000000..c7da6de0 --- /dev/null +++ b/LocalPreferences.toml @@ -0,0 +1,7 @@ +[OpenCL] +# which memory back-end to use for CLArray allocations. This can be: +# - "auto" (default): automatically selects the best available backend +# - "usm": Unified Shared Memory (`cl_intel_unified_shared_memory`) +# - "bda": Buffer Device Address (`cl_mem` + `cl_ext_buffer_device_address`) +# - "svm": Shared Virtual Memory (coarse-grained) +#memory_backend="auto" diff --git a/Project.toml b/Project.toml index d3fd72d4..b914bd7d 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4" +Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -26,6 +27,7 @@ KernelAbstractions = "0.9.2" LLVM = "9.1" LinearAlgebra = "1" OpenCL_jll = "=2024.10.24" +Preferences = "1" Printf = "1" Random = "1" Reexport = "1" diff --git a/lib/cl/CL.jl b/lib/cl/CL.jl index 67e3fc49..9306c87a 100644 --- a/lib/cl/CL.jl +++ b/lib/cl/CL.jl @@ -1,6 +1,8 @@ module cl +import ..OpenCL using Printf +using Preferences include("pointer.jl") include("api.jl") diff --git a/lib/cl/state.jl b/lib/cl/state.jl index 3a9cfb4d..e3f136c6 100644 --- a/lib/cl/state.jl +++ b/lib/cl/state.jl @@ -173,34 +173,37 @@ function default_memory_backend(dev::Device) end bda = bda_supported(dev) - + # determine if SVM is available (if needed) svm = let caps = svm_capabilities(dev) caps.coarse_grain_buffer end - if haskey(ENV, "JULIA_OPENCL_BACKEND") && ENV["JULIA_OPENCL_BACKEND"] in ["usm", "bda", "svm"] - user_backend = ENV["JULIA_OPENCL_BACKEND"] - if user_backend == "usm" && usm - return USMBackend() - elseif user_backend == "bda" && bda - return BDABackend() - elseif user_backend == "svm" && svm - return SVMBackend() + preferred_backend = load_preference(OpenCL, "memory_backend", "auto") + if preferred_backend == "auto" + if usm + USMBackend() + else + if svm + SVMBackend() + elseif bda + BDABackend() + else + error("Device $dev does not support USM, coarse-grained SVM, or Buffer Device Address, one of which is required by OpenCL.jl") + end end - end - - if usm + elseif preferred_backend == "usm" + usm || error("Use of USM memory backend requested, which is not supported by device $dev") USMBackend() + elseif preferred_backend == "bda" + bda || error("Use of Buffer Device Address memory backend requested, which is not supported by device $dev") + BDABackend() + elseif preferred_backend == "svm" + svm || error("Use of coarse-grained SVM memory backend requested, which is not supported by device $dev") + SVMBackend() else - if svm - SVMBackend() - elseif bda - BDABackend() - else - error("Device $dev does not support USM, coarse-grained SVM, or Buffer Device Address, one of which is required by OpenCL.jl") - end + error("Unknown memory backend '$preferred_backend' requested") end end diff --git a/src/OpenCL.jl b/src/OpenCL.jl index 23cc0abb..caa63696 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -7,6 +7,7 @@ using Adapt using Reexport using GPUArrays using Random +using Preferences using Core: LLVMPtr diff --git a/src/util.jl b/src/util.jl index fec47695..014384d2 100644 --- a/src/util.jl +++ b/src/util.jl @@ -53,6 +53,19 @@ function versioninfo(io::IO=stdout) println(io) end + prefs = [ + "memory_backend" => load_preference(OpenCL, "memory_backend"), + ] + if any(x->!isnothing(x[2]), prefs) + println(io, "Preferences:") + for (key, val) in prefs + if !isnothing(val) + println(io, "- $key: $val") + end + end + println(io) + end + println(io, "Available platforms: ", length(cl.platforms())) for platform in cl.platforms() println(io, " - $(platform.name)") From 26fdaf725d7392f7ff61ee1e0fd8e8bfade49f48 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 10:50:10 +0200 Subject: [PATCH 13/26] Improve reporting. --- lib/cl/state.jl | 73 +++++++++++++++++++++++++++---------------------- src/util.jl | 16 ++++++----- 2 files changed, 50 insertions(+), 39 deletions(-) diff --git a/lib/cl/state.jl b/lib/cl/state.jl index e3f136c6..56c2249c 100644 --- a/lib/cl/state.jl +++ b/lib/cl/state.jl @@ -163,53 +163,62 @@ struct SVMBackend <: AbstractMemoryBackend end struct USMBackend <: AbstractMemoryBackend end struct BDABackend <: AbstractMemoryBackend end -function default_memory_backend(dev::Device) - # determine if USM is supported - usm = if usm_supported(dev) - caps = usm_capabilities(dev) - caps.host.access && caps.device.access - else - false +function supported_memory_backends(dev::Device) + backends = AbstractMemoryBackend[] + + # unified shared memory is the first choice, as it gives us separate host and device + # memory spaces that can be directly referenced by raw pointers. + if usm_supported(dev) + usm_caps = usm_capabilities(dev) + if usm_caps.host.access && usm_caps.device.access + push!(backends, USMBackend()) + end end - bda = bda_supported(dev) + # plain old device buffers are second choice, but require an extension to support being + # referenced by raw pointers. + if bda_supported(dev) + push!(backends, BDABackend()) + end - # determine if SVM is available (if needed) - svm = let - caps = svm_capabilities(dev) - caps.coarse_grain_buffer + # shared virtual memory is last, because it comes at a performance cost. + svm_caps = svm_capabilities(dev) + if svm_caps.coarse_grain_buffer + push!(backends, SVMBackend()) end + return backends +end + +function default_memory_backend(dev::Device) + supported_backends = supported_memory_backends(dev) + isempty(supported_backends) && return nothing + preferred_backend = load_preference(OpenCL, "memory_backend", "auto") if preferred_backend == "auto" - if usm + first(supported_backends) + else + backend = if preferred_backend == "usm" USMBackend() + elseif preferred_backend == "bda" + BDABackend() + elseif preferred_backend == "svm" + SVMBackend() else - if svm - SVMBackend() - elseif bda - BDABackend() - else - error("Device $dev does not support USM, coarse-grained SVM, or Buffer Device Address, one of which is required by OpenCL.jl") - end + error("Unknown memory backend '$preferred_backend' requested") end - elseif preferred_backend == "usm" - usm || error("Use of USM memory backend requested, which is not supported by device $dev") - USMBackend() - elseif preferred_backend == "bda" - bda || error("Use of Buffer Device Address memory backend requested, which is not supported by device $dev") - BDABackend() - elseif preferred_backend == "svm" - svm || error("Use of coarse-grained SVM memory backend requested, which is not supported by device $dev") - SVMBackend() - else - error("Unknown memory backend '$preferred_backend' requested") + in(backend, supported_backends) || return nothing + backend end end function memory_backend() return get!(task_local_storage(), :CLMemoryBackend) do - default_memory_backend(device()) + backend = default_memory_backend(device()) + if backend === nothing + error("Device $(device()) does not support any of the available memory backends") + end + backend end end diff --git a/src/util.jl b/src/util.jl index 014384d2..08f9eb5f 100644 --- a/src/util.jl +++ b/src/util.jl @@ -81,13 +81,15 @@ function versioninfo(io::IO=stdout) # show a list of tags tags = [] ## memory back-end - backend = cl.default_memory_backend(device) - if backend == cl.SVMBackend() - push!(tags, "svm") - elseif backend == cl.BDABackend() - push!(tags, "bda") - elseif backend == cl.USMBackend() - push!(tags, "usm") + for backend in cl.supported_memory_backends(device) + suffix = backend == cl.default_memory_backend(device) ? "*" : "" + if backend isa cl.SVMBackend + push!(tags, "svm"*suffix) + elseif backend isa cl.BDABackend + push!(tags, "bda"*suffix) + elseif backend isa cl.USMBackend + push!(tags, "usm"*suffix) + end end ## relevant extensions if in("cl_khr_fp16", device.extensions) From 367d027dafd7bc973520443168131a9c724fef52 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 10:52:09 +0200 Subject: [PATCH 14/26] Rename CI job. --- .github/workflows/Test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index ec454e4c..d11996c3 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -13,7 +13,7 @@ concurrency: jobs: test: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - PoCL ${{ matrix.pocl }} + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ matrix.memory_backend }} - PoCL ${{ matrix.pocl }} runs-on: ${{ matrix.os }} timeout-minutes: 180 permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created From d4949a696fc44cad51debe436e7e91c339542aad Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 12:56:59 +0200 Subject: [PATCH 15/26] POC: Unify most of BDA with existing Buffer abstraction. --- lib/cl/CL.jl | 2 +- lib/cl/buffer.jl | 29 ++++++++----- lib/cl/memory/bda.jl | 99 +++++++++----------------------------------- lib/cl/memory/svm.jl | 26 +++++------- lib/cl/memory/usm.jl | 60 +++++++++++++-------------- src/array.jl | 11 ++--- src/memory.jl | 3 +- 7 files changed, 86 insertions(+), 144 deletions(-) diff --git a/lib/cl/CL.jl b/lib/cl/CL.jl index 9306c87a..1de3033d 100644 --- a/lib/cl/CL.jl +++ b/lib/cl/CL.jl @@ -20,8 +20,8 @@ include("device.jl") include("context.jl") include("cmdqueue.jl") include("event.jl") -include("memory/memory.jl") include("buffer.jl") +include("memory/memory.jl") include("program.jl") include("kernel.jl") diff --git a/lib/cl/buffer.jl b/lib/cl/buffer.jl index 0d4774f8..02ff7ced 100644 --- a/lib/cl/buffer.jl +++ b/lib/cl/buffer.jl @@ -95,7 +95,7 @@ Base.sizeof(b::Buffer{T}) where {T} = b.len * sizeof(T) # for internal use function Buffer{T}(len::Int, flags::Integer, hostbuf=nothing; - device=:rw, host=:rw) where {T} + device=:rw, host=:rw, device_private_address=false) where {T} sz = len * sizeof(T) if device == :rw @@ -121,7 +121,16 @@ function Buffer{T}(len::Int, flags::Integer, hostbuf=nothing; end err_code = Ref{Cint}() - mem_id = clCreateBuffer(context(), flags, sz, something(hostbuf, C_NULL), err_code) + properties = cl_mem_properties[] + if device_private_address + append!(properties, [CL_MEM_DEVICE_PRIVATE_ADDRESS_EXT, CL_TRUE]) + end + mem_id = if isempty(properties) + clCreateBuffer(context(), flags, sz, something(hostbuf, C_NULL), err_code) + else + push!(properties, 0) + clCreateBufferWithProperties(context(), properties, flags, sz, something(hostbuf, C_NULL), err_code) + end if err_code[] != CL_SUCCESS throw(CLError(err_code[])) end @@ -144,7 +153,7 @@ end ## memory operations # reading from buffer to host array, return an event -function enqueue_read(dst::Ptr, src::Union{Buffer, cl_mem}, src_off::Int, nbytes::Int; +function enqueue_read(dst::Ptr, src::Buffer, src_off::Int, nbytes::Int; blocking::Bool=false, wait_for::Vector{Event}=Event[]) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] @@ -155,11 +164,11 @@ function enqueue_read(dst::Ptr, src::Union{Buffer, cl_mem}, src_off::Int, nbytes @return_nanny_event(ret_evt[], dst) end end -enqueue_read(dst::Ptr, src::Union{Buffer, cl_mem}, nbytes; kwargs...) = +enqueue_read(dst::Ptr, src::Buffer, nbytes; kwargs...) = enqueue_read(dst, src, 0, nbytes; kwargs...) # writing from host array to buffer, return an event -function enqueue_write(dst::Union{Buffer, cl_mem}, dst_off::Int, src::Ptr, nbytes::Int; +function enqueue_write(dst::Buffer, dst_off::Int, src::Ptr, nbytes::Int; blocking::Bool=false, wait_for::Vector{Event}=Event[]) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] @@ -170,11 +179,11 @@ function enqueue_write(dst::Union{Buffer, cl_mem}, dst_off::Int, src::Ptr, nbyte @return_nanny_event(ret_evt[], dst) end end -enqueue_write(dst::Union{Buffer, cl_mem}, src::Ptr, nbytes; kwargs...) = +enqueue_write(dst::Buffer, src::Ptr, nbytes; kwargs...) = enqueue_write(dst, 0, src, nbytes; kwargs...) # copying between two buffers, return an event -function enqueue_copy(dst::Union{Buffer, cl_mem}, dst_off::Int, src::Union{Buffer, cl_mem}, src_off::Int, +function enqueue_copy(dst::Buffer, dst_off::Int, src::Buffer, src_off::Int, nbytes::Int; blocking::Bool=false, wait_for::Vector{Event}=Event[]) n_evts = length(wait_for) @@ -186,7 +195,7 @@ function enqueue_copy(dst::Union{Buffer, cl_mem}, dst_off::Int, src::Union{Buffe @return_event ret_evt[] end end -enqueue_copy(dst::Union{Buffer, cl_mem}, src::Union{Buffer, cl_mem}, N; kwargs...) = +enqueue_copy(dst::Buffer, src::Buffer, N; kwargs...) = enqueue_copy(dst, 0, src, 0, N; kwargs...) # map a buffer into the host address space, returning a pointer and an event @@ -231,7 +240,7 @@ function enqueue_unmap(b::Buffer, ptr::Ptr; wait_for::Vector{Event}=Event[]) end # fill a buffer with a pattern, returning an event -function enqueue_fill(b::Union{Buffer, cl_mem}, offset::Integer, pattern::T, N::Integer; +function enqueue_fill(b::Buffer, offset::Integer, pattern::T, N::Integer; wait_for::Vector{Event}=Event[]) where {T} nbytes = N * sizeof(T) nbytes_pattern = sizeof(T) @@ -246,4 +255,4 @@ function enqueue_fill(b::Union{Buffer, cl_mem}, offset::Integer, pattern::T, N:: @return_event ret_evt[] end end -enqueue_fill(b::Union{Buffer, cl_mem}, pattern, N::Integer) = enqueue_fill(b, 0, pattern, N) +enqueue_fill(b::Buffer, pattern, N::Integer) = enqueue_fill(b, 0, pattern, N) diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl index 6cbd0bbc..19f86243 100644 --- a/lib/cl/memory/bda.jl +++ b/lib/cl/memory/bda.jl @@ -1,102 +1,43 @@ struct BufferDeviceMemory <: AbstractMemory - id::cl_mem + buf::Union{Buffer{UInt8}, Nothing} ptr::CLPtr{Cvoid} bytesize::Int context::Context end -BufferDeviceMemory() = BufferDeviceMemory(C_NULL, CL_NULL, 0, context()) +BufferDeviceMemory() = BufferDeviceMemory(nothing, CL_NULL, 0, context()) function bda_alloc(bytesize::Integer; - alignment::Integer = 0, device_access::Symbol = :rw, host_access::Symbol = :rw + alignment::Integer = 0, device::Symbol = :rw, host::Symbol = :rw ) - flags = if device_access == :rw - CL_MEM_READ_WRITE - elseif device_access == :r - CL_MEM_READ_ONLY - elseif device_access == :w - CL_MEM_WRITE_ONLY - else - throw(ArgumentError("Invalid access type")) - end + # TODO: use alignment + buf = Buffer{UInt8}(bytesize; device, host, device_private_address=true) - if host_access == :rw - # nothing to do - elseif host_access == :r - flags |= CL_MEM_HOST_READ_ONLY - elseif host_access == :w - flags |= CL_MEM_HOST_WRITE_ONLY - elseif host_access == :none - flags |= CL_MEM_HOST_NO_ACCESS - else - throw(ArgumentError("Host access flag must be one of :rw, :r, or :w")) - end - - - err_code = Ref{Cint}() - properties = cl_mem_properties[CL_MEM_DEVICE_PRIVATE_ADDRESS_EXT, CL_TRUE, 0] - mem_id = clCreateBufferWithProperties(context(), properties, flags, bytesize, C_NULL, err_code) addr = Ref{cl_mem_device_address_ext}() - clGetMemObjectInfo(mem_id, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), addr, C_NULL) + clGetMemObjectInfo(buf, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), addr, C_NULL) ptr = CLPtr{Cvoid}(addr[]) @assert ptr != C_NULL - if err_code[] != CL_SUCCESS - throw(CLError(err_code[])) - end - return BufferDeviceMemory(mem_id, ptr, bytesize, context()) + return BufferDeviceMemory(buf, ptr, bytesize, context()) end -function bda_free(buf::BufferDeviceMemory) - if sizeof(buf) != 0 - clReleaseMemObject(buf.id) - end +function bda_free(mem::BufferDeviceMemory) + # XXX: Buffer is separately GCd + #clReleaseMemObject(mem.buf) return end -Base.pointer(buf::BufferDeviceMemory) = buf.ptr -Base.sizeof(buf::BufferDeviceMemory) = buf.bytesize -context(buf::BufferDeviceMemory) = buf.context - -Base.show(io::IO, buf::BufferDeviceMemory) = - @printf(io, "BufferDeviceMemory(%s at %p)", Base.format_bytes(sizeof(buf)), Int(pointer(buf))) +Base.pointer(mem::BufferDeviceMemory) = mem.ptr +Base.sizeof(mem::BufferDeviceMemory) = mem.bytesize +context(mem::BufferDeviceMemory) = mem.context -Base.convert(::Type{Ptr{T}}, buf::BufferDeviceMemory) where {T} = - convert(Ptr{T}, pointer(buf)) +Base.show(io::IO, mem::BufferDeviceMemory) = + @printf(io, "BufferDeviceMemory(%s at %p)", Base.format_bytes(sizeof(mem)), Int(pointer(mem))) -Base.convert(::Type{CLPtr{T}}, buf::BufferDeviceMemory) where {T} = - reinterpret(CLPtr{T}, pointer(buf)) +Base.convert(::Type{Ptr{T}}, mem::BufferDeviceMemory) where {T} = + convert(Ptr{T}, pointer(mem)) -Base.convert(::Type{cl_mem}, buf::BufferDeviceMemory) = buf.id +Base.convert(::Type{CLPtr{T}}, mem::BufferDeviceMemory) where {T} = + reinterpret(CLPtr{T}, pointer(mem)) -function enqueue_bda_copy(dst, dst_off, src, src_off, nbytes; kwargs...) - if nbytes == 0 || dst == C_NULL || src == C_NULL - return nothing - else - enqueue_copy(dst, dst_off, src, src_off, nbytes; kwargs...) - end -end - -function enqueue_bda_read(dst, src, src_off, nbytes; kwargs...) - if nbytes == 0 || src == C_NULL - return nothing - else - enqueue_read(dst, src, src_off, nbytes; kwargs...) - end -end - -function enqueue_bda_write(dst, dst_off, src, nbytes; kwargs...) - if nbytes == 0 || dst == C_NULL - return nothing - else - enqueue_write(dst, dst_off, src, nbytes; kwargs...) - end -end - -function enqueue_bda_fill(b, offset, pattern, N; kwargs...) - if b == C_NULL - return nothing - else - enqueue_fill(b, offset, pattern, N; kwargs...) - end -end +Base.convert(::Type{Buffer{UInt8}}, mem::BufferDeviceMemory) = mem.buf diff --git a/lib/cl/memory/svm.jl b/lib/cl/memory/svm.jl index a00ba83a..485d6d68 100644 --- a/lib/cl/memory/svm.jl +++ b/lib/cl/memory/svm.jl @@ -35,25 +35,20 @@ function svm_alloc(bytesize::Integer; return SharedVirtualMemory(ptr, bytesize, context()) end -function svm_free(buf::SharedVirtualMemory) - if sizeof(buf) != 0 - clSVMFree(context(buf), buf) - end - return -end +svm_free(mem::SharedVirtualMemory) = clSVMFree(context(mem), mem) -Base.pointer(buf::SharedVirtualMemory) = buf.ptr -Base.sizeof(buf::SharedVirtualMemory) = buf.bytesize -context(buf::SharedVirtualMemory) = buf.context +Base.pointer(mem::SharedVirtualMemory) = mem.ptr +Base.sizeof(mem::SharedVirtualMemory) = mem.bytesize +context(mem::SharedVirtualMemory) = mem.context -Base.show(io::IO, buf::SharedVirtualMemory) = - @printf(io, "SharedVirtualMemory(%s at %p)", Base.format_bytes(sizeof(buf)), Int(pointer(buf))) +Base.show(io::IO, mem::SharedVirtualMemory) = + @printf(io, "SharedVirtualMemory(%s at %p)", Base.format_bytes(sizeof(mem)), Int(pointer(mem))) -Base.convert(::Type{Ptr{T}}, buf::SharedVirtualMemory) where {T} = - convert(Ptr{T}, pointer(buf)) +Base.convert(::Type{Ptr{T}}, mem::SharedVirtualMemory) where {T} = + convert(Ptr{T}, pointer(mem)) -Base.convert(::Type{CLPtr{T}}, buf::SharedVirtualMemory) where {T} = - reinterpret(CLPtr{T}, pointer(buf)) +Base.convert(::Type{CLPtr{T}}, mem::SharedVirtualMemory) where {T} = + reinterpret(CLPtr{T}, pointer(mem)) ## memory operations @@ -117,7 +112,6 @@ end function enqueue_svm_fill(ptr::Union{Ptr, CLPtr}, pattern::T, N::Integer; wait_for::Vector{Event}=Event[]) where {T} nbytes = N * sizeof(T) - nbytes == 0 && return pattern_size = sizeof(T) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] diff --git a/lib/cl/memory/usm.jl b/lib/cl/memory/usm.jl index 720efa45..002da65c 100644 --- a/lib/cl/memory/usm.jl +++ b/lib/cl/memory/usm.jl @@ -1,14 +1,11 @@ abstract type UnifiedMemory <: AbstractMemory end -function usm_free(buf::UnifiedMemory; blocking::Bool = false) - if sizeof(buf) != 0 - if blocking - clMemBlockingFreeINTEL(context(buf), buf) - else - clMemFreeINTEL(context(buf), buf) - end +function usm_free(mem::UnifiedMemory; blocking::Bool = false) + if blocking + clMemBlockingFreeINTEL(context(mem), mem) + else + clMemFreeINTEL(context(mem), mem) end - return end @@ -47,15 +44,15 @@ function device_alloc(bytesize::Integer; return UnifiedDeviceMemory(ptr, bytesize, context()) end -Base.pointer(buf::UnifiedDeviceMemory) = buf.ptr -Base.sizeof(buf::UnifiedDeviceMemory) = buf.bytesize -context(buf::UnifiedDeviceMemory) = buf.context +Base.pointer(mem::UnifiedDeviceMemory) = mem.ptr +Base.sizeof(mem::UnifiedDeviceMemory) = mem.bytesize +context(mem::UnifiedDeviceMemory) = mem.context -Base.show(io::IO, buf::UnifiedDeviceMemory) = - @printf(io, "UnifiedDeviceMemory(%s at %p)", Base.format_bytes(sizeof(buf)), pointer(buf)) +Base.show(io::IO, mem::UnifiedDeviceMemory) = + @printf(io, "UnifiedDeviceMemory(%s at %p)", Base.format_bytes(sizeof(mem)), pointer(mem)) -Base.convert(::Type{CLPtr{T}}, buf::UnifiedDeviceMemory) where {T} = - convert(CLPtr{T}, pointer(buf)) +Base.convert(::Type{CLPtr{T}}, mem::UnifiedDeviceMemory) where {T} = + convert(CLPtr{T}, pointer(mem)) ## host buffer @@ -93,15 +90,15 @@ function host_alloc(bytesize::Integer; return UnifiedHostMemory(ptr, bytesize, context()) end -Base.pointer(buf::UnifiedHostMemory) = buf.ptr -Base.sizeof(buf::UnifiedHostMemory) = buf.bytesize -context(buf::UnifiedHostMemory) = buf.context +Base.pointer(mem::UnifiedHostMemory) = mem.ptr +Base.sizeof(mem::UnifiedHostMemory) = mem.bytesize +context(mem::UnifiedHostMemory) = mem.context -Base.show(io::IO, buf::UnifiedHostMemory) = - @printf(io, "UnifiedHostMemory(%s at %p)", Base.format_bytes(sizeof(buf)), Int(pointer(buf))) +Base.show(io::IO, mem::UnifiedHostMemory) = + @printf(io, "UnifiedHostMemory(%s at %p)", Base.format_bytes(sizeof(mem)), Int(pointer(mem))) -Base.convert(::Type{Ptr{T}}, buf::UnifiedHostMemory) where {T} = - convert(Ptr{T}, pointer(buf)) +Base.convert(::Type{Ptr{T}}, mem::UnifiedHostMemory) where {T} = + convert(Ptr{T}, pointer(mem)) ## shared buffer @@ -147,18 +144,18 @@ function shared_alloc(bytesize::Integer; return UnifiedSharedMemory(ptr, bytesize, context()) end -Base.pointer(buf::UnifiedSharedMemory) = buf.ptr -Base.sizeof(buf::UnifiedSharedMemory) = buf.bytesize -context(buf::UnifiedSharedMemory) = buf.context +Base.pointer(mem::UnifiedSharedMemory) = mem.ptr +Base.sizeof(mem::UnifiedSharedMemory) = mem.bytesize +context(mem::UnifiedSharedMemory) = mem.context -Base.show(io::IO, buf::UnifiedSharedMemory) = - @printf(io, "UnifiedSharedMemory(%s at %p)", Base.format_bytes(sizeof(buf)), Int(pointer(buf))) +Base.show(io::IO, mem::UnifiedSharedMemory) = + @printf(io, "UnifiedSharedMemory(%s at %p)", Base.format_bytes(sizeof(mem)), Int(pointer(mem))) -Base.convert(::Type{Ptr{T}}, buf::UnifiedSharedMemory) where {T} = - convert(Ptr{T}, reinterpret(Ptr{Cvoid}, pointer(buf))) +Base.convert(::Type{Ptr{T}}, mem::UnifiedSharedMemory) where {T} = + convert(Ptr{T}, reinterpret(Ptr{Cvoid}, pointer(mem))) -Base.convert(::Type{CLPtr{T}}, buf::UnifiedSharedMemory) where {T} = - convert(CLPtr{T}, pointer(buf)) +Base.convert(::Type{CLPtr{T}}, mem::UnifiedSharedMemory) where {T} = + convert(CLPtr{T}, pointer(mem)) ## memory operations @@ -181,7 +178,6 @@ end function enqueue_usm_fill(ptr::Union{Ptr, CLPtr}, pattern::T, N::Integer; wait_for::Vector{Event}=Event[]) where {T} nbytes = N * sizeof(T) - nbytes == 0 && return pattern_size = sizeof(T) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] diff --git a/src/array.jl b/src/array.jl index f7390b6d..3d3553cd 100644 --- a/src/array.jl +++ b/src/array.jl @@ -384,11 +384,11 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr cl.enqueue_svm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) elseif memtype(device_array) == cl.BufferDeviceMemory if src isa CLArray && dst isa CLArray - cl.enqueue_bda_copy(convert(cl.cl_mem, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), convert(cl.cl_mem, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) + cl.enqueue_copy(convert(cl.Buffer{UInt8}, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), convert(cl.Buffer{UInt8}, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) elseif dst isa CLArray - cl.enqueue_bda_write(convert(cl.cl_mem, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) + cl.enqueue_write(convert(cl.Buffer{UInt8}, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) elseif src isa CLArray - cl.enqueue_bda_read(pointer(dst, dst_off), convert(cl.cl_mem, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) + cl.enqueue_read(pointer(dst, dst_off), convert(cl.Buffer{UInt8}, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) end else cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) @@ -428,12 +428,13 @@ fill(v, dims...) = fill!(CLArray{typeof(v)}(undef, dims...), v) fill(v, dims::Dims) = fill!(CLArray{typeof(v)}(undef, dims...), v) function Base.fill!(A::DenseCLArray{T}, val) where {T} + isempty(A) && return A cl.context!(context(A)) do GC.@preserve A begin if memtype(A) == cl.SharedVirtualMemory cl.enqueue_svm_fill(pointer(A), convert(T, val), length(A)) elseif memtype(A) == cl.BufferDeviceMemory - cl.enqueue_bda_fill(convert(cl.cl_mem, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A)) + cl.enqueue_fill(convert(cl.Buffer{UInt8}, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A)) else cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A)) end @@ -514,7 +515,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} if memtype(a) == cl.SharedVirtualMemory cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) elseif memtype(a) == cl.BufferDeviceMemory - cl.enqueue_bda_copy(convert(cl.cl_mem, mem), 0, convert(cl.cl_mem, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false) + cl.enqueue_copy(convert(cl.Buffer{UInt8}, mem), 0, convert(cl.Buffer{UInt8}, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false) else cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) end diff --git a/src/memory.jl b/src/memory.jl index 8fc20873..4fc842a9 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -38,7 +38,7 @@ function maybe_synchronize(managed::Managed) return nothing end -function Base.convert(t::Union{Type{CLPtr{T}}, Type{cl.cl_mem}}, managed::Managed{M}) where {T, M} +function Base.convert(t::Union{Type{CLPtr{T}}, Type{cl.Buffer{T}}}, managed::Managed{M}) where {T, M} # let null pointers pass through as-is ptr = convert(t, managed.mem) if ptr == cl.CL_NULL @@ -147,6 +147,7 @@ function alloc(::Type{cl.BufferDeviceMemory}, bytes::Int; alignment::Int = 0) end function free(managed::Managed{<:cl.AbstractMemory}) + sizeof(managed) == 0 && return mem = managed.mem cl.context!(cl.context(mem)) do # "`clSVMFree` does not wait for previously enqueued commands that may be using From eb754c393f224d2928ec2b33b4d9ba7055917145 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 13:01:51 +0200 Subject: [PATCH 16/26] Reorder branches. --- src/array.jl | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/array.jl b/src/array.jl index 3d3553cd..b6b37775 100644 --- a/src/array.jl +++ b/src/array.jl @@ -382,16 +382,25 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr cl.context!(context(device_array)) do if memtype(device_array) == cl.SharedVirtualMemory cl.enqueue_svm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) - elseif memtype(device_array) == cl.BufferDeviceMemory + elseif memtype(device_array) <: cl.UnifiedMemory + cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) + else if src isa CLArray && dst isa CLArray - cl.enqueue_copy(convert(cl.Buffer{UInt8}, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), convert(cl.Buffer{UInt8}, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) + cl.enqueue_copy(convert(cl.Buffer{UInt8}, dst.data[]), + (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), + convert(cl.Buffer{UInt8}, src.data[]), + (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), + nbytes; blocking) elseif dst isa CLArray - cl.enqueue_write(convert(cl.Buffer{UInt8}, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) + cl.enqueue_write(convert(cl.Buffer{UInt8}, dst.data[]), + (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), + pointer(src, src_off), nbytes; blocking) elseif src isa CLArray - cl.enqueue_read(pointer(dst, dst_off), convert(cl.Buffer{UInt8}, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) + cl.enqueue_read(pointer(dst, dst_off), + convert(cl.Buffer{UInt8}, src.data[]), + (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), + nbytes; blocking) end - else - cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) end end end @@ -433,10 +442,10 @@ function Base.fill!(A::DenseCLArray{T}, val) where {T} GC.@preserve A begin if memtype(A) == cl.SharedVirtualMemory cl.enqueue_svm_fill(pointer(A), convert(T, val), length(A)) - elseif memtype(A) == cl.BufferDeviceMemory - cl.enqueue_fill(convert(cl.Buffer{UInt8}, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A)) - else + elseif memtype(A) <: cl.UnifiedMemory cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A)) + else + cl.enqueue_fill(convert(cl.Buffer{UInt8}, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A)) end end end @@ -514,10 +523,10 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} GC.@preserve a begin if memtype(a) == cl.SharedVirtualMemory cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) - elseif memtype(a) == cl.BufferDeviceMemory - cl.enqueue_copy(convert(cl.Buffer{UInt8}, mem), 0, convert(cl.Buffer{UInt8}, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false) - else + elseif memtype(a) <: cl.UnifiedMemory cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) + else + cl.enqueue_copy(convert(cl.Buffer{UInt8}, mem), 0, convert(cl.Buffer{UInt8}, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false) end end end From 56551dc8d80ad37147a86fd1c85cdf388ee52b55 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 13:56:38 +0200 Subject: [PATCH 17/26] Simplify Buffer. --- lib/cl/buffer.jl | 52 +++++++++++++++++--------------------------- lib/cl/memory/bda.jl | 6 ++--- src/array.jl | 12 +++++----- src/memory.jl | 9 ++++---- test/buffer.jl | 11 ++++------ 5 files changed, 38 insertions(+), 52 deletions(-) diff --git a/lib/cl/buffer.jl b/lib/cl/buffer.jl index 02ff7ced..4a818408 100644 --- a/lib/cl/buffer.jl +++ b/lib/cl/buffer.jl @@ -73,31 +73,19 @@ end # OpenCL.Buffer -mutable struct Buffer{T} <: AbstractMemoryObject - const id::cl_mem - const len::Int - - function Buffer{T}(mem_id::cl_mem, len::Integer; retain::Bool=false) where {T} - buff = new{T}(mem_id, len) - retain && clRetainMemObject(buff) - finalizer(clReleaseMemObject, buff) - return buff - end +struct Buffer <: AbstractMemoryObject + id::cl_mem + bytesize::Int end -Base.ndims(b::Buffer) = 1 -Base.eltype(b::Buffer{T}) where {T} = T -Base.length(b::Buffer{T}) where {T} = b.len -Base.sizeof(b::Buffer{T}) where {T} = b.len * sizeof(T) +Base.sizeof(buf::Buffer) = buf.bytesize ## constructors # for internal use -function Buffer{T}(len::Int, flags::Integer, hostbuf=nothing; - device=:rw, host=:rw, device_private_address=false) where {T} - sz = len * sizeof(T) - +function Buffer(sz::Int, flags::Integer, hostbuf=nothing; + device=:rw, host=:rw, device_private_address=false) if device == :rw flags |= CL_MEM_READ_WRITE elseif device == :r @@ -134,19 +122,19 @@ function Buffer{T}(len::Int, flags::Integer, hostbuf=nothing; if err_code[] != CL_SUCCESS throw(CLError(err_code[])) end - return Buffer{T}(mem_id, len) + return Buffer(mem_id, sz) end # allocated buffer -function Buffer{T}(len::Integer; host_accessible=false, kwargs...) where {T} +function Buffer(sz::Integer; host_accessible=false, kwargs...) flags = host_accessible ? CL_MEM_ALLOC_HOST_PTR : 0 - Buffer{T}(len, flags, nothing; kwargs...) + Buffer(sz, flags, nothing; kwargs...) end # from host memory -function Buffer(hostbuf::Array{T}; copy::Bool=true, kwargs...) where {T} +function Buffer(hostbuf::Array; copy::Bool=true, kwargs...) flags = copy ? CL_MEM_COPY_HOST_PTR : CL_MEM_USE_HOST_PTR - Buffer{T}(length(hostbuf), flags, hostbuf; kwargs...) + Buffer(sizeof(hostbuf), flags, hostbuf; kwargs...) end @@ -199,7 +187,7 @@ enqueue_copy(dst::Buffer, src::Buffer, N; kwargs...) = enqueue_copy(dst, 0, src, 0, N; kwargs...) # map a buffer into the host address space, returning a pointer and an event -function enqueue_map(b::Buffer, offset::Integer, nbytes::Int, flags=:rw; +function enqueue_map(buf::Buffer, offset::Integer, nbytes::Int, flags=:rw; blocking::Bool=false, wait_for::Vector{Event}=Event[]) flags = if flags == :rw CL_MAP_READ | CL_MAP_WRITE @@ -216,7 +204,7 @@ function enqueue_map(b::Buffer, offset::Integer, nbytes::Int, flags=:rw; evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin status = Ref{Cint}() - ptr = clEnqueueMapBuffer(queue(), b, blocking, flags, offset, nbytes, + ptr = clEnqueueMapBuffer(queue(), buf, blocking, flags, offset, nbytes, n_evts, evt_ids, ret_evt, status) if status[] != CL_SUCCESS throw(CLError(status[])) @@ -225,22 +213,22 @@ function enqueue_map(b::Buffer, offset::Integer, nbytes::Int, flags=:rw; return ptr, Event(ret_evt[]) end end -enqueue_map(b::Buffer, nbytes::Int, flags=:rw; kwargs...) = - enqueue_map(b, 0, nbytes, flags; kwargs...) +enqueue_map(buf::Buffer, nbytes::Int, flags=:rw; kwargs...) = + enqueue_map(buf, 0, nbytes, flags; kwargs...) # unmap a buffer, return an event -function enqueue_unmap(b::Buffer, ptr::Ptr; wait_for::Vector{Event}=Event[]) +function enqueue_unmap(buf::Buffer, ptr::Ptr; wait_for::Vector{Event}=Event[]) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin ret_evt = Ref{cl_event}() - clEnqueueUnmapMemObject(queue(), b, ptr, n_evts, evt_ids, ret_evt) + clEnqueueUnmapMemObject(queue(), buf, ptr, n_evts, evt_ids, ret_evt) return Event(ret_evt[]) end end # fill a buffer with a pattern, returning an event -function enqueue_fill(b::Buffer, offset::Integer, pattern::T, N::Integer; +function enqueue_fill(buf::Buffer, offset::Integer, pattern::T, N::Integer; wait_for::Vector{Event}=Event[]) where {T} nbytes = N * sizeof(T) nbytes_pattern = sizeof(T) @@ -249,10 +237,10 @@ function enqueue_fill(b::Buffer, offset::Integer, pattern::T, N::Integer; evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve begin ret_evt = Ref{cl_event}() - clEnqueueFillBuffer(queue(), b, [pattern], + clEnqueueFillBuffer(queue(), buf, [pattern], nbytes_pattern, offset, nbytes, n_evts, evt_ids, ret_evt) @return_event ret_evt[] end end -enqueue_fill(b::Buffer, pattern, N::Integer) = enqueue_fill(b, 0, pattern, N) +enqueue_fill(buf::Buffer, pattern, N::Integer) = enqueue_fill(buf, 0, pattern, N) diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl index 19f86243..15b3c8e9 100644 --- a/lib/cl/memory/bda.jl +++ b/lib/cl/memory/bda.jl @@ -1,5 +1,5 @@ struct BufferDeviceMemory <: AbstractMemory - buf::Union{Buffer{UInt8}, Nothing} + buf::Union{Buffer, Nothing} ptr::CLPtr{Cvoid} bytesize::Int context::Context @@ -12,7 +12,7 @@ function bda_alloc(bytesize::Integer; ) # TODO: use alignment - buf = Buffer{UInt8}(bytesize; device, host, device_private_address=true) + buf = Buffer(bytesize; device, host, device_private_address=true) addr = Ref{cl_mem_device_address_ext}() clGetMemObjectInfo(buf, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), addr, C_NULL) @@ -40,4 +40,4 @@ Base.convert(::Type{Ptr{T}}, mem::BufferDeviceMemory) where {T} = Base.convert(::Type{CLPtr{T}}, mem::BufferDeviceMemory) where {T} = reinterpret(CLPtr{T}, pointer(mem)) -Base.convert(::Type{Buffer{UInt8}}, mem::BufferDeviceMemory) = mem.buf +Base.convert(::Type{Buffer}, mem::BufferDeviceMemory) = mem.buf diff --git a/src/array.jl b/src/array.jl index b6b37775..91761300 100644 --- a/src/array.jl +++ b/src/array.jl @@ -386,18 +386,18 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) else if src isa CLArray && dst isa CLArray - cl.enqueue_copy(convert(cl.Buffer{UInt8}, dst.data[]), + cl.enqueue_copy(convert(cl.Buffer, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), - convert(cl.Buffer{UInt8}, src.data[]), + convert(cl.Buffer, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) elseif dst isa CLArray - cl.enqueue_write(convert(cl.Buffer{UInt8}, dst.data[]), + cl.enqueue_write(convert(cl.Buffer, dst.data[]), (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) elseif src isa CLArray cl.enqueue_read(pointer(dst, dst_off), - convert(cl.Buffer{UInt8}, src.data[]), + convert(cl.Buffer, src.data[]), (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), nbytes; blocking) end @@ -445,7 +445,7 @@ function Base.fill!(A::DenseCLArray{T}, val) where {T} elseif memtype(A) <: cl.UnifiedMemory cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A)) else - cl.enqueue_fill(convert(cl.Buffer{UInt8}, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A)) + cl.enqueue_fill(convert(cl.Buffer, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A)) end end end @@ -526,7 +526,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} elseif memtype(a) <: cl.UnifiedMemory cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) else - cl.enqueue_copy(convert(cl.Buffer{UInt8}, mem), 0, convert(cl.Buffer{UInt8}, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false) + cl.enqueue_copy(convert(cl.Buffer, mem), 0, convert(cl.Buffer, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false) end end end diff --git a/src/memory.jl b/src/memory.jl index 4fc842a9..665270ed 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -38,9 +38,10 @@ function maybe_synchronize(managed::Managed) return nothing end -function Base.convert(t::Union{Type{CLPtr{T}}, Type{cl.Buffer{T}}}, managed::Managed{M}) where {T, M} +function Base.convert(typ::Union{Type{<:CLPtr}, Type{cl.Buffer}}, managed::Managed) # let null pointers pass through as-is - ptr = convert(t, managed.mem) + # XXX: does not work for buffers + ptr = convert(typ, managed.mem) if ptr == cl.CL_NULL return ptr end @@ -55,9 +56,9 @@ function Base.convert(t::Union{Type{CLPtr{T}}, Type{cl.Buffer{T}}}, managed::Man return ptr end -function Base.convert(::Type{Ptr{T}}, managed::Managed{M}) where {T, M} +function Base.convert(typ::Type{<:Ptr}, managed::Managed{M}) where {M} # let null pointers pass through as-is - ptr = convert(Ptr{T}, managed.mem) + ptr = convert(typ, managed.mem) if ptr == C_NULL return ptr end diff --git a/test/buffer.jl b/test/buffer.jl index c2d0e913..34245c9c 100644 --- a/test/buffer.jl +++ b/test/buffer.jl @@ -1,14 +1,11 @@ @testset "Buffer" begin # simple buffer - let buf = cl.Buffer{Int}(1) - @test ndims(buf) == 1 - @test eltype(buf) == Int - @test length(buf) == 1 + let buf = cl.Buffer(sizeof(Int)) @test sizeof(buf) == sizeof(Int) end # memory copy - let buf = cl.Buffer{Int}(1) + let buf = cl.Buffer(sizeof(Int)) src = [42] cl.enqueue_write(buf, pointer(src), sizeof(src); blocking=true) @@ -18,7 +15,7 @@ end # host accessible, mapped - let buf = cl.Buffer{Int}(1; host_accessible=true) + let buf = cl.Buffer(sizeof(Int); host_accessible=true) src = [42] cl.enqueue_write(buf, pointer(src), sizeof(src); blocking=true) @@ -68,7 +65,7 @@ end # fill - let buf = cl.Buffer{Int}(3) + let buf = cl.Buffer(3*sizeof(Int)) cl.enqueue_fill(buf, 42, 3) arr = Vector{Int}(undef, 3) From abdc71046fd7e64090623417b3af1de1f6fc203d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 14:15:33 +0200 Subject: [PATCH 18/26] Rephrase preference. --- LocalPreferences.toml | 6 +++--- lib/cl/state.jl | 26 ++++++++++++-------------- src/util.jl | 2 +- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/LocalPreferences.toml b/LocalPreferences.toml index c7da6de0..befceb39 100644 --- a/LocalPreferences.toml +++ b/LocalPreferences.toml @@ -1,7 +1,7 @@ [OpenCL] -# which memory back-end to use for CLArray allocations. This can be: -# - "auto" (default): automatically selects the best available backend +# Which memory back-end to use for unspecified CLArray allocations. This can be: # - "usm": Unified Shared Memory (`cl_intel_unified_shared_memory`) # - "bda": Buffer Device Address (`cl_mem` + `cl_ext_buffer_device_address`) # - "svm": Shared Virtual Memory (coarse-grained) -#memory_backend="auto" +# If unspecified, the default will be used based on the platform and device capabilities. +#default_memory_backend="..." diff --git a/lib/cl/state.jl b/lib/cl/state.jl index 56c2249c..9cc57d35 100644 --- a/lib/cl/state.jl +++ b/lib/cl/state.jl @@ -194,22 +194,20 @@ function default_memory_backend(dev::Device) supported_backends = supported_memory_backends(dev) isempty(supported_backends) && return nothing - preferred_backend = load_preference(OpenCL, "memory_backend", "auto") - if preferred_backend == "auto" - first(supported_backends) + backend_str = load_preference(OpenCL, "default_memory_backend") + backend_str === nothing && return first(supported_backends) + + backend = if backend_str == "usm" + USMBackend() + elseif backend_str == "bda" + BDABackend() + elseif backend_str == "svm" + SVMBackend() else - backend = if preferred_backend == "usm" - USMBackend() - elseif preferred_backend == "bda" - BDABackend() - elseif preferred_backend == "svm" - SVMBackend() - else - error("Unknown memory backend '$preferred_backend' requested") - end - in(backend, supported_backends) || return nothing - backend + error("Unknown memory backend '$backend_str' requested") end + in(backend, supported_backends) ? backend : nothing + backend end function memory_backend() diff --git a/src/util.jl b/src/util.jl index 08f9eb5f..4d9a1640 100644 --- a/src/util.jl +++ b/src/util.jl @@ -54,7 +54,7 @@ function versioninfo(io::IO=stdout) end prefs = [ - "memory_backend" => load_preference(OpenCL, "memory_backend"), + "default_memory_backend" => load_preference(OpenCL, "default_memory_backend"), ] if any(x->!isnothing(x[2]), prefs) println(io, "Preferences:") From 255adef0f44ece945a216365e66fbd1337cf2977 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 14:25:10 +0200 Subject: [PATCH 19/26] Actually free buffers. --- lib/cl/buffer.jl | 2 ++ lib/cl/memory/bda.jl | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/cl/buffer.jl b/lib/cl/buffer.jl index 4a818408..81916efd 100644 --- a/lib/cl/buffer.jl +++ b/lib/cl/buffer.jl @@ -18,6 +18,8 @@ Base.sizeof(mem::AbstractMemoryObject) = mem.size context(mem::AbstractMemoryObject) = mem.context +release(mem::AbstractMemoryObject) = clReleaseMemObject(mem) + function Base.getproperty(mem::AbstractMemoryObject, s::Symbol) if s == :context param = Ref{cl_context}() diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl index 15b3c8e9..9ba866bc 100644 --- a/lib/cl/memory/bda.jl +++ b/lib/cl/memory/bda.jl @@ -21,11 +21,7 @@ function bda_alloc(bytesize::Integer; return BufferDeviceMemory(buf, ptr, bytesize, context()) end -function bda_free(mem::BufferDeviceMemory) - # XXX: Buffer is separately GCd - #clReleaseMemObject(mem.buf) - return -end +bda_free(mem::BufferDeviceMemory) = release(mem.buf) Base.pointer(mem::BufferDeviceMemory) = mem.ptr Base.sizeof(mem::BufferDeviceMemory) = mem.bytesize From abc5009540c7534523ec900919027fedeab5402a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 14:36:16 +0200 Subject: [PATCH 20/26] Implement pointer query as getproperty call. --- lib/cl/buffer.jl | 10 ++++++++-- lib/cl/memory/bda.jl | 8 +------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/cl/buffer.jl b/lib/cl/buffer.jl index 81916efd..22d9b326 100644 --- a/lib/cl/buffer.jl +++ b/lib/cl/buffer.jl @@ -16,8 +16,6 @@ Base.unsafe_convert(::Type{<:Ptr}, mem::AbstractMemoryObject) = mem Base.sizeof(mem::AbstractMemoryObject) = mem.size -context(mem::AbstractMemoryObject) = mem.context - release(mem::AbstractMemoryObject) = clReleaseMemObject(mem) function Base.getproperty(mem::AbstractMemoryObject, s::Symbol) @@ -65,11 +63,19 @@ function Base.getproperty(mem::AbstractMemoryObject, s::Symbol) result = Ref{Cuint}() clGetMemObjectInfo(mem, CL_MEM_MAP_COUNT, sizeof(Cuint), result, C_NULL) return Int(result[]) + elseif s == :device_address + result = Ref{cl_mem_device_address_ext}() + clGetMemObjectInfo(mem, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), result, C_NULL) + return CLPtr{Cvoid}(result[]) else return getfield(mem, s) end end +# convenience functions +context(mem::AbstractMemoryObject) = mem.context +Base.pointer(mem::AbstractMemoryObject) = mem.pointer + #TODO: enqueue_migrate_mem_objects(queue, mem_objects, flags=0, wait_for=None) #TODO: enqueue_migrate_mem_objects_ext(queue, mem_objects, flags=0, wait_for=None) diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl index 9ba866bc..8cbd539d 100644 --- a/lib/cl/memory/bda.jl +++ b/lib/cl/memory/bda.jl @@ -10,15 +10,9 @@ BufferDeviceMemory() = BufferDeviceMemory(nothing, CL_NULL, 0, context()) function bda_alloc(bytesize::Integer; alignment::Integer = 0, device::Symbol = :rw, host::Symbol = :rw ) - # TODO: use alignment buf = Buffer(bytesize; device, host, device_private_address=true) - - addr = Ref{cl_mem_device_address_ext}() - clGetMemObjectInfo(buf, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), addr, C_NULL) - ptr = CLPtr{Cvoid}(addr[]) - @assert ptr != C_NULL - return BufferDeviceMemory(buf, ptr, bytesize, context()) + return BufferDeviceMemory(buf, buf.device_address, bytesize, context()) end bda_free(mem::BufferDeviceMemory) = release(mem.buf) From 94461cfda81f70d5255f9e310b8cb474c4b5f581 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 15:35:28 +0200 Subject: [PATCH 21/26] WIP: Refactor memory hierarchy. --- LocalPreferences.toml | 2 +- lib/cl/CL.jl | 3 +- lib/cl/kernel.jl | 4 +- lib/cl/memory.jl | 99 +++++++++++++++++++++++++++++++++++ lib/cl/memory/bda.jl | 33 ------------ lib/cl/{ => memory}/buffer.jl | 98 ++++++---------------------------- lib/cl/memory/memory.jl | 22 -------- lib/cl/memory/svm.jl | 2 +- lib/cl/memory/usm.jl | 2 +- lib/cl/state.jl | 4 +- src/array.jl | 20 ++++--- src/memory.jl | 17 +++--- src/util.jl | 4 +- test/memory.jl | 4 +- 14 files changed, 149 insertions(+), 165 deletions(-) create mode 100644 lib/cl/memory.jl delete mode 100644 lib/cl/memory/bda.jl rename lib/cl/{ => memory}/buffer.jl (68%) delete mode 100644 lib/cl/memory/memory.jl diff --git a/LocalPreferences.toml b/LocalPreferences.toml index befceb39..5a4504be 100644 --- a/LocalPreferences.toml +++ b/LocalPreferences.toml @@ -1,7 +1,7 @@ [OpenCL] # Which memory back-end to use for unspecified CLArray allocations. This can be: # - "usm": Unified Shared Memory (`cl_intel_unified_shared_memory`) -# - "bda": Buffer Device Address (`cl_mem` + `cl_ext_buffer_device_address`) +# - "bda": plain buffers (`cl_mem` + `cl_ext_buffer_device_address`) # - "svm": Shared Virtual Memory (coarse-grained) # If unspecified, the default will be used based on the platform and device capabilities. #default_memory_backend="..." diff --git a/lib/cl/CL.jl b/lib/cl/CL.jl index 1de3033d..2788a72f 100644 --- a/lib/cl/CL.jl +++ b/lib/cl/CL.jl @@ -20,8 +20,7 @@ include("device.jl") include("context.jl") include("cmdqueue.jl") include("event.jl") -include("buffer.jl") -include("memory/memory.jl") +include("memory.jl") include("program.jl") include("kernel.jl") diff --git a/lib/cl/kernel.jl b/lib/cl/kernel.jl index 60f63cf9..f5615704 100644 --- a/lib/cl/kernel.jl +++ b/lib/cl/kernel.jl @@ -79,7 +79,7 @@ function set_arg!(k::Kernel, idx::Integer, arg::AbstractMemory) clSetKernelArgSVMPointer(k, idx - 1, pointer(arg)) elseif arg isa UnifiedMemory clSetKernelArgMemPointerINTEL(k, idx - 1, pointer(arg)) - elseif arg isa BufferDeviceMemory + elseif arg isa Buffer clSetKernelArgDevicePointerEXT(k, idx - 1, pointer(arg)) else error("Unknown memory type") @@ -203,7 +203,7 @@ function call( if memory isa SharedVirtualMemory push!(svm_pointers, ptr) - elseif memory isa BufferDeviceMemory + elseif memory isa Buffer push!(bda_pointers, ptr) elseif memory isa UnifiedDeviceMemory device_access = true diff --git a/lib/cl/memory.jl b/lib/cl/memory.jl new file mode 100644 index 00000000..88df52a3 --- /dev/null +++ b/lib/cl/memory.jl @@ -0,0 +1,99 @@ +# Raw memory management + +abstract type AbstractMemoryObject <: CLObject end +abstract type AbstractPointerMemory end +const AbstractMemory = Union{AbstractMemoryObject, AbstractPointerMemory} + +# this will be specialized for each memory type +Base.convert(T::Type{<:Union{Ptr, CLPtr}}, mem::AbstractMemory) = + throw(ArgumentError("Illegal conversion of a $(typeof(mem)) to a $T")) + +# ccall integration +# +# taking the pointer of a memory object means returning the underlying pointer, +# and not the pointer of the object itself. +Base.unsafe_convert(P::Type{<:Union{Ptr, CLPtr}}, mem::AbstractMemory) = convert(P, mem) + + +## opaque memory objects + +# This should be implemented by all subtypes +#type MemoryType <: AbstractMemoryObject +# id::cl_mem +# ... +#end + +Base.sizeof(mem::AbstractMemoryObject) = mem.size + +release(mem::AbstractMemoryObject) = clReleaseMemObject(mem) + +function Base.getproperty(mem::AbstractMemoryObject, s::Symbol) + if s == :type + result = Ref{cl_mem_object_type}() + clGetMemObjectInfo(mem, CL_MEM_TYPE, sizeof(cl_mem_object_type), result, C_NULL) + return result[] + elseif s == :flags + result = Ref{cl_mem_flags}() + clGetMemObjectInfo(mem, CL_MEM_FLAGS, sizeof(cl_mem_flags), result, C_NULL) + mf = result[] + flags = Symbol[] + if (mf & CL_MEM_READ_WRITE) != 0 + push!(flags, :rw) + end + if (mf & CL_MEM_WRITE_ONLY) != 0 + push!(flags, :w) + end + if (mf & CL_MEM_READ_ONLY) != 0 + push!(flags, :r) + end + if (mf & CL_MEM_USE_HOST_PTR) != 0 + push!(flags, :use) + end + if (mf & CL_MEM_ALLOC_HOST_PTR) != 0 + push!(flags, :alloc) + end + if (mf & CL_MEM_COPY_HOST_PTR) != 0 + push!(flags, :copy) + end + return tuple(flags...) + elseif s == :size + result = Ref{Csize_t}() + clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(Csize_t), result, C_NULL) + return result[] + elseif s == :reference_count + result = Ref{Cuint}() + clGetMemObjectInfo(mem, CL_MEM_REFERENCE_COUNT, sizeof(Cuint), result, C_NULL) + return Int(result[]) + elseif s == :map_count + result = Ref{Cuint}() + clGetMemObjectInfo(mem, CL_MEM_MAP_COUNT, sizeof(Cuint), result, C_NULL) + return Int(result[]) + elseif s == :device_address + result = Ref{cl_mem_device_address_ext}() + clGetMemObjectInfo(mem, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), result, C_NULL) + return CLPtr{Cvoid}(result[]) + else + return getfield(mem, s) + end +end + +# for passing buffers to OpenCL APIs: use the underlying handle +Base.unsafe_convert(::Type{cl_mem}, mem::AbstractMemoryObject) = mem.id + +# for passing buffers to kernels: pass the private device pointer +Base.convert(::Type{CLPtr{T}}, mem::AbstractMemoryObject) where {T} = + convert(CLPtr{T}, pointer(mem)) +# XXX: for passing buffers directly, we can support non-BDA drivers +# by postponing the conversion to `cl.set_arg!` +#Base.unsafe_convert(::Type{<:Ptr}, mem::AbstractMemoryObject) = mem + +include("memory/buffer.jl") + +#TODO: enqueue_migrate_mem_objects(queue, mem_objects, flags=0, wait_for=None) +#TODO: enqueue_migrate_mem_objects_ext(queue, mem_objects, flags=0, wait_for=None) + + +## pointer-based memory + +include("memory/usm.jl") +include("memory/svm.jl") diff --git a/lib/cl/memory/bda.jl b/lib/cl/memory/bda.jl deleted file mode 100644 index 8cbd539d..00000000 --- a/lib/cl/memory/bda.jl +++ /dev/null @@ -1,33 +0,0 @@ -struct BufferDeviceMemory <: AbstractMemory - buf::Union{Buffer, Nothing} - ptr::CLPtr{Cvoid} - bytesize::Int - context::Context -end - -BufferDeviceMemory() = BufferDeviceMemory(nothing, CL_NULL, 0, context()) - -function bda_alloc(bytesize::Integer; - alignment::Integer = 0, device::Symbol = :rw, host::Symbol = :rw - ) - # TODO: use alignment - buf = Buffer(bytesize; device, host, device_private_address=true) - return BufferDeviceMemory(buf, buf.device_address, bytesize, context()) -end - -bda_free(mem::BufferDeviceMemory) = release(mem.buf) - -Base.pointer(mem::BufferDeviceMemory) = mem.ptr -Base.sizeof(mem::BufferDeviceMemory) = mem.bytesize -context(mem::BufferDeviceMemory) = mem.context - -Base.show(io::IO, mem::BufferDeviceMemory) = - @printf(io, "BufferDeviceMemory(%s at %p)", Base.format_bytes(sizeof(mem)), Int(pointer(mem))) - -Base.convert(::Type{Ptr{T}}, mem::BufferDeviceMemory) where {T} = - convert(Ptr{T}, pointer(mem)) - -Base.convert(::Type{CLPtr{T}}, mem::BufferDeviceMemory) where {T} = - reinterpret(CLPtr{T}, pointer(mem)) - -Base.convert(::Type{Buffer}, mem::BufferDeviceMemory) = mem.buf diff --git a/lib/cl/buffer.jl b/lib/cl/memory/buffer.jl similarity index 68% rename from lib/cl/buffer.jl rename to lib/cl/memory/buffer.jl index 22d9b326..0b08458c 100644 --- a/lib/cl/buffer.jl +++ b/lib/cl/memory/buffer.jl @@ -1,92 +1,17 @@ -# OpenCL Memory Object - -abstract type AbstractMemoryObject <: CLObject end - -#This should be implemented by all subtypes -# type MemoryType <: AbstractMemoryObject -# id::cl_mem -# ... -# end - -# for passing buffers to OpenCL APIs: use the underlying handle -Base.unsafe_convert(::Type{cl_mem}, mem::AbstractMemoryObject) = mem.id - -# for passing buffers to kernels: keep the buffer, it's handled by `cl.set_arg!` -Base.unsafe_convert(::Type{<:Ptr}, mem::AbstractMemoryObject) = mem - -Base.sizeof(mem::AbstractMemoryObject) = mem.size - -release(mem::AbstractMemoryObject) = clReleaseMemObject(mem) - -function Base.getproperty(mem::AbstractMemoryObject, s::Symbol) - if s == :context - param = Ref{cl_context}() - clGetMemObjectInfo(mem, CL_MEM_CONTEXT, sizeof(cl_context), param, C_NULL) - return Context(param[], retain = true) - elseif s == :mem_type - result = Ref{cl_mem_object_type}() - clGetMemObjectInfo(mem, CL_MEM_TYPE, sizeof(cl_mem_object_type), result, C_NULL) - return result[] - elseif s == :mem_flags - result = Ref{cl_mem_flags}() - clGetMemObjectInfo(mem, CL_MEM_FLAGS, sizeof(cl_mem_flags), result, C_NULL) - mf = result[] - flags = Symbol[] - if (mf & CL_MEM_READ_WRITE) != 0 - push!(flags, :rw) - end - if (mf & CL_MEM_WRITE_ONLY) != 0 - push!(flags, :w) - end - if (mf & CL_MEM_READ_ONLY) != 0 - push!(flags, :r) - end - if (mf & CL_MEM_USE_HOST_PTR) != 0 - push!(flags, :use) - end - if (mf & CL_MEM_ALLOC_HOST_PTR) != 0 - push!(flags, :alloc) - end - if (mf & CL_MEM_COPY_HOST_PTR) != 0 - push!(flags, :copy) - end - return tuple(flags...) - elseif s == :size - result = Ref{Csize_t}() - clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(Csize_t), result, C_NULL) - return result[] - elseif s == :reference_count - result = Ref{Cuint}() - clGetMemObjectInfo(mem, CL_MEM_REFERENCE_COUNT, sizeof(Cuint), result, C_NULL) - return Int(result[]) - elseif s == :map_count - result = Ref{Cuint}() - clGetMemObjectInfo(mem, CL_MEM_MAP_COUNT, sizeof(Cuint), result, C_NULL) - return Int(result[]) - elseif s == :device_address - result = Ref{cl_mem_device_address_ext}() - clGetMemObjectInfo(mem, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), result, C_NULL) - return CLPtr{Cvoid}(result[]) - else - return getfield(mem, s) - end -end - -# convenience functions -context(mem::AbstractMemoryObject) = mem.context -Base.pointer(mem::AbstractMemoryObject) = mem.pointer - -#TODO: enqueue_migrate_mem_objects(queue, mem_objects, flags=0, wait_for=None) -#TODO: enqueue_migrate_mem_objects_ext(queue, mem_objects, flags=0, wait_for=None) - # OpenCL.Buffer struct Buffer <: AbstractMemoryObject id::cl_mem + ptr::Union{Nothing,CLPtr{Cvoid}} bytesize::Int + context::Context end +Buffer() = Buffer(C_NULL, nothing, 0, context()) + +Base.pointer(buf::Buffer) = @something buf.ptr error("Buffer does not have a device private address") Base.sizeof(buf::Buffer) = buf.bytesize +context(buf::Buffer) = buf.context ## constructors @@ -130,7 +55,16 @@ function Buffer(sz::Int, flags::Integer, hostbuf=nothing; if err_code[] != CL_SUCCESS throw(CLError(err_code[])) end - return Buffer(mem_id, sz) + + ptr = if device_private_address + ptr_ref = Ref{cl_mem_device_address_ext}() + clGetMemObjectInfo(mem_id, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), ptr_ref, C_NULL) + CLPtr{Cvoid}(ptr_ref[]) + else + nothing + end + + return Buffer(mem_id, ptr, sz, context()) end # allocated buffer diff --git a/lib/cl/memory/memory.jl b/lib/cl/memory/memory.jl deleted file mode 100644 index 8f8f65ea..00000000 --- a/lib/cl/memory/memory.jl +++ /dev/null @@ -1,22 +0,0 @@ -# Raw memory management - -export device_alloc, host_alloc, shared_alloc, svm_alloc, free, bda_alloc - -# -# untyped buffers -# - -abstract type AbstractMemory end - -Base.convert(T::Type{<:Union{Ptr, CLPtr}}, buf::AbstractMemory) = - throw(ArgumentError("Illegal conversion of a $(typeof(buf)) to a $T")) - -# ccall integration -# -# taking the pointer of a buffer means returning the underlying pointer, -# and not the pointer of the buffer object itself. -Base.unsafe_convert(P::Type{<:Union{Ptr, CLPtr}}, buf::AbstractMemory) = convert(P, buf) - -include("bda.jl") -include("usm.jl") -include("svm.jl") diff --git a/lib/cl/memory/svm.jl b/lib/cl/memory/svm.jl index 485d6d68..e33e3cfa 100644 --- a/lib/cl/memory/svm.jl +++ b/lib/cl/memory/svm.jl @@ -1,4 +1,4 @@ -struct SharedVirtualMemory <: AbstractMemory +struct SharedVirtualMemory <: AbstractPointerMemory ptr::CLPtr{Cvoid} bytesize::Int context::Context diff --git a/lib/cl/memory/usm.jl b/lib/cl/memory/usm.jl index 002da65c..a12bc52f 100644 --- a/lib/cl/memory/usm.jl +++ b/lib/cl/memory/usm.jl @@ -1,4 +1,4 @@ -abstract type UnifiedMemory <: AbstractMemory end +abstract type UnifiedMemory <: AbstractPointerMemory end function usm_free(mem::UnifiedMemory; blocking::Bool = false) if blocking diff --git a/lib/cl/state.jl b/lib/cl/state.jl index 9cc57d35..c75adc60 100644 --- a/lib/cl/state.jl +++ b/lib/cl/state.jl @@ -199,10 +199,10 @@ function default_memory_backend(dev::Device) backend = if backend_str == "usm" USMBackend() - elseif backend_str == "bda" - BDABackend() elseif backend_str == "svm" SVMBackend() + elseif backend_str == "bda" + BDABackend() else error("Unknown memory backend '$backend_str' requested") end diff --git a/src/array.jl b/src/array.jl index 91761300..cac82be4 100644 --- a/src/array.jl +++ b/src/array.jl @@ -99,7 +99,7 @@ function memory_type() elseif cl.memory_backend() == cl.SVMBackend() return cl.SharedVirtualMemory elseif cl.memory_backend() == cl.BDABackend() - return cl.BufferDeviceMemory + return cl.Buffer end end CLArray{T, N}(::UndefInitializer, dims::Dims{N}) where {T, N} = @@ -175,11 +175,14 @@ context(A::CLArray) = cl.context(A.data[].mem) memtype(x::CLArray) = memtype(typeof(x)) memtype(::Type{<:CLArray{<:Any, <:Any, M}}) where {M} = @isdefined(M) ? M : Any -is_device(a::CLArray) = memtype(a) == cl.UnifiedDeviceMemory -is_shared(a::CLArray) = memtype(a) == cl.UnifiedSharedMemory -is_host(a::CLArray) = memtype(a) == cl.UnifiedHostMemory -is_svm(a::CLArray) = memtype(a) == cl.SharedVirtualMemory -is_bda(a::CLArray) = memtype(a) == cl.BufferDeviceMemory +# can we read this array from the device (i.e. derive a CLPtr)? +is_device(a::CLArray) = + memtype(a) in (cl.UnifiedDeviceMemory, cl.UnifiedSharedMemory, cl.SharedVirtualMemory, cl.Buffer) +is_shared(a::CLArray) = + memtype(a) in (cl.UnifiedSharedMemory, cl.SharedVirtualMemory) +is_host(a::CLArray) = + memtype(a) in (cl.UnifiedHostMemory, cl.UnifiedSharedMemory, cl.SharedVirtualMemory) + ## derived types @@ -283,13 +286,16 @@ end ## interop with libraries function Base.unsafe_convert(::Type{Ptr{T}}, x::CLArray{T}) where {T} - if is_device(x) + if !is_host(x) throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) end return convert(Ptr{T}, x.data[]) + x.offset * Base.elsize(x) end function Base.unsafe_convert(::Type{CLPtr{T}}, x::CLArray{T}) where {T} + if !is_device(x) + throw(ArgumentError("cannot take the device address of a $(typeof(x))")) + end return convert(CLPtr{T}, x.data[]) + x.offset * Base.elsize(x) end diff --git a/src/memory.jl b/src/memory.jl index 665270ed..bdb49469 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -113,7 +113,7 @@ end ## public interface -function managed_alloc(t::Type{T}, bytes::Int; kwargs...) where T <: cl.AbstractMemory +function managed_alloc(t::Type{T}, bytes::Int; kwargs...) where T if bytes == 0 return Managed(T()) else @@ -142,12 +142,13 @@ function alloc(::Type{cl.SharedVirtualMemory}, bytes::Int; alignment::Int = 0) return Managed(mem) end -function alloc(::Type{cl.BufferDeviceMemory}, bytes::Int; alignment::Int = 0) - mem = cl.bda_alloc(bytes; alignment) - return Managed(mem) +function alloc(::Type{cl.Buffer}, bytes::Int; alignment::Int = 0) + # TODO: use alignment + buf = cl.Buffer(bytes; device_private_address = true) + return Managed(buf) end -function free(managed::Managed{<:cl.AbstractMemory}) +function free(managed::Managed) sizeof(managed) == 0 && return mem = managed.mem cl.context!(cl.context(mem)) do @@ -162,10 +163,10 @@ function free(managed::Managed{<:cl.AbstractMemory}) if mem isa cl.SharedVirtualMemory cl.svm_free(mem) - elseif mem isa cl.BufferDeviceMemory - cl.bda_free(mem) - else + elseif mem isa cl.UnifiedMemory cl.usm_free(mem) + else + cl.release(mem) end end diff --git a/src/util.jl b/src/util.jl index 4d9a1640..f510e4cf 100644 --- a/src/util.jl +++ b/src/util.jl @@ -85,10 +85,10 @@ function versioninfo(io::IO=stdout) suffix = backend == cl.default_memory_backend(device) ? "*" : "" if backend isa cl.SVMBackend push!(tags, "svm"*suffix) - elseif backend isa cl.BDABackend - push!(tags, "bda"*suffix) elseif backend isa cl.USMBackend push!(tags, "usm"*suffix) + elseif backend isa cl.BDABackend + push!(tags, "bda"*suffix) end end ## relevant extensions diff --git a/test/memory.jl b/test/memory.jl index 7ca07f81..6adc081c 100644 --- a/test/memory.jl +++ b/test/memory.jl @@ -17,8 +17,8 @@ buf = create_test_buffer() expectations = [ - (:mem_type, cl.CL_MEM_OBJECT_BUFFER), - (:mem_flags, (:rw, :copy)), + (:type, cl.CL_MEM_OBJECT_BUFFER), + (:flags, (:rw, :copy)), (:size, sizeof(buf)), (:reference_count, 1), (:map_count, 0) From 08fd080b2bf27f0cb5f798c5aab2d20040ba5cac Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 15:47:43 +0200 Subject: [PATCH 22/26] Work around early pointer conversion issue. --- lib/cl/memory/buffer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cl/memory/buffer.jl b/lib/cl/memory/buffer.jl index 0b08458c..75487f30 100644 --- a/lib/cl/memory/buffer.jl +++ b/lib/cl/memory/buffer.jl @@ -7,7 +7,7 @@ struct Buffer <: AbstractMemoryObject context::Context end -Buffer() = Buffer(C_NULL, nothing, 0, context()) +Buffer() = Buffer(C_NULL, CL_NULL, 0, context()) Base.pointer(buf::Buffer) = @something buf.ptr error("Buffer does not have a device private address") Base.sizeof(buf::Buffer) = buf.bytesize From 7be0d40024d88f8462470409784f9fe652baca5a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 19:25:37 +0200 Subject: [PATCH 23/26] Keep minimal support for non-BDA capable drivers. --- .github/workflows/Test.yml | 2 +- LocalPreferences.toml | 6 +++--- lib/cl/device.jl | 12 +++--------- lib/cl/kernel.jl | 2 +- lib/cl/memory.jl | 3 --- lib/cl/memory/buffer.jl | 4 ++-- lib/cl/state.jl | 28 +++++++++++++++++++--------- src/array.jl | 2 +- src/memory.jl | 2 +- src/util.jl | 4 ++-- 10 files changed, 33 insertions(+), 32 deletions(-) diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index d11996c3..5922ba28 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -26,7 +26,7 @@ jobs: os: [ubuntu-24.04, ubuntu-24.04-arm, macOS-13, macOS-15, windows-2025] arch: [x64, arm64] pocl: [jll, local] - memory_backend: [usm, bda, svm] + memory_backend: [usm, svm, buffer] exclude: # unsupported combinations - os: ubuntu-24.04 diff --git a/LocalPreferences.toml b/LocalPreferences.toml index 5a4504be..fee95c92 100644 --- a/LocalPreferences.toml +++ b/LocalPreferences.toml @@ -1,7 +1,7 @@ [OpenCL] # Which memory back-end to use for unspecified CLArray allocations. This can be: -# - "usm": Unified Shared Memory (`cl_intel_unified_shared_memory`) -# - "bda": plain buffers (`cl_mem` + `cl_ext_buffer_device_address`) -# - "svm": Shared Virtual Memory (coarse-grained) +# - "buffer": plain buffers (using pointers if `cl_ext_buffer_device_address` is available) +# - "usm": Unified Shared Memory (requiring `cl_intel_unified_shared_memory`) +# - "svm": Shared Virtual Memory (requiring coarse-grained SVM support) # If unspecified, the default will be used based on the platform and device capabilities. #default_memory_backend="..." diff --git a/lib/cl/device.jl b/lib/cl/device.jl index 0651b32c..6622efcd 100644 --- a/lib/cl/device.jl +++ b/lib/cl/device.jl @@ -190,15 +190,7 @@ function exec_capabilities(d::Device) ) end -function bda_supported(d::Device) - "cl_ext_buffer_device_address" in d.extensions || return false - return true -end - -function usm_supported(d::Device) - "cl_intel_unified_shared_memory" in d.extensions || return false - return true -end +usm_supported(d::Device) = "cl_intel_unified_shared_memory" in d.extensions function usm_capabilities(d::Device) usm_supported(d) || throw(ArgumentError("Unified Shared Memory not supported on this device")) @@ -261,6 +253,8 @@ function svm_capabilities(d::Device) ) end +bda_supported(d::Device) = false#"cl_ext_buffer_device_address" in d.extensions + function cl_device_type(dtype::Symbol) if dtype == :all cl_dtype = CL_DEVICE_TYPE_ALL diff --git a/lib/cl/kernel.jl b/lib/cl/kernel.jl index f5615704..4770d9af 100644 --- a/lib/cl/kernel.jl +++ b/lib/cl/kernel.jl @@ -69,7 +69,7 @@ function set_arg!(k::Kernel, idx::Integer, arg::CLPtr{T}) where {T} end # raw memory -function set_arg!(k::Kernel, idx::Integer, arg::AbstractMemory) +function set_arg!(k::Kernel, idx::Integer, arg::AbstractPointerMemory) # XXX: this assumes that the receiving argument is pointer-typed, which is not the case # with Julia's `Ptr` ABI. Instead, one should reinterpret the pointer as a # `Core.LLVMPtr`, which _is_ pointer-valued. We retain this handling for `Ptr` for diff --git a/lib/cl/memory.jl b/lib/cl/memory.jl index 88df52a3..6ad90034 100644 --- a/lib/cl/memory.jl +++ b/lib/cl/memory.jl @@ -83,9 +83,6 @@ Base.unsafe_convert(::Type{cl_mem}, mem::AbstractMemoryObject) = mem.id # for passing buffers to kernels: pass the private device pointer Base.convert(::Type{CLPtr{T}}, mem::AbstractMemoryObject) where {T} = convert(CLPtr{T}, pointer(mem)) -# XXX: for passing buffers directly, we can support non-BDA drivers -# by postponing the conversion to `cl.set_arg!` -#Base.unsafe_convert(::Type{<:Ptr}, mem::AbstractMemoryObject) = mem include("memory/buffer.jl") diff --git a/lib/cl/memory/buffer.jl b/lib/cl/memory/buffer.jl index 75487f30..dfd198d8 100644 --- a/lib/cl/memory/buffer.jl +++ b/lib/cl/memory/buffer.jl @@ -9,7 +9,7 @@ end Buffer() = Buffer(C_NULL, CL_NULL, 0, context()) -Base.pointer(buf::Buffer) = @something buf.ptr error("Buffer does not have a device private address") +Base.pointer(buf::Buffer) = @something buf.ptr error("Conversion of a buffer to a pointer is not supported by this device") Base.sizeof(buf::Buffer) = buf.bytesize context(buf::Buffer) = buf.context @@ -18,7 +18,7 @@ context(buf::Buffer) = buf.context # for internal use function Buffer(sz::Int, flags::Integer, hostbuf=nothing; - device=:rw, host=:rw, device_private_address=false) + device=:rw, host=:rw, device_private_address=bda_supported(cl.device())) if device == :rw flags |= CL_MEM_READ_WRITE elseif device == :r diff --git a/lib/cl/state.jl b/lib/cl/state.jl index c75adc60..de44bd5d 100644 --- a/lib/cl/state.jl +++ b/lib/cl/state.jl @@ -161,7 +161,7 @@ end abstract type AbstractMemoryBackend end struct SVMBackend <: AbstractMemoryBackend end struct USMBackend <: AbstractMemoryBackend end -struct BDABackend <: AbstractMemoryBackend end +struct BufferBackend <: AbstractMemoryBackend end function supported_memory_backends(dev::Device) backends = AbstractMemoryBackend[] @@ -175,10 +175,10 @@ function supported_memory_backends(dev::Device) end end - # plain old device buffers are second choice, but require an extension to support being - # referenced by raw pointers. + # plain old buffers are always supported, but we only want to use them if we have the + # buffer device address extension, which allows us to reference them by raw pointers. if bda_supported(dev) - push!(backends, BDABackend()) + push!(backends, BufferBackend()) end # shared virtual memory is last, because it comes at a performance cost. @@ -187,12 +187,17 @@ function supported_memory_backends(dev::Device) push!(backends, SVMBackend()) end + if isempty(backends) + # as a last resort, use plain buffers without the ability to reference by pointer. + # this severely limits compatibility, but it's better than nothing. + push!(backends, BufferBackend()) + end + return backends end function default_memory_backend(dev::Device) supported_backends = supported_memory_backends(dev) - isempty(supported_backends) && return nothing backend_str = load_preference(OpenCL, "default_memory_backend") backend_str === nothing && return first(supported_backends) @@ -201,8 +206,8 @@ function default_memory_backend(dev::Device) USMBackend() elseif backend_str == "svm" SVMBackend() - elseif backend_str == "bda" - BDABackend() + elseif backend_str == "buffer" + BufferBackend() else error("Unknown memory backend '$backend_str' requested") end @@ -212,9 +217,14 @@ end function memory_backend() return get!(task_local_storage(), :CLMemoryBackend) do - backend = default_memory_backend(device()) + dev = device() + backend = default_memory_backend(dev) if backend === nothing - error("Device $(device()) does not support any of the available memory backends") + error("Device $(dev) does not support any of the available memory backends") + end + if backend === BufferBackend() && !bda_supported(dev) + @warn """Your device $(dev.name) does not support the necessary extensions for OpenCL.jl's memory management (requiring either USM, coarse-grained SVM, or BDA). + Falling back to plain OpenCL buffers, which severely limits compatibility with other OpenCL.jl, only supporting OpenCL C kernels.""" maxlog=1 _id="memory_backend_$(dev.name)" end backend end diff --git a/src/array.jl b/src/array.jl index cac82be4..ee5ee637 100644 --- a/src/array.jl +++ b/src/array.jl @@ -98,7 +98,7 @@ function memory_type() return cl.UnifiedDeviceMemory elseif cl.memory_backend() == cl.SVMBackend() return cl.SharedVirtualMemory - elseif cl.memory_backend() == cl.BDABackend() + elseif cl.memory_backend() == cl.BufferBackend() return cl.Buffer end end diff --git a/src/memory.jl b/src/memory.jl index bdb49469..94d8e44c 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -144,7 +144,7 @@ end function alloc(::Type{cl.Buffer}, bytes::Int; alignment::Int = 0) # TODO: use alignment - buf = cl.Buffer(bytes; device_private_address = true) + buf = cl.Buffer(bytes) return Managed(buf) end diff --git a/src/util.jl b/src/util.jl index f510e4cf..d2166025 100644 --- a/src/util.jl +++ b/src/util.jl @@ -87,8 +87,8 @@ function versioninfo(io::IO=stdout) push!(tags, "svm"*suffix) elseif backend isa cl.USMBackend push!(tags, "usm"*suffix) - elseif backend isa cl.BDABackend - push!(tags, "bda"*suffix) + elseif backend isa cl.BufferBackend + push!(tags, "buffer"*suffix) end end ## relevant extensions From e5fd0261715e21b05067e397302a9066f0423fea Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 19 Jun 2025 21:45:32 +0200 Subject: [PATCH 24/26] Oops. --- lib/cl/device.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cl/device.jl b/lib/cl/device.jl index 6622efcd..dd1c4a26 100644 --- a/lib/cl/device.jl +++ b/lib/cl/device.jl @@ -253,7 +253,7 @@ function svm_capabilities(d::Device) ) end -bda_supported(d::Device) = false#"cl_ext_buffer_device_address" in d.extensions +bda_supported(d::Device) = "cl_ext_buffer_device_address" in d.extensions function cl_device_type(dtype::Symbol) if dtype == :all From 54d96100f0bb0133abe7d78e6d9977d566cc1ae9 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Jun 2025 08:50:33 +0200 Subject: [PATCH 25/26] Improve capability reporting. --- src/util.jl | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/src/util.jl b/src/util.jl index d2166025..70a002ff 100644 --- a/src/util.jl +++ b/src/util.jl @@ -80,16 +80,31 @@ function versioninfo(io::IO=stdout) # show a list of tags tags = [] - ## memory back-end - for backend in cl.supported_memory_backends(device) - suffix = backend == cl.default_memory_backend(device) ? "*" : "" - if backend isa cl.SVMBackend - push!(tags, "svm"*suffix) - elseif backend isa cl.USMBackend - push!(tags, "usm"*suffix) - elseif backend isa cl.BufferBackend - push!(tags, "buffer"*suffix) + ## memory back-ends + let + svm_tags = [] + svm_caps = cl.svm_capabilities(device) + if svm_caps.coarse_grain_buffer + push!(svm_tags, "c") end + if svm_caps.fine_grain_buffer + push!(svm_tags, "f") + end + push!(tags, "svm:"*join(svm_tags, "+")) + end + if cl.usm_supported(device) + usm_tags = [] + usm_caps = cl.usm_capabilities(device) + if usm_caps.host.access + push!(usm_tags, "h") + end + if usm_caps.device.access + push!(usm_tags, "d") + end + push!(tags, "usm:"*join(usm_tags, "+")) + end + if cl.bda_supported(device) + push!(tags, "bda") end ## relevant extensions if in("cl_khr_fp16", device.extensions) From f9b0d012677e7a1baae8f6c7772f357d04bcd727 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 20 Jun 2025 08:52:22 +0200 Subject: [PATCH 26/26] Fix CI. --- .github/workflows/Test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index 5922ba28..dd7ff2b1 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -127,13 +127,13 @@ jobs: run(```$(cmake()) --build $builddir --parallel $(Sys.CPU_THREADS) --target install```) end' - echo '[pocl_jll]' > test/LocalPreferences.toml + echo '[pocl_jll]' >> test/LocalPreferences.toml echo 'libpocl_path="${{ github.workspace }}/target/lib/libpocl.so"' >> test/LocalPreferences.toml - name: Setup OpenCL.jl run: | - echo '[OpenCL]' > test/LocalPreferences.toml - echo 'memory_backend="${{ matrix.memory_backend }}"' >> test/LocalPreferences.toml + echo '[OpenCL]' >> test/LocalPreferences.toml + echo 'default_memory_backend="${{ matrix.memory_backend }}"' >> test/LocalPreferences.toml julia --project -e ' using Pkg Pkg.develop(path="lib/intrinsics")'