[WIP] add dynamic localmem support #11

Draft: wants to merge 2 commits into main

28 changes: 22 additions & 6 deletions src/KernelAbstractions.jl
@@ -79,16 +79,26 @@ Query the workgroupsize on the device.
"""
function groupsize end

const shmem_id = Ref(0)

"""
@localmem T dims
"""
macro localmem(T, dims)
id = (shmem_id[]+= 1)
# Stay in sync with CUDAnative
id = gensym("static_shmem")
quote
$SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id))))
end
end

"""
@dynamic_localmem T N
@dynamic_localmem T (workgroupsize)->N

Allocate workgroup-shared memory with element type T. The number of elements is
either the fixed count N or is computed by calling the given function with the workgroup size.
"""
macro dynamic_localmem(T, N, previous...)
# Stay in sync with CUDAnative
id = gensym("dynamic_shmem")
quote
$SharedMemory($(esc(T)), Val($(esc(dims))), Val($id))
$DynamicSharedMemory($(esc(T)), $(esc(N)), Val($(QuoteNode(id))), $(map(esc, previous)...))
end
end
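
For reference, a minimal usage sketch of the new macro inside a kernel (not part of this diff; the kernel and array names are hypothetical). The first form derives the element count from the workgroup size at launch, the second requests a fixed count:

@kernel function dyn_sketch(A)
    I = @index(Global, Linear)
    i = @index(Local, Linear)
    buf = @dynamic_localmem eltype(A) (wksize)->2*wksize  # count derived from the workgroup size
    tmp = @dynamic_localmem Int 128                       # fixed element count
    tmp[i] = i
    buf[2*i] = A[I]
    @synchronize
    A[I] = buf[2*tmp[i]]
end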

@@ -196,6 +206,8 @@ struct Kernel{Device, WorkgroupSize<:_Size, NDRange<:_Size, Fun}
f::Fun
end

function allocator end

workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize} = WorkgroupSize
ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize,NDRange} = NDRange

@@ -281,11 +293,15 @@ include("macros.jl")
###

function Scratchpad(::Type{T}, ::Val{Dims}) where {T, Dims}
throw(MethodError(ScratchArray, (T, Val(Dims))))
throw(MethodError(Scratchpad, (T, Val(Dims))))
end

function SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
throw(MethodError(ScratchArray, (T, Val(Dims), Val(Id))))
throw(MethodError(SharedMemory, (T, Val(Dims), Val(Id))))
end

function DynamicSharedMemory(::Type{T}, N, ::Val{Id}, previous...) where {T, Id}
throw(MethodError(DynamicSharedMemory, (T, N, Val(Id), previous...)))
end

function __synchronize()
10 changes: 10 additions & 0 deletions src/backends/cpu.jl
@@ -108,6 +108,16 @@ generate_overdubs(CPUCtx)
MArray{__size(Dims), T}(undef)
end

@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(DynamicSharedMemory), ::Type{T}, alloc, ::Val, previous...) where T
if alloc isa Function
N = alloc(__groupsize(ctx.metadata))
else
N = alloc
end
@assert N isa Int
Vector{T}(undef, N)
end
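
The element count passed to @dynamic_localmem can be either a callable of the workgroup size or a plain integer; a standalone sketch of that resolution rule (mirroring the branch above; resolve_count is a hypothetical helper, not part of this PR):

resolve_count(alloc::Function, wksize::Int) = alloc(wksize)  # callable: invoke with the workgroup size
resolve_count(alloc::Int, ::Int) = alloc                     # integer: already an element count

resolve_count(wksize -> 2*wksize, 16)  # 32, so the CPU backend would allocate Vector{T}(undef, 32)
resolve_count(64, 16)                  # 64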

###
# CPU implementation of scratch memory
# - private memory for each workitem
19 changes: 18 additions & 1 deletion src/backends/cuda.jl
@@ -203,7 +203,24 @@ end
###
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(SharedMemory), ::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
ptr = CUDAnative._shmem(Val(Id), T, Val(prod(Dims)))
CUDAnative.CuDeviceArray(Dims, CUDAnative.DevicePtr{T, CUDAnative.AS.Shared}(ptr))
CUDAnative.CuDeviceArray(Dims, ptr)
end

@inline get_offset(threads) = 0
@inline get_offset(threads, elem::Tuple{Int, Int}) = elem[1] * elem[2]
@inline get_offset(threads, elem::Tuple{Int, <:Function}) = elem[1] * elem[2](threads)
@inline get_offset(threads, elem, args...) = get_offset(threads, elem) + get_offset(threads, args...)

@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(DynamicSharedMemory), ::Type{T}, alloc, ::Val{Id}, previous::Vararg{Any, N}) where {T, Id, N}
ptr = CUDAnative._shmem(Val(Id), T, Val(0))
nthreads = __gpu_groupsize(ctx.metadata)
offset = get_offset(nthreads, previous...)
if alloc isa Function
size = alloc(nthreads)
else
size = alloc
end
CUDAnative.CuDeviceArray{T}((size,), ptr + offset)
end
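
A worked example of the offset arithmetic, with made-up allocation descriptors of the (bytes_per_element, count) shape that get_offset dispatches on: each earlier allocation contributes bytes times elements (calling the count function with the thread count where needed), and the new array begins past their sum:

nthreads = 64
prev1 = (sizeof(Float64), wksize -> 2*wksize)  # 8 bytes * 128 elements = 1024 bytes
prev2 = (sizeof(Int32), 256)                   # 4 bytes * 256 elements = 1024 bytes
offset = prev1[1] * prev1[2](nthreads) + prev2[1] * prev2[2]
@assert offset == 2048  # same result as get_offset(nthreads, prev1, prev2)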

###
42 changes: 41 additions & 1 deletion src/macros.jl
@@ -32,10 +32,12 @@ function __kernel(expr)
# 2. CPU function with work-group loops inserted
gpu_name = esc(gensym(Symbol(:gpu_, name)))
cpu_name = esc(gensym(Symbol(:cpu_, name)))
alloc_name = esc(gensym(Symbol(:alloc_, name)))
name = esc(name)

gpu_decl = Expr(:call, gpu_name, arglist...)
cpu_decl = Expr(:call, cpu_name, arglist...)
alloc_decl = Expr(:call, alloc_name, arglist...)

# Without the deepcopy we might accidentally modify expr shared between CPU and GPU
gpu_body = transform_gpu(deepcopy(body), args)
@@ -44,6 +46,9 @@ function __kernel(expr)
cpu_body = transform_cpu(deepcopy(body), args)
cpu_function = Expr(:function, cpu_decl, cpu_body)

dynamic_allocator_body = transform_dynamic(body, args)
dynamic_allocator_function = Expr(:function, alloc_decl, dynamic_allocator_body)

# create constructor functions
constructors = quote
$name(dev::Device) = $name(dev, $DynamicSize(), $DynamicSize())
@@ -55,9 +60,11 @@ function __kernel(expr)
function $name(::Device, ::S, ::NDRange) where {Device<:$GPU, S<:$_Size, NDRange<:$_Size}
return $Kernel{Device, S, NDRange, typeof($gpu_name)}($gpu_name)
end
KernelAbstractions.allocator(::typeof($gpu_name)) = $alloc_name
KernelAbstractions.allocator(::typeof($cpu_name)) = $alloc_name
end

return Expr(:toplevel, cpu_function, gpu_function, constructors)
return Expr(:toplevel, cpu_function, gpu_function, dynamic_allocator_function, constructors)
end

# Transform function for GPU execution
@@ -83,6 +90,7 @@ function split(stmts)
# 2. Aggregate the index and allocation expressions seen at the sync points
indicies = Any[]
allocations = Any[]
dynamic_allocs = Any[]
loops = Any[]
current = Any[]

@@ -102,6 +110,15 @@ function split(stmts)
elseif callee === Symbol("@localmem") || callee === Symbol("@private")
push!(allocations, stmt)
continue
elseif callee === Symbol("@dynamic_localmem")
# args[2] LineNumberNode
Tvar = rhs.args[3]
size = rhs.args[4]
# add all previous dynamic allocations
append!(rhs.args, dynamic_allocs)
push!(allocations, stmt)
push!(dynamic_allocs, Expr(:tuple, Tvar, size))
continue
end
end
end
@@ -161,3 +178,26 @@ function transform_cpu(stmts, args)
push!(new_stmts, :(return nothing))
return Expr(:block, new_stmts...)
end

function transform_dynamic(stmts, args)
# 1. Find dynamic allocations
allocators = Expr[]
for stmt in stmts.args
if isexpr(stmt, :(=))
rhs = stmt.args[2]
if isexpr(rhs, :macrocall)
callee = rhs.args[1]
if callee === Symbol("@dynamic_localmem")
# args[2] LineNumberNode
Tvar = rhs.args[3]
size = rhs.args[4]
push!(allocators, Expr(:tuple, Expr(:call, :sizeof, Tvar), size))
continue
end
end
end
end
return Expr(:block, Expr(:tuple, allocators...))
end
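
So the generated allocator evaluates to a tuple with one (sizeof(T), count) pair per @dynamic_localmem in the kernel body. A sketch of what it would return for a kernel that allocates a Float64 buffer sized by the workgroup size plus a fixed 128-element Int buffer (values mirror the check in test/macros.jl):

allocs = ((sizeof(Float64), wksize -> 2*wksize), (sizeof(Int), 128))
bytes, count = allocs[1]
bytes      # == 8
count(32)  # == 64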


58 changes: 58 additions & 0 deletions test/localmem.jl
@@ -0,0 +1,58 @@
using KernelAbstractions
using Test
using CUDAapi
if has_cuda_gpu()
using CuArrays
CuArrays.allowscalar(false)
end

@kernel function dynamic(A)
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @dynamic_localmem eltype(A) (wkrgpsize)->2*wkrgpsize
lmem[2*i] = A[I]
@synchronize
A[I] = lmem[2*i]
end

@kernel function dynamic2(A)
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @dynamic_localmem eltype(A) (wkrgpsize)->2*wkrgpsize
lmem2 = @dynamic_localmem Int (wkrgpsize)->wkrgpsize
lmem2[i] = i
lmem[2*i] = A[I]
@synchronize
A[I] = lmem[2*lmem2[i]]
end

@kernel function dynamic_mixed(A)
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @dynamic_localmem eltype(A) (wkrgpsize)->2*wkrgpsize
lmem2 = @localmem Int groupsize() # Ok iff groupsize is static
lmem2[i] = i
lmem[2*i] = A[I]
@synchronize
A[I] = lmem[2*lmem2[i]]
end


function harness(backend, ArrayT)
A = ArrayT{Float64}(undef, 16, 16)
A .= 1.0
wait(dynamic(backend, 16)(A, ndrange=size(A)))

A = ArrayT{Float64}(undef, 16, 16)
wait(dynamic2(backend, 16)(A, ndrange=size(A)))

A = ArrayT{Float64}(undef, 16, 16)
wait(dynamic_mixed(backend, 16)(A, ndrange=size(A)))
end

@testset "kernels" begin
harness(CPU(), Array)
if has_cuda_gpu()
harness(CUDA(), CuArray)
end
end
26 changes: 26 additions & 0 deletions test/macros.jl
@@ -0,0 +1,26 @@
using KernelAbstractions
using Test

@kernel function nodynamic(A)
I = @index(Global, Linear)
A[I] = I
end

@kernel function dynamic(A)
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @dynamic_localmem eltype(A) (wkrgpsize)->2*wkrgpsize
lmem[2*i] = A[I]
@synchronize
A[I] = lmem[2*i]
end

nodyn_kernel = nodynamic(CPU())
dyn_kernel = dynamic(CPU())

@test KernelAbstractions.allocator(nodyn_kernel.f)(zeros(32, 32)) == ()
let allocs = KernelAbstractions.allocator(dyn_kernel.f)(zeros(32, 32))
bytes, alloc = allocs[1]
@test bytes == sizeof(Float64)
@test alloc(32) == 64
end
8 changes: 8 additions & 0 deletions test/runtests.jl
@@ -5,4 +5,12 @@ using Test
include("test.jl")
end

@testset "Macro" begin
include("macros.jl")
end

@testset "Localmem" begin
include("localmem.jl")
end

include("examples.jl")