[WIP] add dynamic localmem support #11

Draft: wants to merge 2 commits into main

28 changes: 22 additions & 6 deletions src/KernelAbstractions.jl
@@ -79,16 +79,26 @@ Query the workgroupsize on the device.
"""
function groupsize end

const shmem_id = Ref(0)

"""
@localmem T dims
"""
macro localmem(T, dims)
id = (shmem_id[]+= 1)
# Stay in sync with CUDAnative
id = gensym("static_shmem")
quote
$SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id))))
end
end

"""
@dynamic_localmem T N
@dynamic_localmem T (workgroupsize)->N

Allocate workgroup-shared memory with element type T. The number of elements is
either the fixed count N or is computed by calling the given function with the workgroup size.
"""
macro dynamic_localmem(T, N, previous...)
# Stay in sync with CUDAnative
id = gensym("dynamic_shmem")
quote
$SharedMemory($(esc(T)), Val($(esc(dims))), Val($id))
$DynamicSharedMemory($(esc(T)), $(esc(N)), Val($(QuoteNode(id))), $(map(esc, previous)...))
end
end
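
For reference, a minimal usage sketch of the new macro inside a kernel (not part of this diff; the kernel and array names are hypothetical). The first form derives the element count from the workgroup size at launch, the second requests a fixed count:

@kernel function dyn_sketch(A)
    I = @index(Global, Linear)
    i = @index(Local, Linear)
    buf = @dynamic_localmem eltype(A) (wksize)->2*wksize  # count derived from the workgroup size
    tmp = @dynamic_localmem Int 128                       # fixed element count
    tmp[i] = i
    buf[2*i] = A[I]
    @synchronize
    A[I] = buf[2*tmp[i]]
end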

@@ -196,6 +206,8 @@ struct Kernel{Device, WorkgroupSize<:_Size, NDRange<:_Size, Fun}
f::Fun
end

function allocator end

workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize} = WorkgroupSize
ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize,NDRange} = NDRange

@@ -281,11 +293,15 @@ include("macros.jl")
###

function Scratchpad(::Type{T}, ::Val{Dims}) where {T, Dims}
throw(MethodError(ScratchArray, (T, Val(Dims))))
throw(MethodError(Scratchpad, (T, Val(Dims))))
end

function SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
throw(MethodError(ScratchArray, (T, Val(Dims), Val(Id))))
throw(MethodError(SharedMemory, (T, Val(Dims), Val(Id))))
end

function DynamicSharedMemory(::Type{T}, N, ::Val{Id}, previous...) where {T, Id}
throw(MethodError(DynamicSharedMemory, (T, N, Val(Id), previous...)))
end

function __synchronize()
10 changes: 10 additions & 0 deletions src/backends/cpu.jl
@@ -108,6 +108,16 @@ generate_overdubs(CPUCtx)
MArray{__size(Dims), T}(undef)
end

@inline function Cassette.overdub(ctx::CPUCtx, ::typeof(DynamicSharedMemory), ::Type{T}, alloc, ::Val, previous...) where T
if alloc isa Function
N = alloc(__groupsize(ctx.metadata))
else
N = alloc
end
@assert N isa Int
Vector{T}(undef, N)
end
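
The element count passed to @dynamic_localmem can be either a callable of the workgroup size or a plain integer; a standalone sketch of that resolution rule (mirroring the branch above; resolve_count is a hypothetical helper, not part of this PR):

resolve_count(alloc::Function, wksize::Int) = alloc(wksize)  # callable: invoke with the workgroup size
resolve_count(alloc::Int, ::Int) = alloc                     # integer: already an element count

resolve_count(wksize -> 2*wksize, 16)  # 32, so the CPU backend would allocate Vector{T}(undef, 32)
resolve_count(64, 16)                  # 64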

###
# CPU implementation of scratch memory
# - private memory for each workitem
19 changes: 18 additions & 1 deletion src/backends/cuda.jl
@@ -203,7 +203,24 @@ end
###
@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(SharedMemory), ::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
ptr = CUDAnative._shmem(Val(Id), T, Val(prod(Dims)))
CUDAnative.CuDeviceArray(Dims, CUDAnative.DevicePtr{T, CUDAnative.AS.Shared}(ptr))
CUDAnative.CuDeviceArray(Dims, ptr)
end

@inline get_offset(threads) = 0
@inline get_offset(threads, elem::Tuple{Int, Int}) = elem[1] * elem[2]
@inline get_offset(threads, elem::Tuple{Int, <:Function}) = elem[1] * elem[2](threads)
@inline get_offset(threads, elem, args...) = get_offset(threads, elem) + get_offset(threads, args...)

@inline function Cassette.overdub(ctx::CUDACtx, ::typeof(DynamicSharedMemory), ::Type{T}, alloc, ::Val{Id}, previous::Vararg{Any, N}) where {T, Id, N}
ptr = CUDAnative._shmem(Val(Id), T, Val(0))
nthreads = __gpu_groupsize(ctx.metadata)
offset = get_offset(nthreads, previous...)
if alloc isa Function
size = alloc(nthreads)
else
size = alloc
end
CUDAnative.CuDeviceArray{T}((size,), ptr + offset)
end
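
A worked example of the offset arithmetic, with made-up allocation descriptors of the (bytes_per_element, count) shape that get_offset dispatches on: each earlier allocation contributes bytes times elements (calling the count function with the thread count where needed), and the new array begins past their sum:

nthreads = 64
prev1 = (sizeof(Float64), wksize -> 2*wksize)  # 8 bytes * 128 elements = 1024 bytes
prev2 = (sizeof(Int32), 256)                   # 4 bytes * 256 elements = 1024 bytes
offset = prev1[1] * prev1[2](nthreads) + prev2[1] * prev2[2]
@assert offset == 2048  # same result as get_offset(nthreads, prev1, prev2)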

###
42 changes: 41 additions & 1 deletion src/macros.jl
@@ -32,10 +32,12 @@ function __kernel(expr)
# 2. CPU function with work-group loops inserted
gpu_name = esc(gensym(Symbol(:gpu_, name)))
cpu_name = esc(gensym(Symbol(:cpu_, name)))
alloc_name = esc(gensym(Symbol(:alloc_, name)))
name = esc(name)

gpu_decl = Expr(:call, gpu_name, arglist...)
cpu_decl = Expr(:call, cpu_name, arglist...)
alloc_decl = Expr(:call, alloc_name, arglist...)

# Without the deepcopy we might accidentally modify expr shared between CPU and GPU
gpu_body = transform_gpu(deepcopy(body), args)
@@ -44,6 +46,9 @@ function __kernel(expr)
cpu_body = transform_cpu(deepcopy(body), args)
cpu_function = Expr(:function, cpu_decl, cpu_body)

dynamic_allocator_body = transform_dynamic(body, args)
dynamic_allocator_function = Expr(:function, alloc_decl, dynamic_allocator_body)

# create constructor functions
constructors = quote
$name(dev::Device) = $name(dev, $DynamicSize(), $DynamicSize())
@@ -55,9 +60,11 @@ function __kernel(expr)
function $name(::Device, ::S, ::NDRange) where {Device<:$GPU, S<:$_Size, NDRange<:$_Size}
return $Kernel{Device, S, NDRange, typeof($gpu_name)}($gpu_name)
end
KernelAbstractions.allocator(::typeof($gpu_name)) = $alloc_name
KernelAbstractions.allocator(::typeof($cpu_name)) = $alloc_name
end

return Expr(:toplevel, cpu_function, gpu_function, constructors)
return Expr(:toplevel, cpu_function, gpu_function, dynamic_allocator_function, constructors)
end

# Transform function for GPU execution
@@ -83,6 +90,7 @@ function split(stmts)
# 2. Aggregate the index and allocation expressions seen at the sync points
indicies = Any[]
allocations = Any[]
dynamic_allocs = Any[]
loops = Any[]
current = Any[]

@@ -102,6 +110,15 @@ function split(stmts)
elseif callee === Symbol("@localmem") || callee === Symbol("@private")
push!(allocations, stmt)
continue
elseif callee === Symbol("@dynamic_localmem")
# args[2] LineNumberNode
Tvar = rhs.args[3]
size = rhs.args[4]
# add all previous dynamic allocations
append!(rhs.args, dynamic_allocs)
push!(allocations, stmt)
push!(dynamic_allocs, Expr(:tuple, Tvar, size))
continue
end
end
end
@@ -161,3 +178,26 @@ function transform_cpu(stmts, args)
push!(new_stmts, :(return nothing))
return Expr(:block, new_stmts...)
end

function transform_dynamic(stmts, args)
# 1. Find dynamic allocations
allocators = Expr[]
for stmt in stmts.args
if isexpr(stmt, :(=))
rhs = stmt.args[2]
if isexpr(rhs, :macrocall)
callee = rhs.args[1]
if callee === Symbol("@dynamic_localmem")
# args[2] LineNumberNode
Tvar = rhs.args[3]
size = rhs.args[4]
push!(allocators, Expr(:tuple, Expr(:call, :sizeof, Tvar), size))
continue
end
end
end
end
return Expr(:block, Expr(:tuple, allocators...))
end
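
So the generated allocator evaluates to a tuple with one (sizeof(T), count) pair per @dynamic_localmem in the kernel body. A sketch of what it would return for a kernel that allocates a Float64 buffer sized by the workgroup size plus a fixed 128-element Int buffer (values mirror the check in test/macros.jl):

allocs = ((sizeof(Float64), wksize -> 2*wksize), (sizeof(Int), 128))
bytes, count = allocs[1]
bytes      # == 8
count(32)  # == 64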


58 changes: 58 additions & 0 deletions test/localmem.jl
@@ -0,0 +1,58 @@
using KernelAbstractions
using Test
using CUDAapi
if has_cuda_gpu()
using CuArrays
CuArrays.allowscalar(false)
end

@kernel function dynamic(A)
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @dynamic_localmem eltype(A) (wkrgpsize)->2*wkrgpsize
lmem[2*i] = A[I]
@synchronize
A[I] = lmem[2*i]
end

@kernel function dynamic2(A)
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @dynamic_localmem eltype(A) (wkrgpsize)->2*wkrgpsize
lmem2 = @dynamic_localmem Int (wkrgpsize)->wkrgpsize
lmem2[i] = i
lmem[2*i] = A[I]
@synchronize
A[I] = lmem[2*lmem2[i]]
end

@kernel function dynamic_mixed(A)
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @dynamic_localmem eltype(A) (wkrgpsize)->2*wkrgpsize
lmem2 = @localmem Int groupsize() # Ok iff groupsize is static
lmem2[i] = i
lmem[2*i] = A[I]
@synchronize
A[I] = lmem[2*lmem2[i]]
end


function harness(backend, ArrayT)
A = ArrayT{Float64}(undef, 16, 16)
A .= 1.0
wait(dynamic(backend, 16)(A, ndrange=size(A)))

A = ArrayT{Float64}(undef, 16, 16)
wait(dynamic2(backend, 16)(A, ndrange=size(A)))

A = ArrayT{Float64}(undef, 16, 16)
wait(dynamic_mixed(backend, 16)(A, ndrange=size(A)))
end

@testset "kernels" begin
harness(CPU(), Array)
if has_cuda_gpu()
harness(CUDA(), CuArray)
end
end
26 changes: 26 additions & 0 deletions test/macros.jl
@@ -0,0 +1,26 @@
using KernelAbstractions
using Test

@kernel function nodynamic(A)
I = @index(Global, Linear)
A[I] = I
end

@kernel function dynamic(A)
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @dynamic_localmem eltype(A) (wkrgpsize)->2*wkrgpsize
lmem[2*i] = A[I]
@synchronize
A[I] = lmem[2*i]
end

nodyn_kernel = nodynamic(CPU())
dyn_kernel = dynamic(CPU())

@test KernelAbstractions.allocator(nodyn_kernel.f)(zeros(32, 32)) == ()
let allocs = KernelAbstractions.allocator(dyn_kernel.f)(zeros(32, 32))
bytes, alloc = allocs[1]
@test bytes == sizeof(Float64)
@test alloc(32) == 64
end
8 changes: 8 additions & 0 deletions test/runtests.jl
@@ -5,4 +5,12 @@ using Test
include("test.jl")
end

@testset "Macro" begin
include("macros.jl")
end

@testset "Localmem" begin
include("localmem.jl")
end

include("examples.jl")