diff --git a/Project.toml b/Project.toml
index 088a62ea..e885c99c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,11 +4,11 @@
 authors = ["Frank Hellmann , Michael Lindner
diff --git a/src/gbufs.jl b/src/gbufs.jl
new file mode 100644
--- /dev/null
+++ b/src/gbufs.jl
+abstract type GBufProvider end
+
+####
+#### Eager version
+####
+
+struct EagerGBufProvider{MT,C} <: GBufProvider
+    "mapping e_idx -> [v_src_idx_in_fullflat; v_dst_idx_in_fullflat]"
+    map::MT # input_map[:, e_idx] = [v_src_idx, v_dst_idx]
+    diffcache::C
+end
+
+function EagerGBufProvider(im::IndexManager, batches)
+    map = zeros(Int, ne(im.g) * im.vdepth, 2)
+    for i in 1:ne(im.g)
+        map[im.e_gbufr[i], 1] .= im.e_src[i]
+        map[im.e_gbufr[i], 2] .= im.e_dst[i]
+    end
+
+    N = ForwardDiff.pickchunksize(max(im.lastidx_dynamic, im.lastidx_p))
+    EagerGBufProvider(map, DiffCache(Float64.(map), N))
+end
+
+function get_gbuf(bufp::EagerGBufProvider, u)
+    gbuf = get_tmp(bufp.diffcache, u)
+    NNlib.gather!(gbuf, u, bufp.map)
+    gbuf
+end
+
+Base.@propagate_inbounds function get_src_dst(gbuf::AbstractArray, batch, i)
+    bufr = @views gbuf_range(batch, i)
+    src = @views gbuf[bufr, 1]
+    dst = @views gbuf[bufr, 2]
+    src, dst
+end
+
+####
+#### Lazy version
+####
+
+struct LazyGBufProvider{SM,DM} <: GBufProvider
+    e_src::SM
+    e_dst::DM
+end
+struct LazyGBuf{LBP,UT}
+    lbp::LBP
+    u::UT
+end
+
+LazyGBufProvider(im::IndexManager, _) = LazyGBufProvider(copy(im.e_src), copy(im.e_dst))
+
+function get_gbuf(bufp::LazyGBufProvider, u)
+    LazyGBuf(bufp, u)
+end
+
+Base.@propagate_inbounds function get_src_dst(gbuf::LazyGBuf, batch, i)
+    eidx = batch.indices[i]
+    lbp = gbuf.lbp
+    src = view(gbuf.u, lbp.e_src[eidx])
+    dst = view(gbuf.u, lbp.e_dst[eidx])
+    src, dst
+end
diff --git a/src/network_structure.jl b/src/network_structure.jl
index bb1e9118..2112444b 100644
--- a/src/network_structure.jl
+++ b/src/network_structure.jl
@@ -47,9 +47,9 @@ usebuffer(::Type{<:ExecutionStyle{buffered}}) where {buffered} = buffered
 # check cuda compatibility
 iscudacompatible(x) = iscudacompatible(typeof(x))
 iscudacompatible(::Type{<:ExecutionStyle}) = false
-iscudacompatible(::Type{<:KAExecution{true}}) = true
+iscudacompatible(::Type{<:KAExecution}) = true
 
-struct Network{EX<:ExecutionStyle,G,NL,VTup,MM}
+struct Network{EX<:ExecutionStyle,G,NL,VTup,MM,CT,GBT}
     "vertex batches of same function"
     vertexbatches::VTup
     "network layer"
@@ -57,9 +57,11 @@ struct Network{EX<:ExecutionStyle,G,NL,VTup,MM}
     "index manager"
     im::IndexManager{G}
-    "lazy cache pool"
-    cachepool::LazyBufferCache{typeof(identity),typeof(identity)}
+    "preallocated caches for state and aggregation buffers"
+    caches::@NamedTuple{state::CT,aggregation::CT}
     "mass matrix"
     mass_matrix::MM
+    "gather buffer provider (lazy or eager)"
+    gbufprovider::GBT
 end
 executionstyle(::Network{ex}) where {ex} = ex()
 nvbatches(::Network) = length(vertexbatches)
@@ -83,7 +85,18 @@ Graphs.nv(nw::Network) = nv(nw.im.g)
 Graphs.ne(nw::Network) = ne(nw.im.g)
 Base.broadcastable(nw::Network) = Ref(nw)
 
-struct NetworkLayer{GT,ETup,AF,MT}
+function get_state_cache(nw::Network, T)
+    if eltype(T) <: AbstractFloat && eltype(nw.caches.state.du) != eltype(T)
+        throw(ArgumentError("Network caches are initialized with $(eltype(nw.caches.state.du)) \
+                             but are used for $(eltype(T)) data!"))
+    end
+    get_tmp(nw.caches.state, T)
+end
+get_aggregation_cache(nw::Network, T) = get_tmp(nw.caches.aggregation, T)
+
+iscudacompatible(nw::Network) = iscudacompatible(executionstyle(nw)) && iscudacompatible(nw.layer.aggregator)
+
+struct NetworkLayer{GT,ETup,AF}
     "graph/topology of layer"
     g::GT
     "edge batches with same function"
@@ -94,13 +107,11 @@ struct NetworkLayer{GT,ETup,AF,MT}
     edepth::Int # potential becomes range for multilayer
     "vertex dimensions visible to edges"
     vdepth::Int # potential becomes range for multilayer
-    "mapping e_idx -> [v_src_idx_in_fullflat; v_dst_idx_in_fullflat]"
-    gather_map::MT # input_map[:, e_idx] = [v_src_idx, v_dst_idx]
 end
 
 abstract type ComponentBatch{F} end
 
-struct VertexBatch{T<:VertexFunction,F,IV<:AbstractVector{<:Int}} <: ComponentBatch{T}
+struct VertexBatch{T<:VertexFunction,F,IV<:AbstractVector{<:Integer}} <: ComponentBatch{T}
     "vertex indices contained in batch"
     indices::IV
     "vertex function"
@@ -113,7 +124,7 @@ struct VertexBatch{T<:VertexFunction,F,IV<:AbstractVector{<:Int}} <: ComponentBa
     aggbufstride::BatchStride
 end
 
-struct EdgeBatch{T<:EdgeFunction,F,IV<:AbstractVector{<:Int}} <: ComponentBatch{T}
+struct EdgeBatch{T<:EdgeFunction,F,IV<:AbstractVector{<:Integer}} <: ComponentBatch{T}
     "edge indices (as in edge iterator) contained in batch"
     indices::IV
     "edge function"
diff --git a/test/AD_test.jl b/test/AD_test.jl
index 84364077..7768769f 100644
--- a/test/AD_test.jl
+++ b/test/AD_test.jl
@@ -35,7 +35,7 @@ fp = function(p)
     nw(dx, x0, p, 0.0)
     dx
 end
-# jacobian(fp, AutoEnzyme(), pflat(p0)
+# jacobian(fp, AutoEnzyme(), pflat(p0))
 # jacobian(fp, AutoZygote(), pflat(p0))
 # jacobian(fp, AutoFiniteDifferences(), pflat(p0))
 # jacobian(fp, AutoForwardDiff(), pflat(p0))
@@ -44,7 +44,8 @@ end
 scenarios = [Scenario{:jacobian, :in}(fx, x0; res1=jacobian(fx, AutoFiniteDiff(), x0))
              , Scenario{:jacobian, :in}(fp, pflat(p0); res1=jacobian(fp, AutoFiniteDiff(), pflat(p0)))]
 
-backends = [AutoForwardDiff(), AutoReverseDiff()]
+# backends = [AutoForwardDiff(), AutoReverseDiff()]
+backends = [AutoForwardDiff()]
 test_differentiation(
     backends, # the backends you want to compare
     scenarios, # the scenarios you defined,
diff --git a/test/GPU_test.jl b/test/GPU_test.jl
index 9bb485bc..7a53b23b 100644
--- a/test/GPU_test.jl
+++ b/test/GPU_test.jl
@@ -6,6 +6,7 @@
 using StableRNGs
 using KernelAbstractions
 using Graphs
 using Random
+using Test
 
 (isinteractive() && @__MODULE__()==Main ? includet : include)("ComponentLibrary.jl")
 
 rng = StableRNG(1)
@@ -17,34 +18,67 @@
 ef = [Lib.diffusion_odeedge(),
       Lib.diffusion_edge_fid(),
       Lib.diffusion_odeedge(),
       Lib.diffusion_edge_fid()]
-nw = Network(g, vf, ef; execution=KAExecution{true}(), aggregator=KAAggregator(+))
+nw = Network(g, vf, ef)
+@test_throws ArgumentError adapt(CuArray{Float32}, nw) # wrong execution
+nw = Network(g, vf, ef; execution=KAExecution{true}(), aggregator=KAAggregator(+))
 x0 = rand(rng, dim(nw))
 p = rand(rng, pdim(nw))
 dx = zeros(length(x0))
 nw(dx, x0, p, NaN)
 
 to = CUDABackend()
-nw_d = adapt(to, nw)
-@test nw_d.vertexbatches[1].indices isa CuArray
-@test nw_d.layer.edgebatches[1].indices isa CuArray
-@test nw_d.layer.gather_map isa CuArray
-@test nw_d.layer.aggregator.m.map isa CuArray
-@test nw_d.layer.aggregator.m.symmap isa CuArray
-x0_d = adapt(to, x0)
-p_d = adapt(to, p)
-dx_d = adapt(to, zeros(length(x0)))
-nw_d(dx_d, x0_d, p_d, NaN)
-@test Vector(dx_d) ≈ dx
+@test_throws ArgumentError adapt(to, nw)
+to = :foo
+@test_throws ArgumentError adapt(to, nw)
+to = CuArray([1,2,3])
+@test_throws ArgumentError adapt(to, nw)
+to = cu(rand(3))
+@test_throws ArgumentError adapt(to, nw)
+to = CuArray
+@test_throws ArgumentError adapt(to, nw)
+
+to1 = CuArray{Float32}
+to2 = CuArray{Float64}
+nw1 = adapt(to1, nw)
+nw2 = adapt(to2, nw)
+
+for nw in (nw1, nw2)
+    @test nw.vertexbatches[1].indices isa CuArray{Int}
+    @test nw.layer.edgebatches[1].indices isa CuArray{Int}
+    @test nw.gbufprovider.map isa CuArray{Int}
+    @test nw.layer.aggregator.m.map isa CuArray{Int}
+    @test nw.layer.aggregator.m.symmap isa CuArray{Int}
+end
+
+@test nw1.caches.state.du isa CuArray{Float32}
+@test nw1.caches.aggregation.du isa CuArray{Float32}
+@test nw2.caches.state.du isa CuArray{Float64}
+@test nw2.caches.aggregation.du isa CuArray{Float64}
+
+x0_d1 = adapt(to1, x0)
+p_d1 = adapt(to1, p)
+dx_d1 = adapt(to1, zeros(length(x0)))
+nw1(dx_d1, x0_d1, p_d1, NaN)
+@test Vector(dx_d1) ≈ dx
+@test_throws ArgumentError nw2(dx_d1, x0_d1, p_d1, NaN) # wrong type for cache
+
+x0_d2 = adapt(to2, x0)
+p_d2 = adapt(to2, p)
+dx_d2 = adapt(to2, zeros(length(x0)))
+nw2(dx_d2, x0_d2, p_d2, NaN)
+@test Vector(dx_d2) ≈ dx
+@test_throws ArgumentError nw1(dx_d2, x0_d2, p_d2, NaN) # wrong type for cache
+
 
 # try SparseAggregator
 nw2 = Network(g, vf, ef; execution=KAExecution{true}(), aggregator=SparseAggregator(+))
-nw2_d = adapt(CuArray, nw2)
+nw2_d = adapt(CuArray{Float32}, nw2)
 @test nw2_d.layer.aggregator.m isa CuSparseMatrixCSC
 
-fill!(dx_d, 0)
-nw2_d(dx_d, x0_d, p_d, NaN)
-@test Vector(dx_d) ≈ dx
+fill!(dx_d1, 0)
+nw2_d(dx_d1, x0_d1, p_d1, NaN)
+@test Vector(dx_d1) ≈ dx
 
 # mini benchmark
diff --git a/test/construction_test.jl b/test/construction_test.jl
index d534fb7f..881c670f 100644
--- a/test/construction_test.jl
+++ b/test/construction_test.jl
@@ -124,14 +124,14 @@ end
     @test nd.mass_matrix == I && nd.mass_matrix isa UniformScaling
 end
 
-@testset "gbuf map construction" begin
+@testset "eager gbuf map construction" begin
    using NetworkDynamics: gbuf_range
    e1 = StaticEdge(x->x^1, 1, 0, AntiSymmetric())
    e2 = StaticEdge(x->x^1, 1, 0, AntiSymmetric())
    v = ODEVertex(x->x^1, 1, 0)
    g = path_graph(4)
    nd = Network(g, v, [e1,e2,e1])
-   map = nd.layer.gather_map
+   map = nd.gbufprovider.map
    for batch in nd.layer.edgebatches
        for batch_subi in 1:length(batch)
            eidx = batch.indices[batch_subi]
diff --git a/test/testutils.jl b/test/testutils.jl
index 28fc5c16..fa91e2de 100644
--- a/test/testutils.jl
+++ b/test/testutils.jl
@@ -64,7 +64,7 @@ function test_execution_styles(prob)
     end
 
     if CUDA.functional()
-        to = CuArray
+        to = CuArray{Float64}
         u_d = adapt(to, u)
         p_d = adapt(to, p)
 
@@ -78,7 +78,7 @@ function test_execution_styles(prob)
        _nw_d(_du_d, u_d, p_d, t)
        issame = isapprox(Vector(_du_d), du; atol=1e-10)
        if !issame
-           println("CUDA execution lead to different results: extrema(Δ) = $(extrema(_du - du))")
+           println("CUDA execution led to different results: extrema(Δ) = $(extrema(Vector(_du_d) - du))")
        end
        @test issame
     end
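
For context: the diff defines `get_gbuf`/`get_src_dst` for both providers, but their call site (the buffered core loop) is outside this excerpt. Below is a minimal sketch of the intended call pattern, assuming only the names introduced above; the loop itself and the name `sketch_edge_iteration` are illustrative, not the actual implementation.

```julia
# Sketch only: how a buffered execution pass might consume a GBufProvider.
# `get_gbuf` and `get_src_dst` are the API from this diff; the rest is
# illustrative. Eager: get_gbuf gathers all vertex data into a dense buffer
# via NNlib.gather! (CuArray-compatible). Lazy: it returns a LazyGBuf
# wrapper and get_src_dst takes views into u on demand.
function sketch_edge_iteration(nw, u)
    gbuf = get_gbuf(nw.gbufprovider, u)
    for batch in nw.layer.edgebatches
        for i in 1:length(batch)
            # src/dst are the vertex states seen by edge i of this batch
            src, dst = get_src_dst(gbuf, batch, i)
            # ... evaluate the edge function of `batch` on (src, dst) ...
        end
    end
end
```

This split is also what the GPU tests exercise: `adapt` now requires a concrete element type (`CuArray{Float32}` or `CuArray{Float64}`) so that the eager provider's `DiffCache` and the state/aggregation caches are allocated with a fixed eltype, and a network adapted to one eltype rejects inputs of the other at call time.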