From d61a2aa623630f2bc697a32d29c217e908ea8f21 Mon Sep 17 00:00:00 2001 From: PentaKon Date: Wed, 31 Jan 2024 00:29:00 +0200 Subject: [PATCH 01/25] First commit for global scan implementation. Ported most of the code but not working yet --- .../nbl/builtin/hlsl/glsl_compat/core.hlsl | 5 + .../nbl/builtin/hlsl/scan/declarations.hlsl | 75 ++++++----- .../builtin/hlsl/scan/default_scheduler.hlsl | 106 +++++++--------- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 116 +++++++++++++++++- include/nbl/builtin/hlsl/scan/direct.hlsl | 53 ++++---- include/nbl/builtin/hlsl/scan/indirect.hlsl | 34 ++--- .../builtin/hlsl/scan/parameters_struct.hlsl | 30 ----- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 42 +++---- 8 files changed, 256 insertions(+), 205 deletions(-) delete mode 100644 include/nbl/builtin/hlsl/scan/parameters_struct.hlsl diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 3cb34d6f16..b08073beae 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -83,6 +83,11 @@ void tess_ctrl_barrier() { void memoryBarrierShared() { spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask); } + +void memoryBarrierBuffer() { + spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsUniformMemoryMask); +} + #endif } diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 2d2e66e66d..ec3e558dfc 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -1,66 +1,63 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ #define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ // REVIEW: Not sure if this file is needed in HLSL implementation -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#ifndef NBL_BUILTIN_MAX_SCAN_LEVELS +#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 +#endif -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ namespace nbl { namespace hlsl { namespace scan { + // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. 
(https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) + struct Parameters_t { + uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; + uint32_t topLevel; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; + } + Parameters_t getParameters(); -} -} -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ -#endif -#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ + struct DefaultSchedulerParameters_t + { + uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; + uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; + + }; + + DefaultSchedulerParameters_t getSchedulerParameters(); + template void getData( - inout Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel ); -} -} -} -#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -#endif -#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ template void setData( - in Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel, - in bool inRange + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange ); + } } } -#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -#endif #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 450368475d..67c62aadae 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -1,31 +1,13 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ #define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - struct DefaultSchedulerParameters_t - { - uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; - uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - - }; -} -} -} - -#ifdef __cplusplus -#undef uint -#else +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" namespace nbl { @@ -33,6 +15,8 @@ namespace hlsl { namespace scan { + +#ifdef HLSL namespace scheduler { /** @@ -58,17 +42,17 @@ namespace scheduler * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps * synchronizing between workgroups with while-loop spinning. 
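	 *
	 * A rough worked example (illustrative only, assuming a workgroup size of 512 so that workgroupSizeLog2 = 9) for elementCount = 300000:
	 *   lastElement              = { 299999, 585, 1 }            (each level is the previous one shifted right by workgroupSizeLog2)
	 *   topLevel                 = firstbithigh(299999)/9 = 18/9 = 2
	 *   workgroups per level     = 586, 2, 1                     (WorkgroupCount(level) = lastElement[level+1]+1, with a single workgroup at the top level)
	 *   cumulativeWorkgroupCount = { 586, 588, 589, 591, 1177 }  (running total over the upsweep, top and mirrored downsweep levels, assuming the downsweep repeats 2 and 586)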
*/ - void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) + void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) { #define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) + const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x); _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime? + _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; - for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_; + const uint32_t next = i+1; + _scanParams.lastElement[next] = _scanParams.lastElement[i]>>workgroupSizeLog2; i = next; } _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); @@ -130,34 +114,31 @@ namespace scheduler * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ - template - bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex) + template + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) { - ScratchAccessor sharedScratch; if(SubgroupContiguousIndex() == 0u) { - uint64_t original; - InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared - sharedScratch.set(SubgroupContiguousIndex(), original); + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + sharedScratch.set(0u, originalWGs); } else if (SubgroupContiguousIndex() == 1u) { - sharedScratch.set(SubgroupContiguousIndex(), 0u); + sharedScratch.set(1u, 0u); } - GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology? - - const uint globalWorkgroupIndex; // does every thread need to know? - sharedScratch.get(0u, globalWorkgroupIndex); - const uint lastLevel = topLevel<<1u; + sharedScratch.workgroupExecutionAndMemoryBarrier(); + + const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? + const uint32_t lastLevel = topLevel<<1u; if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) { - InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ? + sharedScratch.atomicAdd(1u, 1u); } - GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor? 
+ sharedScratch.workgroupExecutionAndMemoryBarrier(); - sharedScratch.get(1u, treeLevel); + treeLevel = sharedScratch.get(1u); if(treeLevel>lastLevel) - return true; + return true; localWorkgroupIndex = globalWorkgroupIndex; const bool dependentLevel = treeLevel != 0u; @@ -170,52 +151,55 @@ namespace scheduler uint dependentsCount = 1u; if(treeLevel <= topLevel) { - dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file? + dependentsCount = glsl::gl_WorkGroupSize().x; const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; if (lastWorkgroup) { - const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader + const Parameters_t scanParams = getParameters(); dependentsCount = scanParams.lastElement[treeLevel]+1u; if (treeLeveltopLevel) // !(prevLevel globallycoherent \ No newline at end of file +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +// coherent -> globallycoherent + +// (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +#ifndef SCRATCH_SIZE +#error "Must manually define SCRATCH_SIZE for now" +#endif + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scratch +{ + uint32_t workgroupsStarted; + uint32_t data[scratchElementCount]; +}; + +[[vk::binding(0 ,0)]] StructuredBuffer scanInputBuf; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model + +template +void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel +) +{ + const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders + + uint32_t offset = levelInvocationIndex; + const bool notFirstOrLastLevel = bool(pseudoLevel); + if (notFirstOrLastLevel) + offset += params.temporaryStorageOffset[pseudoLevel-1u]; + + if (pseudoLevel!=treeLevel) // downsweep + { + const bool firstInvocationInGroup = SubgroupContiguousIndex()==0u; + if (bool(localWorkgroupIndex) && firstInvocationInGroup) + data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; + + if (notFirstOrLastLevel) + { + if (!firstInvocationInGroup) + data = scanScratchBuf[0].data[offset-1u]; + } + else + { + if(isExclusive) + { + if (!firstInvocationInGroup) + data += scanInputBuf[offset-1u]; + } + else + { + data += scanInputBuf[offset]; + } + } + } + else + { + if (notFirstOrLastLevel) + data = scanScratchBuf[0].data[offset]; + else + data = scanInputBuf[offset]; + } +} + +template +void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange +) +{ + const Parameters_t params = getParameters(); + if (treeLevel - void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex) + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, 
NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); - const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1); - - const uint lastLevel = params.topLevel << 1u; - const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel]; + const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + SubgroupContiguousIndex(); + const bool lastInvocationInGroup = SubgroupContiguousIndex() == (gl_WorkGroupSize().x - 1); + const uint32_t lastLevel = params.topLevel << 1u; + const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; - Storage_t data = Binop::identity(); + Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); @@ -36,17 +34,19 @@ namespace scan if(treeLevel < params.topLevel) { - #error "Must also define some scratch accessor when calling operation()" - data = workgroup::reduction()(data); + data = workgroup::reduction::template __call(value,accessor); } - // REVIEW: missing _TYPE_ check and extra case here + else if (!isExclusive && params.topLevel == 0u) + { + data = workgroup::inclusive_scan::template __call(value,accessor); + } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan()(data); + data = workgroup::inclusive_scan::template __call(value,accessor); } else { - data = workgroup::exclusive_scan()(data); + data = workgroup::exclusive_scan::template __call(value,accessor); } setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); } @@ -54,8 +54,6 @@ namespace scan } } -#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored? -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" namespace nbl { namespace hlsl @@ -66,11 +64,13 @@ namespace scan void main() { const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint topLevel = getParameters().topLevel; + const uint32_t topLevel = getParameters().topLevel; // persistent workgroups while (true) { - uint treeLevel,localWorkgroupIndex; + // REVIEW: Need to create accessor here. + // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
+ uint32_t treeLevel,localWorkgroupIndex; if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) { return; @@ -84,9 +84,5 @@ namespace scan } } } -#endif - -#define _NBL_HLSL_SCAN_MAIN_DEFINED_ -#endif #endif \ No newline at end of file From 0eec0ee44b9661fd3317e4968a33b5d88fd372bb Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 12 Feb 2024 01:17:25 +0200 Subject: [PATCH 02/25] Global scan migration of code to new APIs --- .../builtin/hlsl/scan/default_scheduler.hlsl | 2 +- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 17 ++---- include/nbl/builtin/hlsl/scan/direct.hlsl | 49 ++++++++++++++- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 29 ++++----- include/nbl/video/utilities/CScanner.h | 34 +++++------ src/nbl/video/utilities/CScanner.cpp | 61 ++++++++++++++----- 6 files changed, 130 insertions(+), 62 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 67c62aadae..5ec112b065 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -115,7 +115,7 @@ namespace scheduler * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ template - bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) { if(SubgroupContiguousIndex() == 0u) { diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 0adee5dc32..169257e30a 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -10,11 +10,6 @@ // coherent -> globallycoherent -// (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... -#ifndef SCRATCH_SIZE -#error "Must manually define SCRATCH_SIZE for now" -#endif - namespace nbl { namespace hlsl @@ -22,14 +17,14 @@ namespace hlsl namespace scan { -template +template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... struct Scratch { uint32_t workgroupsStarted; uint32_t data[scratchElementCount]; }; -[[vk::binding(0 ,0)]] StructuredBuffer scanInputBuf; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) 
[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model template @@ -64,11 +59,11 @@ void getData( if(isExclusive) { if (!firstInvocationInGroup) - data += scanInputBuf[offset-1u]; + data += scanBuffer[offset-1u]; } else { - data += scanInputBuf[offset]; + data += scanBuffer[offset]; } } } @@ -77,7 +72,7 @@ void getData( if (notFirstOrLastLevel) data = scanScratchBuf[0].data[offset]; else - data = scanInputBuf[offset]; + data = scanBuffer[offset]; } } @@ -106,7 +101,7 @@ void setData( scanScratchBuf[0].data[levelInvocationIndex+offset] = data; } else - scanInputBuf[levelInvocationIndex] = data; + scanBuffer[levelInvocationIndex] = data; } } diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl index cb2e0ce078..76003d8e4a 100644 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ b/include/nbl/builtin/hlsl/scan/direct.hlsl @@ -1,9 +1,45 @@ // Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#pragma shader_stage(compute) + +#include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" #include "nbl/builtin/hlsl/scan/declarations.hlsl" +static const uint32_t SharedScratchSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; + +// TODO: Can we make it a static variable? +groupshared uint32_t wgScratch[SharedScratchSz]; + +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" + +template +struct WGScratchProxy +{ + uint32_t get(const uint32_t ix) + { + return wgScratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + wgScratch[ix+offset] = value; + } + + uint32_t atomicAdd(uint32_t ix, uint32_t val) + { + return glsl::atomicAdd(wgScratch[ix + offset], val); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; +static WGScratchProxy<0> accessor; + // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} @@ -16,6 +52,17 @@ struct ScanPushConstants [[vk::push_constant]] ScanPushConstants spc; +/** + * Required since we rely on SubgroupContiguousIndex instead of + * gl_LocalInvocationIndex which means to match the global index + * we can't use the gl_GlobalInvocationID but an index based on + * SubgroupContiguousIndex. 
+ */ +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + namespace nbl { namespace hlsl @@ -39,5 +86,5 @@ DefaultSchedulerParameters_t getSchedulerParameters() [numthreads(WORKGROUP_SIZE,1,1)] void main() { - nbl::hlsl::scan::main(); + nbl::hlsl::scan::main(accessor); } \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 3563f9546e..a8319f4366 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -15,7 +15,7 @@ namespace hlsl { namespace scan { - template + template void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); @@ -26,6 +26,9 @@ namespace scan const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel + // so that it exits after reaching the top? + Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { @@ -34,34 +37,26 @@ namespace scan if(treeLevel < params.topLevel) { - data = workgroup::reduction::template __call(value,accessor); + data = workgroup::reduction::template __call(data,accessor); } else if (!isExclusive && params.topLevel == 0u) { - data = workgroup::inclusive_scan::template __call(value,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan::template __call(value,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else { - data = workgroup::exclusive_scan::template __call(value,accessor); + data = workgroup::exclusive_scan::template __call(data,accessor); } setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); } -} -} -} -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - void main() + template + void main(NBL_REF_ARG(Accessor) accessor) { const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); const uint32_t topLevel = getParameters().topLevel; @@ -71,12 +66,12 @@ namespace scan // REVIEW: Need to create accessor here. // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
uint32_t treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) + if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) { return; } - virtualWorkgroup(treeLevel,localWorkgroupIndex); + virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); } diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 2ae67163eb..9f1f75b117 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -9,14 +9,12 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/IDescriptorSetCache.h" +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS & 0x1, "NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); namespace nbl::video { -#include "nbl/builtin/glsl/scan/parameters_struct.glsl" -#include "nbl/builtin/glsl/scan/default_scheduler.glsl" -static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS&0x1,"NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); - /** Utility class to help you perform the equivalent of `std::inclusive_scan` and `std::exclusive_scan` with data on the GPU. @@ -150,9 +148,9 @@ class CScanner final : public core::IReferenceCounted enum E_SCAN_TYPE : uint8_t { // computes output[n] = Sum_{i<=n}(input[i]) - EST_INCLUSIVE = _NBL_GLSL_SCAN_TYPE_INCLUSIVE_, + EST_INCLUSIVE = 0u, // computes output[n] = Sum_{i(getDefaultShader(scanType,dataType,op)); - cpuShader->setFilePathHint("nbl/builtin/glsl/scan/direct.comp"); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); auto gpushader = m_device->createShader(std::move(cpuShader)); m_specialized_shaders[scanType][dataType][op] = m_device->createSpecializedShader( gpushader.get(),{nullptr,nullptr,"main"}); - // , asset::IShader::ESS_COMPUTE, "nbl/builtin/glsl/scan/direct.comp" + // , asset::IShader::ESS_COMPUTE, "nbl/builtin/hlsl/scan/direct.hlsl" } return m_specialized_shaders[scanType][dataType][op].get(); } diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index ec31272358..6fb88074b6 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -3,11 +3,10 @@ using namespace nbl; using namespace video; -#if 0 // TODO: port core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const { auto system = m_device->getPhysicalDevice()->getSystem(); - core::smart_refctd_ptr glsl; + core::smart_refctd_ptr hlsl; { auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr { @@ -19,24 +18,59 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi }; if(indirect) - glsl = loadBuiltinData("nbl/builtin/glsl/scan/indirect.comp"); + hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect.hlsl"); else - glsl = loadBuiltinData("nbl/builtin/glsl/scan/direct.comp"); + hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/direct.hlsl"); } - auto buffer = core::make_smart_refctd_ptr(glsl->getSize()); - memcpy(buffer->getPointer(), glsl->getMappedPointer(), glsl->getSize()); - auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????"); + auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); + memcpy(buffer->getPointer(), hlsl->getMappedPointer(), 
hlsl->getSize()); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + + // (REVIEW): All of the below, that rely on enumerations, should probably be changed to take advantage of HLSL-CPP compatibility. + // Issue is that CScanner caches all shaders for all permutations of scanType-dataType-operator and it's not clear how to + // do this without enums const char* storageType = nullptr; switch (dataType) { case EDT_UINT: - storageType = "uint"; + storageType = "uint32_t"; break; case EDT_INT: - storageType = "int"; + storageType = "int32_t"; break; case EDT_FLOAT: - storageType = "float"; + storageType = "float32_t"; + break; + default: + assert(false); + break; + } + + bool isExclusive = scanType == EST_EXCLUSIVE; + + const char* binop = nullptr; + switch (op) + { + case EO_AND: + binop = "bit_and"; + break; + case EO_OR: + binop = "bit_or"; + break; + case EO_XOR: + binop = "bit_xor"; + break; + case EO_ADD: + binop = "plus"; + break; + case EO_MUL: + binop = "multiplies"; + break; + case EO_MAX: + binop = "maximum"; + break; + case EO_MIN: + binop = "minimum"; break; default: assert(false); @@ -45,8 +79,7 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi return asset::CGLSLCompiler::createOverridenCopy( cpushader.get(), - "#define _NBL_GLSL_WORKGROUP_SIZE_ %d\n#define _NBL_GLSL_WORKGROUP_SIZE_LOG2_ %d\n#define _NBL_GLSL_SCAN_TYPE_ %d\n#define _NBL_GLSL_SCAN_STORAGE_TYPE_ %s\n#define _NBL_GLSL_SCAN_BIN_OP_ %d\n", - m_workgroupSize,hlsl::findMSB(m_workgroupSize),uint32_t(scanType),storageType,uint32_t(op) + "#define WORKGROUP_SIZE %d\nconst bool isExclusive = %b\ntypedef storageType %s\n#define BINOP nbl::hlsl::%s\n", + m_workgroupSize,isExclusive,storageType,binop ); -} -#endif \ No newline at end of file +} \ No newline at end of file From 2df4ad7e374ab34c94b3e4dda279f33885d51758 Mon Sep 17 00:00:00 2001 From: PentaKon Date: Wed, 31 Jan 2024 00:29:00 +0200 Subject: [PATCH 03/25] First commit for global scan implementation. 
Ported most of the code but not working yet --- .../nbl/builtin/hlsl/glsl_compat/core.hlsl | 4 + .../nbl/builtin/hlsl/scan/declarations.hlsl | 75 ++++++----- .../builtin/hlsl/scan/default_scheduler.hlsl | 106 +++++++--------- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 116 +++++++++++++++++- include/nbl/builtin/hlsl/scan/direct.hlsl | 53 ++++---- include/nbl/builtin/hlsl/scan/indirect.hlsl | 34 ++--- .../builtin/hlsl/scan/parameters_struct.hlsl | 30 ----- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 42 +++---- 8 files changed, 255 insertions(+), 205 deletions(-) delete mode 100644 include/nbl/builtin/hlsl/scan/parameters_struct.hlsl diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 6f5d290dc3..faa45c8ed4 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -85,6 +85,10 @@ void memoryBarrierShared() { spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask); } +void memoryBarrierBuffer() { + spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsUniformMemoryMask); +} + namespace impl { diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 2d2e66e66d..ec3e558dfc 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -1,66 +1,63 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ #define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ // REVIEW: Not sure if this file is needed in HLSL implementation -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#ifndef NBL_BUILTIN_MAX_SCAN_LEVELS +#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 +#endif -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ namespace nbl { namespace hlsl { namespace scan { + // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. 
(https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) + struct Parameters_t { + uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; + uint32_t topLevel; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; + } + Parameters_t getParameters(); -} -} -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ -#endif -#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ + struct DefaultSchedulerParameters_t + { + uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; + uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; + + }; + + DefaultSchedulerParameters_t getSchedulerParameters(); + template void getData( - inout Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel ); -} -} -} -#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -#endif -#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ template void setData( - in Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel, - in bool inRange + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange ); + } } } -#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -#endif #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 450368475d..67c62aadae 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -1,31 +1,13 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ #define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - struct DefaultSchedulerParameters_t - { - uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; - uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - - }; -} -} -} - -#ifdef __cplusplus -#undef uint -#else +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" namespace nbl { @@ -33,6 +15,8 @@ namespace hlsl { namespace scan { + +#ifdef HLSL namespace scheduler { /** @@ -58,17 +42,17 @@ namespace scheduler * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps * synchronizing between workgroups with while-loop spinning. 
*/ - void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) + void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) { #define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) + const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x); _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime? + _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; - for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_; + const uint32_t next = i+1; + _scanParams.lastElement[next] = _scanParams.lastElement[i]>>workgroupSizeLog2; i = next; } _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); @@ -130,34 +114,31 @@ namespace scheduler * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ - template - bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex) + template + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) { - ScratchAccessor sharedScratch; if(SubgroupContiguousIndex() == 0u) { - uint64_t original; - InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared - sharedScratch.set(SubgroupContiguousIndex(), original); + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + sharedScratch.set(0u, originalWGs); } else if (SubgroupContiguousIndex() == 1u) { - sharedScratch.set(SubgroupContiguousIndex(), 0u); + sharedScratch.set(1u, 0u); } - GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology? - - const uint globalWorkgroupIndex; // does every thread need to know? - sharedScratch.get(0u, globalWorkgroupIndex); - const uint lastLevel = topLevel<<1u; + sharedScratch.workgroupExecutionAndMemoryBarrier(); + + const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? + const uint32_t lastLevel = topLevel<<1u; if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) { - InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ? + sharedScratch.atomicAdd(1u, 1u); } - GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor? 
+ sharedScratch.workgroupExecutionAndMemoryBarrier(); - sharedScratch.get(1u, treeLevel); + treeLevel = sharedScratch.get(1u); if(treeLevel>lastLevel) - return true; + return true; localWorkgroupIndex = globalWorkgroupIndex; const bool dependentLevel = treeLevel != 0u; @@ -170,52 +151,55 @@ namespace scheduler uint dependentsCount = 1u; if(treeLevel <= topLevel) { - dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file? + dependentsCount = glsl::gl_WorkGroupSize().x; const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; if (lastWorkgroup) { - const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader + const Parameters_t scanParams = getParameters(); dependentsCount = scanParams.lastElement[treeLevel]+1u; if (treeLeveltopLevel) // !(prevLevel globallycoherent \ No newline at end of file +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +// coherent -> globallycoherent + +// (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +#ifndef SCRATCH_SIZE +#error "Must manually define SCRATCH_SIZE for now" +#endif + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scratch +{ + uint32_t workgroupsStarted; + uint32_t data[scratchElementCount]; +}; + +[[vk::binding(0 ,0)]] StructuredBuffer scanInputBuf; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model + +template +void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel +) +{ + const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders + + uint32_t offset = levelInvocationIndex; + const bool notFirstOrLastLevel = bool(pseudoLevel); + if (notFirstOrLastLevel) + offset += params.temporaryStorageOffset[pseudoLevel-1u]; + + if (pseudoLevel!=treeLevel) // downsweep + { + const bool firstInvocationInGroup = SubgroupContiguousIndex()==0u; + if (bool(localWorkgroupIndex) && firstInvocationInGroup) + data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; + + if (notFirstOrLastLevel) + { + if (!firstInvocationInGroup) + data = scanScratchBuf[0].data[offset-1u]; + } + else + { + if(isExclusive) + { + if (!firstInvocationInGroup) + data += scanInputBuf[offset-1u]; + } + else + { + data += scanInputBuf[offset]; + } + } + } + else + { + if (notFirstOrLastLevel) + data = scanScratchBuf[0].data[offset]; + else + data = scanInputBuf[offset]; + } +} + +template +void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange +) +{ + const Parameters_t params = getParameters(); + if (treeLevel - void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex) + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, 
NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); - const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1); - - const uint lastLevel = params.topLevel << 1u; - const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel]; + const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + SubgroupContiguousIndex(); + const bool lastInvocationInGroup = SubgroupContiguousIndex() == (gl_WorkGroupSize().x - 1); + const uint32_t lastLevel = params.topLevel << 1u; + const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; - Storage_t data = Binop::identity(); + Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); @@ -36,17 +34,19 @@ namespace scan if(treeLevel < params.topLevel) { - #error "Must also define some scratch accessor when calling operation()" - data = workgroup::reduction()(data); + data = workgroup::reduction::template __call(value,accessor); } - // REVIEW: missing _TYPE_ check and extra case here + else if (!isExclusive && params.topLevel == 0u) + { + data = workgroup::inclusive_scan::template __call(value,accessor); + } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan()(data); + data = workgroup::inclusive_scan::template __call(value,accessor); } else { - data = workgroup::exclusive_scan()(data); + data = workgroup::exclusive_scan::template __call(value,accessor); } setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); } @@ -54,8 +54,6 @@ namespace scan } } -#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored? -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" namespace nbl { namespace hlsl @@ -66,11 +64,13 @@ namespace scan void main() { const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint topLevel = getParameters().topLevel; + const uint32_t topLevel = getParameters().topLevel; // persistent workgroups while (true) { - uint treeLevel,localWorkgroupIndex; + // REVIEW: Need to create accessor here. + // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
+ uint32_t treeLevel,localWorkgroupIndex; if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) { return; @@ -84,9 +84,5 @@ namespace scan } } } -#endif - -#define _NBL_HLSL_SCAN_MAIN_DEFINED_ -#endif #endif \ No newline at end of file From 20a60f5fa4af48be81a511ed7db9ee5b6f106a89 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 12 Feb 2024 01:17:25 +0200 Subject: [PATCH 04/25] Global scan migration of code to new APIs --- .../builtin/hlsl/scan/default_scheduler.hlsl | 2 +- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 17 ++---- include/nbl/builtin/hlsl/scan/direct.hlsl | 49 ++++++++++++++- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 29 ++++----- include/nbl/video/utilities/CScanner.h | 34 +++++------ src/nbl/video/utilities/CScanner.cpp | 61 ++++++++++++++----- 6 files changed, 130 insertions(+), 62 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 67c62aadae..5ec112b065 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -115,7 +115,7 @@ namespace scheduler * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ template - bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) { if(SubgroupContiguousIndex() == 0u) { diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 0adee5dc32..169257e30a 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -10,11 +10,6 @@ // coherent -> globallycoherent -// (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... -#ifndef SCRATCH_SIZE -#error "Must manually define SCRATCH_SIZE for now" -#endif - namespace nbl { namespace hlsl @@ -22,14 +17,14 @@ namespace hlsl namespace scan { -template +template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... struct Scratch { uint32_t workgroupsStarted; uint32_t data[scratchElementCount]; }; -[[vk::binding(0 ,0)]] StructuredBuffer scanInputBuf; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) 
[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model template @@ -64,11 +59,11 @@ void getData( if(isExclusive) { if (!firstInvocationInGroup) - data += scanInputBuf[offset-1u]; + data += scanBuffer[offset-1u]; } else { - data += scanInputBuf[offset]; + data += scanBuffer[offset]; } } } @@ -77,7 +72,7 @@ void getData( if (notFirstOrLastLevel) data = scanScratchBuf[0].data[offset]; else - data = scanInputBuf[offset]; + data = scanBuffer[offset]; } } @@ -106,7 +101,7 @@ void setData( scanScratchBuf[0].data[levelInvocationIndex+offset] = data; } else - scanInputBuf[levelInvocationIndex] = data; + scanBuffer[levelInvocationIndex] = data; } } diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl index cb2e0ce078..76003d8e4a 100644 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ b/include/nbl/builtin/hlsl/scan/direct.hlsl @@ -1,9 +1,45 @@ // Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#pragma shader_stage(compute) + +#include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" #include "nbl/builtin/hlsl/scan/declarations.hlsl" +static const uint32_t SharedScratchSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; + +// TODO: Can we make it a static variable? +groupshared uint32_t wgScratch[SharedScratchSz]; + +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" + +template +struct WGScratchProxy +{ + uint32_t get(const uint32_t ix) + { + return wgScratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + wgScratch[ix+offset] = value; + } + + uint32_t atomicAdd(uint32_t ix, uint32_t val) + { + return glsl::atomicAdd(wgScratch[ix + offset], val); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; +static WGScratchProxy<0> accessor; + // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} @@ -16,6 +52,17 @@ struct ScanPushConstants [[vk::push_constant]] ScanPushConstants spc; +/** + * Required since we rely on SubgroupContiguousIndex instead of + * gl_LocalInvocationIndex which means to match the global index + * we can't use the gl_GlobalInvocationID but an index based on + * SubgroupContiguousIndex. 
+ */ +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + namespace nbl { namespace hlsl @@ -39,5 +86,5 @@ DefaultSchedulerParameters_t getSchedulerParameters() [numthreads(WORKGROUP_SIZE,1,1)] void main() { - nbl::hlsl::scan::main(); + nbl::hlsl::scan::main(accessor); } \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 3563f9546e..a8319f4366 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -15,7 +15,7 @@ namespace hlsl { namespace scan { - template + template void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); @@ -26,6 +26,9 @@ namespace scan const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel + // so that it exits after reaching the top? + Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { @@ -34,34 +37,26 @@ namespace scan if(treeLevel < params.topLevel) { - data = workgroup::reduction::template __call(value,accessor); + data = workgroup::reduction::template __call(data,accessor); } else if (!isExclusive && params.topLevel == 0u) { - data = workgroup::inclusive_scan::template __call(value,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan::template __call(value,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else { - data = workgroup::exclusive_scan::template __call(value,accessor); + data = workgroup::exclusive_scan::template __call(data,accessor); } setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); } -} -} -} -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - void main() + template + void main(NBL_REF_ARG(Accessor) accessor) { const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); const uint32_t topLevel = getParameters().topLevel; @@ -71,12 +66,12 @@ namespace scan // REVIEW: Need to create accessor here. // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
uint32_t treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) + if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) { return; } - virtualWorkgroup(treeLevel,localWorkgroupIndex); + virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); } diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 2ae67163eb..9f1f75b117 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -9,14 +9,12 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/IDescriptorSetCache.h" +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS & 0x1, "NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); namespace nbl::video { -#include "nbl/builtin/glsl/scan/parameters_struct.glsl" -#include "nbl/builtin/glsl/scan/default_scheduler.glsl" -static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS&0x1,"NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); - /** Utility class to help you perform the equivalent of `std::inclusive_scan` and `std::exclusive_scan` with data on the GPU. @@ -150,9 +148,9 @@ class CScanner final : public core::IReferenceCounted enum E_SCAN_TYPE : uint8_t { // computes output[n] = Sum_{i<=n}(input[i]) - EST_INCLUSIVE = _NBL_GLSL_SCAN_TYPE_INCLUSIVE_, + EST_INCLUSIVE = 0u, // computes output[n] = Sum_{i(getDefaultShader(scanType,dataType,op)); - cpuShader->setFilePathHint("nbl/builtin/glsl/scan/direct.comp"); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); auto gpushader = m_device->createShader(std::move(cpuShader)); m_specialized_shaders[scanType][dataType][op] = m_device->createSpecializedShader( gpushader.get(),{nullptr,nullptr,"main"}); - // , asset::IShader::ESS_COMPUTE, "nbl/builtin/glsl/scan/direct.comp" + // , asset::IShader::ESS_COMPUTE, "nbl/builtin/hlsl/scan/direct.hlsl" } return m_specialized_shaders[scanType][dataType][op].get(); } diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index ec31272358..6fb88074b6 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -3,11 +3,10 @@ using namespace nbl; using namespace video; -#if 0 // TODO: port core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const { auto system = m_device->getPhysicalDevice()->getSystem(); - core::smart_refctd_ptr glsl; + core::smart_refctd_ptr hlsl; { auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr { @@ -19,24 +18,59 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi }; if(indirect) - glsl = loadBuiltinData("nbl/builtin/glsl/scan/indirect.comp"); + hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect.hlsl"); else - glsl = loadBuiltinData("nbl/builtin/glsl/scan/direct.comp"); + hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/direct.hlsl"); } - auto buffer = core::make_smart_refctd_ptr(glsl->getSize()); - memcpy(buffer->getPointer(), glsl->getMappedPointer(), glsl->getSize()); - auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????"); + auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); + memcpy(buffer->getPointer(), hlsl->getMappedPointer(), 
hlsl->getSize()); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + + // (REVIEW): All of the below, that rely on enumerations, should probably be changed to take advantage of HLSL-CPP compatibility. + // Issue is that CScanner caches all shaders for all permutations of scanType-dataType-operator and it's not clear how to + // do this without enums const char* storageType = nullptr; switch (dataType) { case EDT_UINT: - storageType = "uint"; + storageType = "uint32_t"; break; case EDT_INT: - storageType = "int"; + storageType = "int32_t"; break; case EDT_FLOAT: - storageType = "float"; + storageType = "float32_t"; + break; + default: + assert(false); + break; + } + + bool isExclusive = scanType == EST_EXCLUSIVE; + + const char* binop = nullptr; + switch (op) + { + case EO_AND: + binop = "bit_and"; + break; + case EO_OR: + binop = "bit_or"; + break; + case EO_XOR: + binop = "bit_xor"; + break; + case EO_ADD: + binop = "plus"; + break; + case EO_MUL: + binop = "multiplies"; + break; + case EO_MAX: + binop = "maximum"; + break; + case EO_MIN: + binop = "minimum"; break; default: assert(false); @@ -45,8 +79,7 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi return asset::CGLSLCompiler::createOverridenCopy( cpushader.get(), - "#define _NBL_GLSL_WORKGROUP_SIZE_ %d\n#define _NBL_GLSL_WORKGROUP_SIZE_LOG2_ %d\n#define _NBL_GLSL_SCAN_TYPE_ %d\n#define _NBL_GLSL_SCAN_STORAGE_TYPE_ %s\n#define _NBL_GLSL_SCAN_BIN_OP_ %d\n", - m_workgroupSize,hlsl::findMSB(m_workgroupSize),uint32_t(scanType),storageType,uint32_t(op) + "#define WORKGROUP_SIZE %d\nconst bool isExclusive = %b\ntypedef storageType %s\n#define BINOP nbl::hlsl::%s\n", + m_workgroupSize,isExclusive,storageType,binop ); -} -#endif \ No newline at end of file +} \ No newline at end of file From 7117cc31e061392025484d8d856baadbd0c28037 Mon Sep 17 00:00:00 2001 From: PentaKon Date: Sun, 31 Mar 2024 14:26:04 +0300 Subject: [PATCH 05/25] Update CScanner.h to new NBL API --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 2 +- include/nbl/video/utilities/CScanner.h | 49 +++++++++---------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index ec3e558dfc..4dbbf93d9a 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -24,7 +24,7 @@ namespace scan uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; uint32_t topLevel; uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; - } + }; Parameters_t getParameters(); diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 9f1f75b117..41dacf4328 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -141,7 +141,7 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about Console devs get to bring a gun to a knife fight... 
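
A rough usage sketch (illustrative only; it relies solely on the methods declared below, and the descriptor-set and barrier setup shown as comments is an assumption about the caller's code, not part of this class):

	// assumes `scanner` is a CScanner* and `cmdbuf` is a compute-capable IGPUCommandBuffer* in the recording state
	auto* pipeline = scanner->getDefaultPipeline(CScanner::EST_EXCLUSIVE,CScanner::EDT_UINT,CScanner::EO_ADD);
	cmdbuf->bindComputePipeline(pipeline);
	// write the scanned buffer to binding 0 and the scratch buffer to binding 1 of a descriptor set made from the
	// scanner's layout, bind it, fill the DefaultPushConstants and DispatchInfo for the element count, then record
	// the dispatch together with any required buffer barriers via dispatchHelper(...)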
**/ -#if 0 // legacy & port to HLSL +#if 1 // legacy & port to HLSL class CScanner final : public core::IReferenceCounted { public: @@ -267,15 +267,14 @@ class CScanner final : public core::IReferenceCounted CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) { assert(core::isPoT(m_workgroupSize)); - - const asset::SPushConstantRange pc_range = { asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants) }; + const asset::SPushConstantRange pc_range[] = {asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; const IGPUDescriptorSetLayout::SBinding bindings[2] = { { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch }; - m_ds_layout = m_device->createDescriptorSetLayout(bindings,bindings+sizeof(bindings)/sizeof(IGPUDescriptorSetLayout::SBinding)); - m_pipeline_layout = m_device->createPipelineLayout(&pc_range,&pc_range+1,core::smart_refctd_ptr(m_ds_layout)); + m_ds_layout = m_device->createDescriptorSetLayout(bindings); + m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); } // @@ -298,7 +297,7 @@ class CScanner final : public core::IReferenceCounted return m_shaders[scanType][dataType][op].get(); } // - inline IGPUSpecializedShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) + inline IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) { if (!m_specialized_shaders[scanType][dataType][op]) { @@ -306,11 +305,9 @@ class CScanner final : public core::IReferenceCounted cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); - auto gpushader = m_device->createShader(std::move(cpuShader)); + auto gpushader = m_device->createShader(cpuShader.get()); - m_specialized_shaders[scanType][dataType][op] = m_device->createSpecializedShader( - gpushader.get(),{nullptr,nullptr,"main"}); - // , asset::IShader::ESS_COMPUTE, "nbl/builtin/hlsl/scan/direct.hlsl" + m_specialized_shaders[scanType][dataType][op] = gpushader; } return m_specialized_shaders[scanType][dataType][op].get(); } @@ -319,11 +316,18 @@ class CScanner final : public core::IReferenceCounted inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) { // ondemand - if (!m_pipelines[scanType][dataType][op]) - m_pipelines[scanType][dataType][op] = m_device->createComputePipeline( - nullptr,core::smart_refctd_ptr(m_pipeline_layout), - core::smart_refctd_ptr(getDefaultSpecializedShader(scanType,dataType,op)) + if (!m_pipelines[scanType][dataType][op]) { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pipeline_layout.get(); + // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") + params.shader.entryPoint = "main"; + params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op); + + m_device->createComputePipelines( + nullptr, { ¶ms,1 }, + & m_pipelines[scanType][dataType][op] ); + } return 
m_pipelines[scanType][dataType][op].get(); } @@ -349,12 +353,7 @@ class CScanner final : public core::IReferenceCounted video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; for (auto i=0u; i<2u; i++) { - writes[i].dstSet = set; - writes[i].binding = i; - writes[i].arrayElement = 0u; - writes[i].count = 1u; - writes[i].descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER; - writes[i].info = infos+i; + writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i }; } device->updateDescriptorSets(2, writes, 0u, nullptr); @@ -370,17 +369,13 @@ class CScanner final : public core::IReferenceCounted cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); if (srcBufferBarrierCount) { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {}; - info.bufBarrierCount = srcBufferBarrierCount; - info.bufBarriers = srcBufferBarriers; + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { srcBufferBarriers, 1} }; cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); } cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); if (srcBufferBarrierCount) { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {}; - info.bufBarrierCount = dstBufferBarrierCount; - info.bufBarriers = dstBufferBarriers; + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { dstBufferBarriers, 1} }; cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); } } @@ -400,7 +395,7 @@ class CScanner final : public core::IReferenceCounted core::smart_refctd_ptr m_ds_layout; core::smart_refctd_ptr m_pipeline_layout; core::smart_refctd_ptr m_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; - core::smart_refctd_ptr m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr m_pipelines[EST_COUNT][EDT_COUNT][EO_COUNT]; const uint32_t m_workgroupSize; }; From 51b4f741bb946343f70331ac37aa7006db5c328e Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 22 Apr 2024 00:12:34 +0300 Subject: [PATCH 06/25] Fix CScanner and related shaders to properly compile --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 2 +- .../builtin/hlsl/scan/default_scheduler.hlsl | 15 ++++++--------- include/nbl/builtin/hlsl/scan/descriptors.hlsl | 11 ++++++----- include/nbl/builtin/hlsl/scan/direct.hlsl | 8 +++++--- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 12 +++++------- include/nbl/video/utilities/CScanner.h | 18 +++++++++--------- include/nbl/video/utilities/IUtilities.h | 8 +++----- src/nbl/video/utilities/CScanner.cpp | 12 ++++++------ 8 files changed, 41 insertions(+), 45 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 4dbbf93d9a..4a3a473f74 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -37,7 +37,7 @@ namespace scan DefaultSchedulerParameters_t getSchedulerParameters(); - template + template void getData( NBL_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 5ec112b065..40d746d7c1 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -15,8 +15,6 @@ namespace hlsl { namespace scan { - -#ifdef HLSL namespace 
scheduler { /** @@ -115,14 +113,14 @@ namespace scheduler * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ template - bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) { - if(SubgroupContiguousIndex() == 0u) + if(workgroup::SubgroupContiguousIndex() == 0u) { uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); sharedScratch.set(0u, originalWGs); } - else if (SubgroupContiguousIndex() == 1u) + else if (workgroup::SubgroupContiguousIndex() == 1u) { sharedScratch.set(1u, 0u); } @@ -130,7 +128,7 @@ namespace scheduler const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? const uint32_t lastLevel = topLevel<<1u; - if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) + if (workgroup::SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[workgroup::SubgroupContiguousIndex()]) { sharedScratch.atomicAdd(1u, 1u); } @@ -146,7 +144,7 @@ namespace scheduler { const uint prevLevel = treeLevel - 1u; localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; - if(SubgroupContiguousIndex() == 0u) + if(workgroup::SubgroupContiguousIndex() == 0u) { uint dependentsCount = 1u; if(treeLevel <= topLevel) @@ -179,7 +177,7 @@ namespace scheduler void markComplete(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) { glsl::memoryBarrierBuffer(); - if (SubgroupContiguousIndex()==0u) + if (workgroup::SubgroupContiguousIndex()==0u) { uint32_t finishedFlagOffset = params.finishedFlagOffset[treeLevel]; if (treeLevel // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... struct Scratch { uint32_t workgroupsStarted; @@ -25,9 +25,9 @@ struct Scratch }; [[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) -[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model +[[vk::binding(1 ,0)]] RWStructuredBuffer /*globallycoherent (seems we can't use along with VMM)*/ scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model -template +template void getData( NBL_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, @@ -36,6 +36,7 @@ void getData( NBL_CONST_REF_ARG(uint32_t) pseudoLevel ) { + glsl::memoryBarrierBuffer(); // scanScratchBuf can't be declared as coherent due to VMM(?) 
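A note on the scratch layout that both descriptor bindings above feed into: scanScratchBuf holds a single Scratch struct, i.e. an atomic workgroupsStarted counter followed by one flat data[] array, where the per-level finish flags live at finishedFlagOffset[level] and each level's per-workgroup reductions start at temporaryStorageOffset[level]. A minimal addressing sketch, assuming the offsets produced by computeParameters (the helper name below is illustrative, not part of this patch):

    // slot where workgroup `wg` of level `lvl` publishes its reduction for the next level,
    // mirroring the indexing used by setData()/getData(); illustrative helper only
    uint32_t reductionSlot(NBL_CONST_REF_ARG(Parameters_t) params, const uint32_t lvl, const uint32_t wg)
    {
        return params.temporaryStorageOffset[lvl] + wg; // the offsets already point past the finish-flag region
    }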
const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders uint32_t offset = levelInvocationIndex; @@ -45,7 +46,7 @@ void getData( if (pseudoLevel!=treeLevel) // downsweep { - const bool firstInvocationInGroup = SubgroupContiguousIndex()==0u; + const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; if (bool(localWorkgroupIndex) && firstInvocationInGroup) data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; @@ -89,7 +90,7 @@ void setData( const Parameters_t params = getParameters(); if (treeLevel::value; +// ITEMS_PER_WG = WORKGROUP_SIZE +static const uint32_t SharedScratchSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; // TODO: Can we make it a static variable? groupshared uint32_t wgScratch[SharedScratchSz]; @@ -29,7 +31,7 @@ struct WGScratchProxy uint32_t atomicAdd(uint32_t ix, uint32_t val) { - return glsl::atomicAdd(wgScratch[ix + offset], val); + return nbl::hlsl::glsl::atomicAdd(wgScratch[ix + offset], val); } void workgroupExecutionAndMemoryBarrier() @@ -86,5 +88,5 @@ DefaultSchedulerParameters_t getSchedulerParameters() [numthreads(WORKGROUP_SIZE,1,1)] void main() { - nbl::hlsl::scan::main(accessor); + nbl::hlsl::scan::main, Storage_t, true, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor); } \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index a8319f4366..7c39dd98c0 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -2,12 +2,10 @@ #define _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_ // TODO (PentaKon): Decide if these are needed once we have a clearer picture of the refactor -#include "nbl/builtin/hlsl/limits/numeric.hlsl" -#include "nbl/builtin/hlsl/math/typeless_arithmetic.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined #include "nbl/builtin/hlsl/scan/declarations.hlsl" #include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" -#include "nbl/builtin/hlsl/binops.hlsl" namespace nbl { @@ -19,8 +17,8 @@ namespace scan void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); - const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (gl_WorkGroupSize().x - 1); + const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); + const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex() == (glsl::gl_WorkGroupSize().x - 1); const uint32_t lastLevel = params.topLevel << 1u; const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; @@ -29,10 +27,10 @@ namespace scan // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? - Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? + Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? 
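For context, WORKGROUP_SIZE, Storage_t and BINOP are not defined anywhere in these headers; the host injects them when it compiles direct.hlsl/indirect.hlsl (see the createOverridenCopy call in CScanner.cpp further down in this patch). For one permutation the injected preamble would look roughly as follows; the concrete values are made up for illustration:

    #define WORKGROUP_SIZE 256
    typedef uint32_t Storage_t;    // EDT_UINT
    #define BINOP nbl::hlsl::plus  // EO_ADD
    #define SCRATCH_SZ 1280        // scratch element count supplied by the host (illustrative value)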
if(inRange) { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); + getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); } if(treeLevel < params.topLevel) diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 41dacf4328..ff2250e544 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -284,24 +284,24 @@ class CScanner final : public core::IReferenceCounted inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } // You need to override this shader with your own defintion of `nbl_glsl_scan_getIndirectElementCount` for it to even compile, so we always give you a new shader - core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const + core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const { - return createShader(true,scanType,dataType,op); + return createShader(true,scanType,dataType,op,scratchSz); } // - inline asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) + inline asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { if (!m_shaders[scanType][dataType][op]) - m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op); + m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz); return m_shaders[scanType][dataType][op].get(); } // - inline IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) + inline IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { if (!m_specialized_shaders[scanType][dataType][op]) { - auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op)); + auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op,scratchSz)); cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); @@ -313,7 +313,7 @@ class CScanner final : public core::IReferenceCounted } // - inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) + inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { // ondemand if (!m_pipelines[scanType][dataType][op]) { @@ -321,7 +321,7 @@ class CScanner final : public core::IReferenceCounted params.layout = m_pipeline_layout.get(); // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") params.shader.entryPoint = "main"; - params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op); + params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op, scratchSz); m_device->createComputePipelines( nullptr, { ¶ms,1 }, @@ -388,7 +388,7 @@ class CScanner final : public core::IReferenceCounted // all drop themselves automatically } - core::smart_refctd_ptr createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const; + core::smart_refctd_ptr 
createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; core::smart_refctd_ptr m_device; diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 23e609e14c..73304645d1 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -134,9 +134,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); // smaller workgroups fill occupancy gaps better, especially on new Nvidia GPUs, but we don't want too small workgroups on mobile // TODO: investigate whether we need to clamp against 256u instead of 128u on mobile +#endif const auto scan_workgroup_size = core::max(core::roundDownToPoT(limits.maxWorkgroupSize[0]) >> 1u, 128u); m_scanner = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device), scan_workgroup_size); -#endif } inline ~IUtilities() @@ -162,13 +162,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted { return m_propertyPoolHandler.get(); } - - //! +#endif virtual CScanner* getDefaultScanner() const { return m_scanner.get(); } -#endif //! This function provides some guards against streamingBuffer fragmentation or allocation failure static uint32_t getAllocationSizeForStreamingBuffer(const size_t size, const uint64_t alignment, uint32_t maxFreeBlock, const uint32_t optimalTransferAtom) { @@ -613,8 +611,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted #if 0 // TODO: port core::smart_refctd_ptr m_propertyPoolHandler; - core::smart_refctd_ptr m_scanner; #endif + core::smart_refctd_ptr m_scanner; }; class ImageRegionIterator diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index 6fb88074b6..06dfd076e3 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -3,7 +3,7 @@ using namespace nbl; using namespace video; -core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const +core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const { auto system = m_device->getPhysicalDevice()->getSystem(); core::smart_refctd_ptr hlsl; @@ -46,7 +46,7 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi break; } - bool isExclusive = scanType == EST_EXCLUSIVE; + const char* isExclusive = scanType == EST_EXCLUSIVE ? 
"true" : "false"; const char* binop = nullptr; switch (op) @@ -76,10 +76,10 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi assert(false); break; } - - return asset::CGLSLCompiler::createOverridenCopy( + + return asset::CHLSLCompiler::createOverridenCopy( cpushader.get(), - "#define WORKGROUP_SIZE %d\nconst bool isExclusive = %b\ntypedef storageType %s\n#define BINOP nbl::hlsl::%s\n", - m_workgroupSize,isExclusive,storageType,binop + "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", + m_workgroupSize, storageType, binop, scratchSz ); } \ No newline at end of file From 3d29bc3f2a3f13f95452987a04ecf0bafc34bfaf Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sat, 4 May 2024 17:53:48 +0300 Subject: [PATCH 07/25] Update formatting and fix scratch buffer size --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 48 +-- .../builtin/hlsl/scan/default_scheduler.hlsl | 324 +++++++++--------- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 76 ++-- include/nbl/builtin/hlsl/scan/direct.hlsl | 28 +- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 88 ++--- 5 files changed, 282 insertions(+), 282 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 4a3a473f74..3dafd0b1b6 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -20,13 +20,13 @@ namespace hlsl namespace scan { // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) - struct Parameters_t { - uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; - uint32_t topLevel; - uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; - }; + struct Parameters_t { + uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; + uint32_t topLevel; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; + }; - Parameters_t getParameters(); + Parameters_t getParameters(); struct DefaultSchedulerParameters_t { @@ -37,24 +37,24 @@ namespace scan DefaultSchedulerParameters_t getSchedulerParameters(); - template - void getData( - NBL_REF_ARG(Storage_t) data, - NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, - NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, - NBL_CONST_REF_ARG(uint32_t) treeLevel, - NBL_CONST_REF_ARG(uint32_t) pseudoLevel - ); - - template - void setData( - NBL_CONST_REF_ARG(Storage_t) data, - NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, - NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, - NBL_CONST_REF_ARG(uint32_t) treeLevel, - NBL_CONST_REF_ARG(uint32_t) pseudoLevel, - NBL_CONST_REF_ARG(bool) inRange - ); + template + void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel + ); + + template + void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange + ); } } diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 40d746d7c1..71dbc152aa 100644 --- 
a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -17,181 +17,181 @@ namespace scan { namespace scheduler { - /** - * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation - * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions - * that are used for synchronization etc. - * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is - * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of - * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and - * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will - * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results - * in the downsweep phase of Blelloch Scan. - * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. - * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. - * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the - * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. - * - * Parameter meaning: - * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer - * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) - * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. It is used when storing the REDUCTION result of each workgroup scan - * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) - * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps - * synchronizing between workgroups with while-loop spinning. - */ - void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) - { + /** + * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation + * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions + * that are used for synchronization etc. + * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is + * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of + * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and + * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will + * have an array of size 40. Only a single workgroup will be needed to work on that. 
After that array is scanned, we use the results + * in the downsweep phase of Blelloch Scan. + * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. + * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. + * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the + * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. + * + * Parameter meaning: + * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer + * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) + * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. It is used when storing the REDUCTION result of each workgroup scan + * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) + * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps + * synchronizing between workgroups with while-loop spinning. + */ + void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) + { #define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x); - _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; - - for (uint32_t i=0; i>workgroupSizeLog2; - i = next; - } - _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); - _schedulerParams.finishedFlagOffset[0] = 0u; - switch(_scanParams.topLevel) - { - case 1u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = 1u; - - _scanParams.temporaryStorageOffset[0] = 2u; - break; - case 2u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; - // climb down - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - break; - case 3u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[3] = 
_schedulerParams.cumulativeWorkgroupCount[2]+1u; - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; - // climb down - _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; - _schedulerParams.finishedFlagOffset[5] = _schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); - break; - default: - break; + _scanParams.lastElement[0] = elementCount-1u; + _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; + + for (uint32_t i=0; i>workgroupSizeLog2; + i = next; + } + _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); + _schedulerParams.finishedFlagOffset[0] = 0u; + switch(_scanParams.topLevel) + { + case 1u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = 1u; + + _scanParams.temporaryStorageOffset[0] = 2u; + break; + case 2u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; + _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); + _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; + // climb down + _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; + + _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); + _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); + break; + case 3u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); + _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u; + _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); + _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); + 
_schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); + _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; + // climb down + _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; + _schedulerParams.finishedFlagOffset[5] = _schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); + + _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); + _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); + _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); + break; + default: + break; #if NBL_BUILTIN_MAX_SCAN_LEVELS>7 #error "Switch needs more cases" #endif - } + } #undef WorkgroupCount - } - - /** - * treeLevel - the current level in the Blelloch Scan - * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. - * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. - */ - template - bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) - { - if(workgroup::SubgroupContiguousIndex() == 0u) - { - uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + } + + /** + * treeLevel - the current level in the Blelloch Scan + * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. + * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. + */ + template + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) + { + if(workgroup::SubgroupContiguousIndex() == 0u) + { + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); sharedScratch.set(0u, originalWGs); - } - else if (workgroup::SubgroupContiguousIndex() == 1u) - { - sharedScratch.set(1u, 0u); - } - sharedScratch.workgroupExecutionAndMemoryBarrier(); + } + else if (workgroup::SubgroupContiguousIndex() == 1u) + { + sharedScratch.set(1u, 0u); + } + sharedScratch.workgroupExecutionAndMemoryBarrier(); - const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? - const uint32_t lastLevel = topLevel<<1u; - if (workgroup::SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[workgroup::SubgroupContiguousIndex()]) - { + const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? 
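To make the computeParameters bookkeeping above concrete, a worked example assuming 10000 elements and a 256-wide workgroup (hand-derived for illustration, not output of the code):

    // lastElement[0] = 9999, workgroupSizeLog2 = 8  ->  topLevel = firstbithigh(9999)/8 = 13/8 = 1
    // lastElement[1] = 9999 >> 8 = 39                ->  WorkgroupCount(0) = 40, plus a single top-level workgroup
    // cumulativeWorkgroupCount = { 40, 41, 81 }      //  40 upsweep WGs, 1 top-level WG, 40 downsweep WGs
    // finishedFlagOffset       = { 0, 1 }            //  flag slots at the start of the scratch data[]
    // temporaryStorageOffset[0] = 2                  //  the 40 per-workgroup reductions start right after
    // i.e. 81 virtual workgroups in total service one 10k-element scan.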
+ const uint32_t lastLevel = topLevel<<1u; + if (workgroup::SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[workgroup::SubgroupContiguousIndex()]) + { sharedScratch.atomicAdd(1u, 1u); - } - sharedScratch.workgroupExecutionAndMemoryBarrier(); - - treeLevel = sharedScratch.get(1u); - if(treeLevel>lastLevel) + } + sharedScratch.workgroupExecutionAndMemoryBarrier(); + + treeLevel = sharedScratch.get(1u); + if(treeLevel>lastLevel) return true; - - localWorkgroupIndex = globalWorkgroupIndex; - const bool dependentLevel = treeLevel != 0u; - if(dependentLevel) - { - const uint prevLevel = treeLevel - 1u; - localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; - if(workgroup::SubgroupContiguousIndex() == 0u) - { - uint dependentsCount = 1u; - if(treeLevel <= topLevel) - { - dependentsCount = glsl::gl_WorkGroupSize().x; - const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; - if (lastWorkgroup) - { - const Parameters_t scanParams = getParameters(); - dependentsCount = scanParams.lastElement[treeLevel]+1u; - if (treeLeveltopLevel) // !(prevLeveltopLevel) // !(prevLevel // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... struct Scratch { uint32_t workgroupsStarted; @@ -42,21 +42,21 @@ void getData( uint32_t offset = levelInvocationIndex; const bool notFirstOrLastLevel = bool(pseudoLevel); if (notFirstOrLastLevel) - offset += params.temporaryStorageOffset[pseudoLevel-1u]; + offset += params.temporaryStorageOffset[pseudoLevel-1u]; if (pseudoLevel!=treeLevel) // downsweep - { - const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; - if (bool(localWorkgroupIndex) && firstInvocationInGroup) - data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; + { + const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; + if (bool(localWorkgroupIndex) && firstInvocationInGroup) + data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; - if (notFirstOrLastLevel) - { - if (!firstInvocationInGroup) - data = scanScratchBuf[0].data[offset-1u]; - } - else - { + if (notFirstOrLastLevel) + { + if (!firstInvocationInGroup) + data = scanScratchBuf[0].data[offset-1u]; + } + else + { if(isExclusive) { if (!firstInvocationInGroup) @@ -66,15 +66,15 @@ void getData( { data += scanBuffer[offset]; } - } - } - else - { - if (notFirstOrLastLevel) - data = scanScratchBuf[0].data[offset]; - else - data = scanBuffer[offset]; - } + } + } + else + { + if (notFirstOrLastLevel) + data = scanScratchBuf[0].data[offset]; + else + data = scanBuffer[offset]; + } } template @@ -88,22 +88,22 @@ void setData( ) { const Parameters_t params = getParameters(); - if (treeLevel struct WGScratchProxy { - uint32_t get(const uint32_t ix) - { - return wgScratch[ix+offset]; - } - void set(const uint32_t ix, const uint32_t value) - { - wgScratch[ix+offset] = value; - } + uint32_t get(const uint32_t ix) + { + return wgScratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + wgScratch[ix+offset] = value; + } uint32_t atomicAdd(uint32_t ix, uint32_t val) { return nbl::hlsl::glsl::atomicAdd(wgScratch[ix + offset], val); } - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); 
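WGScratchProxy above is one concrete accessor; the scheduler and the workgroup reduce/scan primitives only rely on this informal shape, inferred from the code in this series rather than a declared interface:

    // illustrative accessor shape over groupshared scratch (not a real type in the codebase)
    struct AccessorShape
    {
        uint32_t get(const uint32_t ix);                           // read shared scratch
        void set(const uint32_t ix, const uint32_t value);         // write shared scratch
        uint32_t atomicAdd(const uint32_t ix, const uint32_t val); // atomics on shared scratch
        void workgroupExecutionAndMemoryBarrier();                 // glsl::barrier()
    };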
- //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } }; static WGScratchProxy<0> accessor; @@ -62,7 +62,7 @@ ScanPushConstants spc; */ uint32_t globalIndex() { - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); } namespace nbl diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 7c39dd98c0..8425b096a3 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -13,67 +13,67 @@ namespace hlsl { namespace scan { - template - void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) - { - const Parameters_t params = getParameters(); - const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); - const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex() == (glsl::gl_WorkGroupSize().x - 1); + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) + { + const Parameters_t params = getParameters(); + const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); + const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex() == (glsl::gl_WorkGroupSize().x - 1); - const uint32_t lastLevel = params.topLevel << 1u; - const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + const uint32_t lastLevel = params.topLevel << 1u; + const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; + const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? - Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? - if(inRange) - { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); - } + Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? 
+ if(inRange) + { + getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); + } - if(treeLevel < params.topLevel) - { + if(treeLevel < params.topLevel) + { data = workgroup::reduction::template __call(data,accessor); - } + } else if (!isExclusive && params.topLevel == 0u) { data = workgroup::inclusive_scan::template __call(data,accessor); } - else if (treeLevel != params.topLevel) - { - data = workgroup::inclusive_scan::template __call(data,accessor); - } - else - { - data = workgroup::exclusive_scan::template __call(data,accessor); - } - setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); - } + else if (treeLevel != params.topLevel) + { + data = workgroup::inclusive_scan::template __call(data,accessor); + } + else + { + data = workgroup::exclusive_scan::template __call(data,accessor); + } + setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); + } - DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - template + DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components + template void main(NBL_REF_ARG(Accessor) accessor) - { - const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint32_t topLevel = getParameters().topLevel; - // persistent workgroups - while (true) - { + { + const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); + const uint32_t topLevel = getParameters().topLevel; + // persistent workgroups + while (true) + { // REVIEW: Need to create accessor here. // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
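A quick map of which workgroup primitive virtualWorkgroup above selects per virtual level, assuming an exclusive scan with topLevel == 2 (hand-derived for illustration):

    // treeLevel  : 0          1          2               3               4     (lastLevel = topLevel<<1 = 4)
    // pseudoLevel: 0          1          2               1               0
    // operation  : reduction  reduction  exclusive_scan  inclusive_scan  inclusive_scan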
- uint32_t treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) - { - return; - } + uint32_t treeLevel,localWorkgroupIndex; + if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) + { + return; + } - virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); + virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); - scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); - } - } + scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); + } + } } } } From af438bc619c72eede0ca42f7932028df9f94648e Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Fri, 10 May 2024 14:38:23 +0300 Subject: [PATCH 08/25] Intermediary update of code before re-implementation --- .../builtin/hlsl/scan/default_scheduler.hlsl | 30 ++++++++----------- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 1 - .../builtin/hlsl/scan/virtual_workgroup.hlsl | 7 ++++- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 71dbc152aa..d240a271a1 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -115,24 +115,20 @@ namespace scheduler template bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) { - if(workgroup::SubgroupContiguousIndex() == 0u) - { - uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); - sharedScratch.set(0u, originalWGs); - } - else if (workgroup::SubgroupContiguousIndex() == 1u) - { - sharedScratch.set(1u, 0u); - } - sharedScratch.workgroupExecutionAndMemoryBarrier(); - - const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? const uint32_t lastLevel = topLevel<<1u; - if (workgroup::SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[workgroup::SubgroupContiguousIndex()]) - { - sharedScratch.atomicAdd(1u, 1u); - } - sharedScratch.workgroupExecutionAndMemoryBarrier(); + if(workgroup::SubgroupContiguousIndex() == 0u) + { + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + sharedScratch.set(0u, originalWGs); + treeLevel = 0; + while (originalWGs>=params.cumulativeWorkgroupCount[treeLevel]) { // doesn't work for now because PushConstant arrays can only be accessed by dynamically uniform indices + treeLevel++; + } + sharedScratch.set(1u, treeLevel); + } + sharedScratch.workgroupExecutionAndMemoryBarrier(); + const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); + treeLevel = sharedScratch.get(1u); treeLevel = sharedScratch.get(1u); if(treeLevel>lastLevel) diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 9aaa672d3b..10572e984a 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -36,7 +36,6 @@ void getData( NBL_CONST_REF_ARG(uint32_t) pseudoLevel ) { - glsl::memoryBarrierBuffer(); // scanScratchBuf can't be declared as coherent due to VMM(?) 
const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders uint32_t offset = levelInvocationIndex; diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 8425b096a3..97b76035af 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -26,7 +26,12 @@ namespace scan // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? - + + // Seems that even though it's a mem barrier is must NOT diverge! + // This was called inside getData but due to inRange it can possibly diverge + // so it must be called outside! Not yet sure why a mem barrier must be uniformly called. + glsl::memoryBarrierBuffer(); // scanScratchBuf can't be declared as coherent due to VMM(?) + Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { From 61d480678a3801b314bf1bdb5c28952157f4e22e Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 20 May 2024 03:16:23 +0300 Subject: [PATCH 09/25] Initial implementation of global reduce --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 3 +- .../builtin/hlsl/scan/default_scheduler.hlsl | 3 +- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 14 ++-- include/nbl/builtin/hlsl/scan/scheduler.hlsl | 80 +++++++++++++++++++ .../builtin/hlsl/scan/virtual_workgroup.hlsl | 40 +++++----- .../nbl/builtin/hlsl/workgroup/broadcast.hlsl | 2 +- include/nbl/video/utilities/CScanner.h | 33 ++++++-- 7 files changed, 137 insertions(+), 38 deletions(-) create mode 100644 include/nbl/builtin/hlsl/scan/scheduler.hlsl diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 3dafd0b1b6..699d345f15 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -30,8 +30,9 @@ namespace scan struct DefaultSchedulerParameters_t { - uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; + uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_SCAN_LEVELS]; + uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_SCAN_LEVELS]; }; diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index d240a271a1..4ffe926a4b 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -110,7 +110,7 @@ namespace scheduler /** * treeLevel - the current level in the Blelloch Scan * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. - * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. + * For example, if we have dispatched 10 workgroups and we the virtu al workgroup number is 35, then the localWorkgroupIndex should be 5. 
*/ template bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) @@ -128,7 +128,6 @@ namespace scheduler } sharedScratch.workgroupExecutionAndMemoryBarrier(); const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); - treeLevel = sharedScratch.get(1u); treeLevel = sharedScratch.get(1u); if(treeLevel>lastLevel) diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 10572e984a..60ea6a771a 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -17,21 +17,21 @@ namespace hlsl namespace scan { -template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +template struct Scratch { uint32_t workgroupsStarted; uint32_t data[scratchElementCount]; }; -[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) [[vk::binding(1 ,0)]] RWStructuredBuffer /*globallycoherent (seems we can't use along with VMM)*/ scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model template void getData( NBL_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, - NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) pseudoLevel ) @@ -46,8 +46,8 @@ void getData( if (pseudoLevel!=treeLevel) // downsweep { const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; - if (bool(localWorkgroupIndex) && firstInvocationInGroup) - data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; + if (bool(levelWorkgroupIndex) && firstInvocationInGroup) + data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; if (notFirstOrLastLevel) { @@ -80,7 +80,7 @@ template void setData( NBL_CONST_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, - NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) pseudoLevel, NBL_CONST_REF_ARG(bool) inRange @@ -91,7 +91,7 @@ void setData( { const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex()==(glsl::gl_WorkGroupSize().x-1); if (lastInvocationInGroup) - scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[treeLevel]] = data; + scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[treeLevel]] = data; } else if (inRange) { diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl new file mode 100644 index 0000000000..9aad4f0121 --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -0,0 +1,80 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ +#define _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scheduler +{ + + static Scheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + Scheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } + + template + bool getWork(NBL_REF_ARG(Accessor) accessor) + { + if (workgroup::SubgroupContiguousIndex()==0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted,1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex <= schedParams.cumulativeWorkgroupCount[level] - 1u; + } + + template + void markDone(NBL_REF_ARG(Accessor) accessor) + { + if (level==(params.topLevel)) // check if we reached the lastLevel + { + uberWorkgroupIndex = ~0u; + return; + } + + if (workgroup::SubgroupContiguousIndex()==0u) + { + // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset + const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; + const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u) + 1u; + if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + { + level++; + } + } + level = workgroup::BroadcastFirst(level, accessor); + } + + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0u ? 
uberWorkgroupIndex : (uberWorkgroupIndex - schedParams.cumulativeWorkgroupCount[level-1u]); + } + + Parameters_t params; + DefaultSchedulerParameters_t schedParams; + uint32_t uberWorkgroupIndex; + uint16_t level; +}; + +} +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 97b76035af..638593ae42 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -5,7 +5,7 @@ #include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined #include "nbl/builtin/hlsl/scan/declarations.hlsl" -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" +#include "nbl/builtin/hlsl/scan/scheduler.hlsl" namespace nbl { @@ -13,14 +13,14 @@ namespace hlsl { namespace scan { - template - void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); - const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); - const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex() == (glsl::gl_WorkGroupSize().x - 1); - + const uint32_t levelInvocationIndex = levelWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); const uint32_t lastLevel = params.topLevel << 1u; + + // pseudoLevel is the level index going up until toplevel then back down const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; @@ -35,48 +35,46 @@ namespace scan Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? 
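 // Illustration (not part of the patch, assuming an example topLevel of 2): lastLevel = topLevel<<1 = 4
 // and the treeLevel -> pseudoLevel fold walks back down past the top of the reduction tree:
 //   treeLevel   : 0 1 2 3 4
 //   pseudoLevel : 0 1 2 1 0
 // so each downsweep level reuses the temporaryStorageOffset slot of its mirrored upsweep level in getData/setData.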
if(inRange) { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); + getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel); } if(treeLevel < params.topLevel) { - data = workgroup::reduction::template __call(data,accessor); + data = workgroup::reduction::template __call(data,accessor); } else if (!isExclusive && params.topLevel == 0u) { - data = workgroup::inclusive_scan::template __call(data,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan::template __call(data,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else { - data = workgroup::exclusive_scan::template __call(data,accessor); + data = workgroup::exclusive_scan::template __call(data,accessor); } - setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); } DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - template + template void main(NBL_REF_ARG(Accessor) accessor) { + const Parameters_t params = getParameters(); const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint32_t topLevel = getParameters().topLevel; + Scheduler scheduler = Scheduler::create(params, schedulerParams); // persistent workgroups while (true) { - // REVIEW: Need to create accessor here. - // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? - uint32_t treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) + if (!scheduler.getWork(accessor)) { return; } - virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); - - scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + scheduler.markDone(accessor); } } } diff --git a/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl b/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl index 97ef1f7dd0..ac5c889ad0 100644 --- a/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl @@ -35,7 +35,7 @@ T Broadcast(NBL_CONST_REF_ARG(T) val, NBL_REF_ARG(Accessor) accessor, const uint template T BroadcastFirst(NBL_CONST_REF_ARG(T) val, NBL_REF_ARG(Accessor) accessor) { - return Broadcast(val,0); + return Broadcast(val, accessor, 0); } } diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index ff2250e544..1868d14f77 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -212,14 +212,25 @@ class CScanner final : public core::IReferenceCounted { SchedulerParameters() { - std::fill_n(finishedFlagOffset,Parameters::MaxScanLevels-1,0u); + //std::fill_n(finishedFlagOffset,Parameters::MaxScanLevels-1,0u); std::fill_n(cumulativeWorkgroupCount,Parameters::MaxScanLevels,0u); + std::fill_n(workgroupFinishFlagsOffset, Parameters::MaxScanLevels, 0u); + std::fill_n(lastWorkgroupSetCountForLevel,Parameters::MaxScanLevels,0u); } // given the number of elements and workgroup size, figure out how many atomics we need // also account for the fact that we will want to use the same scratch buffer both 
for the // scheduler's atomics and the aux data storage SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() { + /*For 696969 + We will launch 1362 WGs + CumulativeWGCountPerLvl will be : + [1362 + 3 + 1] + WGFinFlagOFfsets + inclusive_scan([0, roundUp(1362 / 512), roundUp(3 / 512)]) + LastWorkgroupSetCountForLevel + [1362 & (WGSZ - 1), 3 & (WGSZ - 1), 1 & (WGSZ - 1)]*/ + outScanParams = Parameters(_elementCount,workgroupSize); const auto topLevel = outScanParams.topLevel; @@ -228,16 +239,24 @@ class CScanner final : public core::IReferenceCounted cumulativeWorkgroupCount[i] += 1u; std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); - std::copy_n(cumulativeWorkgroupCount+1u,topLevel,finishedFlagOffset); + for (auto i = 0u; i <= topLevel; i++) { + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; + lastWorkgroupSetCountForLevel[i] = cumulativeWorkgroupCount[i] & (workgroupSize - 1u); + } + + /*std::copy_n(cumulativeWorkgroupCount+1u,topLevel,finishedFlagOffset); std::copy_n(cumulativeWorkgroupCount+topLevel,topLevel,finishedFlagOffset+topLevel); const auto finishedFlagCount = sizeof(finishedFlagOffset)/sizeof(uint32_t); const auto finishedFlagsSize = std::accumulate(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,0u); - std::exclusive_scan(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,finishedFlagOffset,0u); + std::exclusive_scan(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,finishedFlagOffset,0u);*/ + + const auto wgFinishedFlagsSize = std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxScanLevels, 0u); + std::exclusive_scan(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxScanLevels, workgroupFinishFlagsOffset, 0u); for (auto i=0u; icreateDescriptorSetLayout(bindings); + assert(m_ds_layout && "CScanner Descriptor Set Layout was not created successfully"); m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); + assert(m_pipeline_layout && "CScanner Pipeline Layout was not created successfully"); } // From 44edb59cccd76edd78ae335f9c052af62f71ca1a Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 20 May 2024 04:41:16 +0300 Subject: [PATCH 10/25] Fix issues when WG > 1 --- include/nbl/builtin/hlsl/scan/scheduler.hlsl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl index 9aad4f0121..28f3f31d09 100644 --- a/include/nbl/builtin/hlsl/scan/scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -32,6 +32,11 @@ struct Scheduler template bool getWork(NBL_REF_ARG(Accessor) accessor) { + +glsl::memoryBarrierBuffer(); +if(scanScratchBuf[0u].workgroupsStarted >= schedParams.cumulativeWorkgroupCount[level]) + return false; + if (workgroup::SubgroupContiguousIndex()==0u) { uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted,1u); @@ -54,8 +59,10 @@ struct Scheduler { // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; + const uint32_t lastWorkgroupSetIndexForLevel = (level == 0u ? 
schedParams.cumulativeWorkgroupCount[level] : (schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u])) / WorkgroupSize; const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u) + 1u; - if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + if ((uberWorkgroupIndex < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) { level++; } From cdcc5d0ea55472a682d6dcd8095a6404f97e09d1 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 2 Jun 2024 12:58:49 +0300 Subject: [PATCH 11/25] Change .gitmodules examples to point to fork --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 20441ba171..478f7fa2f4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -69,7 +69,7 @@ branch = main [submodule "examples_tests"] path = examples_tests - url = git@github.com:Devsh-Graphics-Programming/Nabla-Examples-and-Tests.git + url = git@github.com:kpentaris/Nabla-Examples-and-Tests.git [submodule "3rdparty/dxc/dxc"] path = 3rdparty/dxc/dxc url = git@github.com:Devsh-Graphics-Programming/DirectXShaderCompiler.git From 1565fb82090995957b466accdba902fdd7950428 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 2 Jun 2024 13:06:07 +0300 Subject: [PATCH 12/25] update .gitmodules to also point to branch for examples submodule --- .gitmodules | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitmodules b/.gitmodules index 478f7fa2f4..3a54465877 100644 --- a/.gitmodules +++ b/.gitmodules @@ -70,6 +70,7 @@ [submodule "examples_tests"] path = examples_tests url = git@github.com:kpentaris/Nabla-Examples-and-Tests.git + branch = master [submodule "3rdparty/dxc/dxc"] path = 3rdparty/dxc/dxc url = git@github.com:Devsh-Graphics-Programming/DirectXShaderCompiler.git From 734e84a6aa805365318af9cadf04f41e22dc979b Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 10 Jun 2024 01:58:44 +0300 Subject: [PATCH 13/25] CReduce implementation --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 16 +- .../builtin/hlsl/scan/default_scheduler.hlsl | 4 +- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 10 +- include/nbl/builtin/hlsl/scan/direct.hlsl | 2 +- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 12 +- include/nbl/video/utilities/CArithmeticOps.h | 286 ++++++++++++++++++ include/nbl/video/utilities/CReduce.h | 36 +++ include/nbl/video/utilities/CScanner.h | 270 +---------------- include/nbl/video/utilities/IUtilities.h | 8 + src/nbl/CMakeLists.txt | 1 + src/nbl/video/utilities/CReduce.cpp | 56 ++++ src/nbl/video/utilities/CScanner.cpp | 119 +++----- 12 files changed, 470 insertions(+), 350 deletions(-) create mode 100644 include/nbl/video/utilities/CArithmeticOps.h create mode 100644 include/nbl/video/utilities/CReduce.h create mode 100644 src/nbl/video/utilities/CReduce.cpp diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 699d345f15..6e6c0f5563 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -9,8 +9,8 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" -#ifndef 
NBL_BUILTIN_MAX_SCAN_LEVELS -#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 +#ifndef NBL_BUILTIN_MAX_LEVELS +#define NBL_BUILTIN_MAX_LEVELS 7 #endif namespace nbl @@ -21,18 +21,18 @@ namespace scan { // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) struct Parameters_t { - uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; + uint32_t lastElement[NBL_BUILTIN_MAX_LEVELS/2+1]; uint32_t topLevel; - uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_LEVELS/2]; }; Parameters_t getParameters(); struct DefaultSchedulerParameters_t { - uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_SCAN_LEVELS]; - uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_SCAN_LEVELS]; + uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_LEVELS]; + uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_LEVELS]; + uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_LEVELS]; }; @@ -47,7 +47,7 @@ namespace scan NBL_CONST_REF_ARG(uint32_t) pseudoLevel ); - template + template void setData( NBL_CONST_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 4ffe926a4b..7f341b4cc2 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -47,7 +47,7 @@ namespace scheduler _scanParams.lastElement[0] = elementCount-1u; _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; - for (uint32_t i=0; i>workgroupSizeLog2; @@ -100,7 +100,7 @@ namespace scheduler break; default: break; -#if NBL_BUILTIN_MAX_SCAN_LEVELS>7 +#if NBL_BUILTIN_MAX_LEVELS>7 #error "Switch needs more cases" #endif } diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 60ea6a771a..c46b2d38b3 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -76,7 +76,7 @@ void getData( } } -template +template void setData( NBL_CONST_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, @@ -95,13 +95,19 @@ void setData( } else if (inRange) { - if (bool(pseudoLevel)) + if (!isScan && treeLevel == params.topLevel) + { + scanBuffer[levelInvocationIndex] = data; + } + else if (bool(pseudoLevel)) { const uint32_t offset = params.temporaryStorageOffset[pseudoLevel-1u]; scanScratchBuf[0].data[levelInvocationIndex+offset] = data; } else + { scanBuffer[levelInvocationIndex] = data; + } } } diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl index f7a6d0fe5b..e7bbcf7233 100644 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ b/include/nbl/builtin/hlsl/scan/direct.hlsl @@ -88,5 +88,5 @@ DefaultSchedulerParameters_t getSchedulerParameters() [numthreads(WORKGROUP_SIZE,1,1)] void main() { - nbl::hlsl::scan::main, Storage_t, true, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor); + nbl::hlsl::scan::main, Storage_t, IS_SCAN, IS_EXCLUSIVE, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor); } \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 
638593ae42..72434d2286 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -13,7 +13,7 @@ namespace hlsl { namespace scan { - template + template void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); @@ -38,7 +38,9 @@ namespace scan getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel); } - if(treeLevel < params.topLevel) + bool doReduce = isScan ? treeLevel < params.topLevel : treeLevel <= params.topLevel; + + if(doReduce) { data = workgroup::reduction::template __call(data,accessor); } @@ -54,11 +56,11 @@ namespace scan { data = workgroup::exclusive_scan::template __call(data,accessor); } - setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); } DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - template + template void main(NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); @@ -72,7 +74,7 @@ namespace scan return; } - virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); accessor.workgroupExecutionAndMemoryBarrier(); scheduler.markDone(accessor); } diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h new file mode 100644 index 0000000000..c57f18839d --- /dev/null +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -0,0 +1,286 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_VIDEO_C_ARITHMETIC_OPS_H_INCLUDED_ +#define _NBL_VIDEO_C_ARITHMETIC_OPS_H_INCLUDED_ + + +#include "nbl/video/IPhysicalDevice.h" +#include "nbl/video/utilities/IDescriptorSetCache.h" + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" + + +namespace nbl::video +{ +class CArithmeticOps : public core::IReferenceCounted +{ + public: + // Only 4 byte wide data types supported due to need to trade the via shared memory, + // different combinations of data type and operator have different identity elements. 
+ // `EDT_INT` and `EO_MIN` will have `INT_MAX` as identity, while `EDT_UINT` would have `UINT_MAX` + enum E_DATA_TYPE : uint8_t + { + EDT_UINT=0u, + EDT_INT, + EDT_FLOAT, + EDT_COUNT + }; + enum E_OPERATOR : uint8_t + { + EO_AND = 0u, + EO_XOR, + EO_OR, + EO_ADD, + EO_MUL, + EO_MIN, + EO_MAX, + EO_COUNT + }; + + // This struct is only for managing where to store intermediate results of the scans + struct Parameters : nbl::hlsl::scan::Parameters_t // this struct and its methods are also available in HLSL + { + static inline constexpr uint32_t MaxLevels = NBL_BUILTIN_MAX_LEVELS; + + Parameters() + { + std::fill_n(lastElement,MaxLevels/2+1,0u); + std::fill_n(temporaryStorageOffset,MaxLevels/2,0u); + } + // build the constant tables for each level given the number of elements to scan and workgroupSize + Parameters(const uint32_t _elementCount, const uint32_t workgroupSize) : Parameters() + { + assert(_elementCount!=0u && "Input element count can't be 0!"); + const auto maxReductionLog2 = hlsl::findMSB(workgroupSize)*(MaxLevels/2u+1u); + assert(maxReductionLog2>=32u||((_elementCount-1u)>>maxReductionLog2)==0u && "Can't scan this many elements with such small workgroups!"); + + lastElement[0u] = _elementCount-1u; + for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) + temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = lastElement[topLevel]/workgroupSize; + + std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); + } + // given already computed tables of lastElement indices per level, number of levels, and storage offsets, tell us total auxillary buffer size needed + inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) + { + uint32_t uint_count = 1u; // workgroup enumerator + uint_count += temporaryStorageOffset[MaxLevels/2u-1u]; // last scratch offset + uint_count += lastElement[topLevel]+1u; // and its size + return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); + } + }; + + // the default scheduler we provide works as described above in the big documentation block + struct SchedulerParameters : nbl::hlsl::scan::DefaultSchedulerParameters_t // this struct and its methods are also available in HLSL + { + SchedulerParameters() + { + std::fill_n(cumulativeWorkgroupCount,Parameters::MaxLevels,0u); + std::fill_n(workgroupFinishFlagsOffset, Parameters::MaxLevels, 0u); + std::fill_n(lastWorkgroupSetCountForLevel,Parameters::MaxLevels,0u); + } + + // given the number of elements and workgroup size, figure out how many atomics we need + // also account for the fact that we will want to use the same scratch buffer both for the + // scheduler's atomics and the aux data storage + SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() + { + outScanParams = Parameters(_elementCount,workgroupSize); + const auto topLevel = outScanParams.topLevel; + + std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); + for (auto i=0u; i<=topLevel; i++) + cumulativeWorkgroupCount[i] += 1u; + std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); + + for (auto i = 0u; i <= topLevel; i++) { + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; + lastWorkgroupSetCountForLevel[i] = cumulativeWorkgroupCount[i] & (workgroupSize - 1u); + } + + const auto wgFinishedFlagsSize = 
std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, 0u); + std::exclusive_scan(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, workgroupFinishFlagsOffset, 0u); + for (auto i=0u; i&& device) : CArithmeticOps(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + + CArithmeticOps(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) + { + assert(core::isPoT(m_workgroupSize)); + const asset::SPushConstantRange pc_range[] = {asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; + const IGPUDescriptorSetLayout::SBinding bindings[2] = { + { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer + { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch + }; + + m_ds_layout = m_device->createDescriptorSetLayout(bindings); + assert(m_ds_layout && "CArithmeticOps Descriptor Set Layout was not created successfully"); + m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); + assert(m_pipeline_layout && "CArithmeticOps Pipeline Layout was not created successfully"); + } + + inline auto getDefaultDescriptorSetLayout() const { return m_ds_layout.get(); } + + inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } + + inline uint32_t getWorkgroupSize() const {return m_workgroupSize;} + + inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo) + { + pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize); + dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize); + } + + static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange& input_range, const asset::SBufferRange& scratch_range) + { + IGPUDescriptorSet::SDescriptorInfo infos[2]; + infos[0].desc = input_range.buffer; + infos[0].info.buffer = {input_range.offset,input_range.size}; + infos[1].desc = scratch_range.buffer; + infos[1].info.buffer = {scratch_range.offset,scratch_range.size}; + + video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; + for (auto i=0u; i<2u; i++) + { + writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i }; + } + + device->updateDescriptorSets(2, writes, 0u, nullptr); + } + + static inline void dispatchHelper( + IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* pipeline_layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, + const uint32_t srcBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* srcBufferBarriers, + const uint32_t dstBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* dstBufferBarriers + ) + { + cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); + if (srcBufferBarrierCount) + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { srcBufferBarriers, 1} }; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + } + cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); + if 
(srcBufferBarrierCount) + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { dstBufferBarriers, 1} }; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + } + } + + inline core::smart_refctd_ptr createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const + { + auto system = m_device->getPhysicalDevice()->getSystem(); + core::smart_refctd_ptr hlsl; + { + auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr + { + nbl::system::ISystem::future_t> future; + system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE); + if (future.wait()) + return future.copy(); + return nullptr; + }; + + // hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect_reduce.hlsl"); ?? + hlsl = loadBuiltinData(shaderFile); + } + + auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); + memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + + // REVIEW: Probably combine the commons parts of CReduce and CScanner + const char* storageType = nullptr; + switch (dataType) + { + case EDT_UINT: + storageType = "uint32_t"; + break; + case EDT_INT: + storageType = "int32_t"; + break; + case EDT_FLOAT: + storageType = "float32_t"; + break; + default: + assert(false); + break; + } + + const char* binop = nullptr; + switch (op) + { + case EO_AND: + binop = "bit_and"; + break; + case EO_OR: + binop = "bit_or"; + break; + case EO_XOR: + binop = "bit_xor"; + break; + case EO_ADD: + binop = "plus"; + break; + case EO_MUL: + binop = "multiplies"; + break; + case EO_MAX: + binop = "maximum"; + break; + case EO_MIN: + binop = "minimum"; + break; + default: + assert(false); + break; + } + + return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", m_workgroupSize, storageType, binop, scratchSz); + } + + inline ILogicalDevice* getDevice() const {return m_device.get();} + protected: + // REVIEW: Does it need an empty destructor? + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_ds_layout; + core::smart_refctd_ptr m_pipeline_layout; + + const uint32_t m_workgroupSize; +}; + +} + +#endif \ No newline at end of file diff --git a/include/nbl/video/utilities/CReduce.h b/include/nbl/video/utilities/CReduce.h new file mode 100644 index 0000000000..2b4e1a6374 --- /dev/null +++ b/include/nbl/video/utilities/CReduce.h @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_VIDEO_C_REDUCE_H_INCLUDED_ +#define _NBL_VIDEO_C_REDUCE_H_INCLUDED_ + +#include "nbl/video/utilities/CArithmeticOps.h" + +namespace nbl::video +{ + + class CReduce final : public CArithmeticOps + { + + public: + CReduce(core::smart_refctd_ptr&& device) : CReduce(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + CReduce(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} + asset::ICPUShader* getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); + IGPUShader* getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); + IGPUComputePipeline* getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); + core::smart_refctd_ptr createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) const; + + protected: + ~CReduce() + { + // all drop themselves automatically + } + + core::smart_refctd_ptr m_shaders[EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr m_pipelines[EDT_COUNT][EO_COUNT]; + }; +} + +#endif \ No newline at end of file diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 1868d14f77..bef690f50c 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -8,9 +8,10 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/IDescriptorSetCache.h" +#include "nbl/video/utilities/CArithmeticOps.h" #include "nbl/builtin/hlsl/scan/declarations.hlsl" -static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS & 0x1, "NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); +static_assert(NBL_BUILTIN_MAX_LEVELS & 0x1, "NBL_BUILTIN_MAX_LEVELS must be odd!"); namespace nbl::video { @@ -141,8 +142,7 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about Console devs get to bring a gun to a knife fight... **/ -#if 1 // legacy & port to HLSL -class CScanner final : public core::IReferenceCounted +class CScanner final : public CArithmeticOps { public: enum E_SCAN_TYPE : uint8_t @@ -153,256 +153,20 @@ class CScanner final : public core::IReferenceCounted EST_EXCLUSIVE, EST_COUNT }; - // Only 4 byte wide data types supported due to need to trade the via shared memory, - // different combinations of data type and operator have different identity elements. 
- // `EDT_INT` and `EO_MIN` will have `INT_MAX` as identity, while `EDT_UINT` would have `UINT_MAX` - enum E_DATA_TYPE : uint8_t - { - EDT_UINT=0u, - EDT_INT, - EDT_FLOAT, - EDT_COUNT - }; - enum E_OPERATOR : uint8_t - { - EO_AND = 0u, - EO_XOR, - EO_OR, - EO_ADD, - EO_MUL, - EO_MIN, - EO_MAX, - EO_COUNT - }; - - // This struct is only for managing where to store intermediate results of the scans - struct Parameters : nbl::hlsl::scan::Parameters_t // this struct and its methods are also available in GLSL so you can launch indirect dispatches - { - static inline constexpr uint32_t MaxScanLevels = NBL_BUILTIN_MAX_SCAN_LEVELS; - - Parameters() - { - std::fill_n(lastElement,MaxScanLevels/2+1,0u); - std::fill_n(temporaryStorageOffset,MaxScanLevels/2,0u); - } - // build the constant tables for each level given the number of elements to scan and workgroupSize - Parameters(const uint32_t _elementCount, const uint32_t workgroupSize) : Parameters() - { - assert(_elementCount!=0u && "Input element count can't be 0!"); - const auto maxReductionLog2 = hlsl::findMSB(workgroupSize)*(MaxScanLevels/2u+1u); - assert(maxReductionLog2>=32u||((_elementCount-1u)>>maxReductionLog2)==0u && "Can't scan this many elements with such small workgroups!"); - - lastElement[0u] = _elementCount-1u; - for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) - temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = lastElement[topLevel]/workgroupSize; - - std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); - } - // given already computed tables of lastElement indices per level, number of levels, and storage offsets, tell us total auxillary buffer size needed - inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) - { - uint32_t uint_count = 1u; // workgroup enumerator - uint_count += temporaryStorageOffset[MaxScanLevels/2u-1u]; // last scratch offset - uint_count += lastElement[topLevel]+1u; // and its size - return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); - } - }; - // the default scheduler we provide works as described above in the big documentation block - struct SchedulerParameters : nbl::hlsl::scan::DefaultSchedulerParameters_t // this struct and its methods are also available in GLSL so you can launch indirect dispatches - { - SchedulerParameters() - { - //std::fill_n(finishedFlagOffset,Parameters::MaxScanLevels-1,0u); - std::fill_n(cumulativeWorkgroupCount,Parameters::MaxScanLevels,0u); - std::fill_n(workgroupFinishFlagsOffset, Parameters::MaxScanLevels, 0u); - std::fill_n(lastWorkgroupSetCountForLevel,Parameters::MaxScanLevels,0u); - } - // given the number of elements and workgroup size, figure out how many atomics we need - // also account for the fact that we will want to use the same scratch buffer both for the - // scheduler's atomics and the aux data storage - SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() - { - /*For 696969 - We will launch 1362 WGs - CumulativeWGCountPerLvl will be : - [1362 + 3 + 1] - WGFinFlagOFfsets - inclusive_scan([0, roundUp(1362 / 512), roundUp(3 / 512)]) - LastWorkgroupSetCountForLevel - [1362 & (WGSZ - 1), 3 & (WGSZ - 1), 1 & (WGSZ - 1)]*/ - - outScanParams = Parameters(_elementCount,workgroupSize); - const auto topLevel = outScanParams.topLevel; - - std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); - for (auto i=0u; i<=topLevel; i++) - 
cumulativeWorkgroupCount[i] += 1u; - std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); - - for (auto i = 0u; i <= topLevel; i++) { - workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; - lastWorkgroupSetCountForLevel[i] = cumulativeWorkgroupCount[i] & (workgroupSize - 1u); - } - - /*std::copy_n(cumulativeWorkgroupCount+1u,topLevel,finishedFlagOffset); - std::copy_n(cumulativeWorkgroupCount+topLevel,topLevel,finishedFlagOffset+topLevel); - - const auto finishedFlagCount = sizeof(finishedFlagOffset)/sizeof(uint32_t); - const auto finishedFlagsSize = std::accumulate(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,0u); - std::exclusive_scan(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,finishedFlagOffset,0u);*/ - - const auto wgFinishedFlagsSize = std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxScanLevels, 0u); - std::exclusive_scan(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxScanLevels, workgroupFinishFlagsOffset, 0u); - for (auto i=0u; i&& device) : CScanner(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} - // - CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) - { - assert(core::isPoT(m_workgroupSize)); - const asset::SPushConstantRange pc_range[] = {asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; - const IGPUDescriptorSetLayout::SBinding bindings[2] = { - { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer - { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch - }; - - m_ds_layout = m_device->createDescriptorSetLayout(bindings); - assert(m_ds_layout && "CScanner Descriptor Set Layout was not created successfully"); - m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); - assert(m_pipeline_layout && "CScanner Pipeline Layout was not created successfully"); - } - - // - inline auto getDefaultDescriptorSetLayout() const { return m_ds_layout.get(); } - - // - inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } - + // You need to override this shader with your own defintion of `nbl_glsl_scan_getIndirectElementCount` for it to even compile, so we always give you a new shader - core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const - { - return createShader(true,scanType,dataType,op,scratchSz); - } + //core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const + //{ + // return createShader(true,scanType,dataType,op,scratchSz); + //} // - inline asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) - { - if (!m_shaders[scanType][dataType][op]) - m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz); - return m_shaders[scanType][dataType][op].get(); - } - // - inline IGPUShader* 
getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) - { - if (!m_specialized_shaders[scanType][dataType][op]) - { - auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op,scratchSz)); - cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); - cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); - - auto gpushader = m_device->createShader(cpuShader.get()); - - m_specialized_shaders[scanType][dataType][op] = gpushader; - } - return m_specialized_shaders[scanType][dataType][op].get(); - } - - // - inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) - { - // ondemand - if (!m_pipelines[scanType][dataType][op]) { - IGPUComputePipeline::SCreationParams params = {}; - params.layout = m_pipeline_layout.get(); - // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") - params.shader.entryPoint = "main"; - params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op, scratchSz); - - m_device->createComputePipelines( - nullptr, { ¶ms,1 }, - & m_pipelines[scanType][dataType][op] - ); - } - return m_pipelines[scanType][dataType][op].get(); - } - - // - inline uint32_t getWorkgroupSize() const {return m_workgroupSize;} - - // - inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo) - { - pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize); - dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize); - } - - // - static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange& input_range, const asset::SBufferRange& scratch_range) - { - IGPUDescriptorSet::SDescriptorInfo infos[2]; - infos[0].desc = input_range.buffer; - infos[0].info.buffer = {input_range.offset,input_range.size}; - infos[1].desc = scratch_range.buffer; - infos[1].info.buffer = {scratch_range.offset,scratch_range.size}; - - video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (auto i=0u; i<2u; i++) - { - writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i }; - } - - device->updateDescriptorSets(2, writes, 0u, nullptr); - } - - // Half and sizeof(uint32_t) of the scratch buffer need to be cleared to 0s - static inline void dispatchHelper( - IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* pipeline_layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, - const uint32_t srcBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* srcBufferBarriers, - const uint32_t dstBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* dstBufferBarriers - ) - { - cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { srcBufferBarriers, 1} }; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { dstBufferBarriers, 1} }; - 
cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - } - - inline ILogicalDevice* getDevice() const {return m_device.get();} - + CScanner(core::smart_refctd_ptr&& device) : CScanner(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} + asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + IGPUComputePipeline* getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + core::smart_refctd_ptr createShader(const char* shaderFile, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; protected: ~CScanner() { @@ -411,16 +175,10 @@ class CScanner final : public core::IReferenceCounted core::smart_refctd_ptr createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; - - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_ds_layout; - core::smart_refctd_ptr m_pipeline_layout; core::smart_refctd_ptr m_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr m_pipelines[EST_COUNT][EDT_COUNT][EO_COUNT]; - const uint32_t m_workgroupSize; }; -#endif } #endif diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index baf821f7a2..321058cf50 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -11,6 +11,7 @@ #include "nbl/video/alloc/StreamingTransientDataBuffer.h" #include "nbl/video/utilities/SIntendedSubmitInfo.h" #include "nbl/video/utilities/CPropertyPoolHandler.h" +#include "nbl/video/utilities/CReduce.h" #include "nbl/video/utilities/CScanner.h" #include "nbl/video/utilities/CComputeBlit.h" @@ -134,6 +135,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted // TODO: investigate whether we need to clamp against 256u instead of 128u on mobile #endif const auto scan_workgroup_size = core::max(core::roundDownToPoT(limits.maxWorkgroupSize[0]) >> 1u, 128u); + m_reducer = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device), scan_workgroup_size); m_scanner = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device), scan_workgroup_size); } @@ -161,6 +163,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return m_propertyPoolHandler.get(); } #endif + virtual CReduce* getDefaultReducer() const + { + return m_reducer.get(); + } + virtual CScanner* getDefaultScanner() const { return m_scanner.get(); @@ -721,6 +728,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted #if 0 // TODO: port core::smart_refctd_ptr m_propertyPoolHandler; #endif + core::smart_refctd_ptr m_reducer; core::smart_refctd_ptr m_scanner; }; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index b04188d243..04d9b654a6 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -266,6 +266,7 @@ set(NBL_VIDEO_SOURCES ${NBL_ROOT_PATH}/src/nbl/video/utilities/IPropertyPool.cpp 
${NBL_ROOT_PATH}/src/nbl/video/utilities/IUtilities.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/CPropertyPoolHandler.cpp + ${NBL_ROOT_PATH}/src/nbl/video/utilities/CReduce.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/CScanner.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/CComputeBlit.cpp diff --git a/src/nbl/video/utilities/CReduce.cpp b/src/nbl/video/utilities/CReduce.cpp new file mode 100644 index 0000000000..bc9f745207 --- /dev/null +++ b/src/nbl/video/utilities/CReduce.cpp @@ -0,0 +1,56 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/utilities/CReduce.h" + +namespace nbl::video +{ + +asset::ICPUShader* CReduce::getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +{ + if (!m_shaders[dataType][op]) + m_shaders[dataType][op] = createShader(dataType,op,scratchSz); + return m_shaders[dataType][op].get(); +} + +IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +{ + if (!m_specialized_shaders[dataType][op]) + { + auto cpuShader = core::smart_refctd_ptr(getDefaultShader(dataType,op,scratchSz)); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/reduce.hlsl"); + cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); + + auto gpushader = m_device->createShader(cpuShader.get()); + + m_specialized_shaders[dataType][op] = gpushader; + } + return m_specialized_shaders[dataType][op].get(); +} + +IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +{ + // ondemand + if (!m_pipelines[dataType][op]) { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pipeline_layout.get(); + // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") + params.shader.entryPoint = "main"; + params.shader.shader = getDefaultSpecializedShader(dataType,op,scratchSz); + + m_device->createComputePipelines( + nullptr, { ¶ms,1 }, + & m_pipelines[dataType][op] + ); + } + return m_pipelines[dataType][op].get(); +} + +core::smart_refctd_ptr CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) const +{ + core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); + return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", "false", "false"); +} + +} \ No newline at end of file diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index 06dfd076e3..3c6c4088a3 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -1,85 +1,52 @@ #include "nbl/video/utilities/CScanner.h" -using namespace nbl; -using namespace video; +namespace nbl::video +{ + +asset::ICPUShader* CScanner::getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) +{ + if (!m_shaders[scanType][dataType][op]) + m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz); + return m_shaders[scanType][dataType][op].get(); +} 
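As a quick reference for what these factory methods feed the compiler: a reduction shader built through CReduce::createShader gets the common defines from CArithmeticOps::createBaseShader plus the reduce-specific ones, all layered onto nbl/builtin/hlsl/scan/direct.hlsl via createOverridenCopy. A sketch of the resulting macro set, assuming an illustrative uint-add reduction with a 512-wide workgroup and a scratch size of 1376 uints (all three values are example numbers, not taken from the patch):

    #define WORKGROUP_SIZE 512
    typedef uint32_t Storage_t;
    #define BINOP nbl::hlsl::plus
    #define SCRATCH_SZ 1376
    #define IS_EXCLUSIVE false
    #define IS_SCAN false

A scan built through CScanner::createShader below differs only in IS_SCAN being true and IS_EXCLUSIVE following the requested E_SCAN_TYPE.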
-core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const +IGPUShader* CScanner::getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { - auto system = m_device->getPhysicalDevice()->getSystem(); - core::smart_refctd_ptr hlsl; - { - auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr - { - nbl::system::ISystem::future_t> future; - system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE); - if (future.wait()) - return future.copy(); - return nullptr; - }; + if (!m_specialized_shaders[scanType][dataType][op]) + { + auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op,scratchSz)); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); + cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); - if(indirect) - hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect.hlsl"); - else - hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/direct.hlsl"); - } - auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); - memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); - auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + auto gpushader = m_device->createShader(cpuShader.get()); - // (REVIEW): All of the below, that rely on enumerations, should probably be changed to take advantage of HLSL-CPP compatibility. - // Issue is that CScanner caches all shaders for all permutations of scanType-dataType-operator and it's not clear how to - // do this without enums - const char* storageType = nullptr; - switch (dataType) - { - case EDT_UINT: - storageType = "uint32_t"; - break; - case EDT_INT: - storageType = "int32_t"; - break; - case EDT_FLOAT: - storageType = "float32_t"; - break; - default: - assert(false); - break; - } + m_specialized_shaders[scanType][dataType][op] = gpushader; + } + return m_specialized_shaders[scanType][dataType][op].get(); +} - const char* isExclusive = scanType == EST_EXCLUSIVE ? "true" : "false"; +IGPUComputePipeline* CScanner::getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) +{ + // ondemand + if (!m_pipelines[scanType][dataType][op]) { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pipeline_layout.get(); + // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") + params.shader.entryPoint = "main"; + params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op, scratchSz); + + m_device->createComputePipelines( + nullptr, { ¶ms,1 }, + & m_pipelines[scanType][dataType][op] + ); + } + return m_pipelines[scanType][dataType][op].get(); +} + +core::smart_refctd_ptr CScanner::createShader(const char* shaderFile, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const +{ + core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); + return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", scanType == EST_EXCLUSIVE ? 
"true" : "false", "true"); +} - const char* binop = nullptr; - switch (op) - { - case EO_AND: - binop = "bit_and"; - break; - case EO_OR: - binop = "bit_or"; - break; - case EO_XOR: - binop = "bit_xor"; - break; - case EO_ADD: - binop = "plus"; - break; - case EO_MUL: - binop = "multiplies"; - break; - case EO_MAX: - binop = "maximum"; - break; - case EO_MIN: - binop = "minimum"; - break; - default: - assert(false); - break; - } - - return asset::CHLSLCompiler::createOverridenCopy( - cpushader.get(), - "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", - m_workgroupSize, storageType, binop, scratchSz - ); } \ No newline at end of file From 40953e3cf07bfa89e9a2be7f4670c7f805b78c78 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 17 Jun 2024 11:11:47 +0300 Subject: [PATCH 14/25] Fix issues with global reduce algorithm It should now work with all sizes of inputs. --- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 3 +- include/nbl/builtin/hlsl/scan/scheduler.hlsl | 40 ++++++++++++------- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 5 ++- include/nbl/video/utilities/CArithmeticOps.h | 17 ++++---- 4 files changed, 39 insertions(+), 26 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index c46b2d38b3..f59b5cb53f 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -20,7 +20,7 @@ namespace scan template struct Scratch { - uint32_t workgroupsStarted; + uint32_t workgroupsStarted[NBL_BUILTIN_MAX_LEVELS]; uint32_t data[scratchElementCount]; }; @@ -95,6 +95,7 @@ void setData( } else if (inRange) { + // REVIEW: Check if we have to add levelInvocationIndex in offset or levelWorkgroupIndex as above if (!isScan && treeLevel == params.topLevel) { scanBuffer[levelInvocationIndex] = data; diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl index 28f3f31d09..d7a94e1e78 100644 --- a/include/nbl/builtin/hlsl/scan/scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -30,54 +30,64 @@ struct Scheduler } template - bool getWork(NBL_REF_ARG(Accessor) accessor) + bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() { -glsl::memoryBarrierBuffer(); -if(scanScratchBuf[0u].workgroupsStarted >= schedParams.cumulativeWorkgroupCount[level]) - return false; - + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if(scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + if (workgroup::SubgroupContiguousIndex()==0u) { - uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted,1u); + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level],1u); } uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); // nothing really happens, we just check if there's a workgroup in the level to grab - return uberWorkgroupIndex <= schedParams.cumulativeWorkgroupCount[level] - 1u; + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); } template - void markDone(NBL_REF_ARG(Accessor) accessor) + bool markDone(NBL_REF_ARG(Accessor) accessor) { if (level==(params.topLevel)) // check if we reached the lastLevel { + // not even sure this is needed uberWorkgroupIndex = ~0u; - return; + return true; } + uint32_t prevLevel = 
level; if (workgroup::SubgroupContiguousIndex()==0u) { // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; - const uint32_t lastWorkgroupSetIndexForLevel = (level == 0u ? schedParams.cumulativeWorkgroupCount[level] : (schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u])) / WorkgroupSize; - const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u) + 1u; + const uint32_t lastWorkgroupSetIndexForLevel = totalWorkgroupsPerLevel(level) / WorkgroupSize; + const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u); //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) - if ((uberWorkgroupIndex < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + if (((uberWorkgroupIndex/WorkgroupSize) < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) { level++; } } level = workgroup::BroadcastFirst(level, accessor); + return level == 0 ? false : level == prevLevel; // on level 0 never exit early but on higher levels each workgroup is allowed one operation and exits except if promoted to next level } - uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) + uint32_t levelWorkgroupIndex(const in uint32_t level) { - return level == 0u ? uberWorkgroupIndex : (uberWorkgroupIndex - schedParams.cumulativeWorkgroupCount[level-1u]); + return uberWorkgroupIndex; } + uint32_t totalWorkgroupsPerLevel(const in uint32_t level) + { + return level == 0 ? 
schedParams.cumulativeWorkgroupCount[level] : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u]; + } + Parameters_t params; DefaultSchedulerParameters_t schedParams; - uint32_t uberWorkgroupIndex; + uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex uint16_t level; }; diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 72434d2286..9319ca1306 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -76,7 +76,10 @@ namespace scan virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); accessor.workgroupExecutionAndMemoryBarrier(); - scheduler.markDone(accessor); + if(scheduler.markDone(accessor)) + { + return; + } } } } diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index c57f18839d..b066cbff15 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -58,16 +58,16 @@ class CArithmeticOps : public core::IReferenceCounted lastElement[0u] = _elementCount-1u; for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) - temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = lastElement[topLevel]/workgroupSize; + temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = std::ceil(lastElement[topLevel] / double(workgroupSize)); std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); } - // given already computed tables of lastElement indices per level, number of levels, and storage offsets, tell us total auxillary buffer size needed + // given already computed tables of lastElement indice s per level, number of levels, and storage offsets, tell us total auxillary buffer size needed inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) { - uint32_t uint_count = 1u; // workgroup enumerator + uint32_t uint_count = MaxLevels; // workgroup enumerator uint_count += temporaryStorageOffset[MaxLevels/2u-1u]; // last scratch offset - uint_count += lastElement[topLevel]+1u; // and its size + uint_count += lastElement[topLevel]+1u; // starting from the last storage offset, we also need slots equal to the top level's elementCount (add 1u because 0u based) return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); } }; @@ -91,13 +91,11 @@ class CArithmeticOps : public core::IReferenceCounted const auto topLevel = outScanParams.topLevel; std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); - for (auto i=0u; i<=topLevel; i++) - cumulativeWorkgroupCount[i] += 1u; std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); - + cumulativeWorkgroupCount[topLevel] = 1u; // the top level will always end up with 1 workgroup to do the final reduction for (auto i = 0u; i <= topLevel; i++) { - workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; - lastWorkgroupSetCountForLevel[i] = cumulativeWorkgroupCount[i] & (workgroupSize - 1u); + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; // RECHECK: findMSB(511) == 8u !! Here we assume it's 9u !! 
+ lastWorkgroupSetCountForLevel[i] = (cumulativeWorkgroupCount[i] - 1u) & (workgroupSize - 1u); } const auto wgFinishedFlagsSize = std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, 0u); @@ -127,6 +125,7 @@ class CArithmeticOps : public core::IReferenceCounted { constexpr auto workgroupSpinningProtection = 4u; // to prevent first workgroup starving/idling on level 1 after finishing level 0 early wg_count = limits.computeOptimalPersistentWorkgroupDispatchSize(elementCount,workgroupSize,workgroupSpinningProtection); + assert(wg_count >= elementCount / workgroupSize^2 + 1 && "Too few workgroups! The workgroup count must be at least the elementCount/(wgSize^2)"); } uint32_t wg_count; From 3d252d07612d14b625e4fa32557ca27ba5161ff0 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 7 Jul 2024 17:20:19 +0300 Subject: [PATCH 15/25] refactor global reduce to properly work with required global scan changes --- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 17 +-- .../nbl/builtin/hlsl/scan/scan_scheduler.hlsl | 69 ++++++++++ include/nbl/builtin/hlsl/scan/scheduler.hlsl | 123 ++++++++++-------- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 98 ++++++++++---- include/nbl/video/utilities/CArithmeticOps.h | 36 ++--- include/nbl/video/utilities/CReduce.h | 8 +- include/nbl/video/utilities/CScanner.h | 4 +- src/nbl/builtin/CMakeLists.txt | 8 ++ src/nbl/video/utilities/CReduce.cpp | 18 +-- src/nbl/video/utilities/CScanner.cpp | 4 +- 10 files changed, 259 insertions(+), 126 deletions(-) create mode 100644 include/nbl/builtin/hlsl/scan/scan_scheduler.hlsl diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index f59b5cb53f..93d383a7dd 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -17,11 +17,12 @@ namespace hlsl namespace scan { -template +template struct Scratch { + uint32_t reduceResult; uint32_t workgroupsStarted[NBL_BUILTIN_MAX_LEVELS]; - uint32_t data[scratchElementCount]; + uint32_t data[dataElementCount]; }; [[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) 
@@ -43,7 +44,7 @@ void getData( if (notFirstOrLastLevel) offset += params.temporaryStorageOffset[pseudoLevel-1u]; - if (pseudoLevel!=treeLevel) // downsweep + if (pseudoLevel!=treeLevel) // downsweep/scan { const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; if (bool(levelWorkgroupIndex) && firstInvocationInGroup) @@ -87,19 +88,19 @@ void setData( ) { const Parameters_t params = getParameters(); - if (treeLevel struct ScanScheduler +{ + + static ScanScheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + ScanScheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } + + template bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() + { + + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if (scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + + if (workgroup::SubgroupContiguousIndex() == 0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level], 1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); + } + + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) { return uberWorkgroupIndex; } + + uint32_t totalWorkgroupsPerLevel(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0u + ? schedParams.cumulativeWorkgroupCount[level] + : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level - 1u]; + } + + Parameters_t params; + DefaultSchedulerParameters_t schedParams; + uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex + uint32_t level; // 32 bit to stop warnings from level = workgroup::BroadcastFirst +}; +} +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl index d7a94e1e78..99effbd88a 100644 --- a/include/nbl/builtin/hlsl/scan/scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -2,8 +2,8 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ -#define _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ +#ifndef _NBL_HLSL_SCHEDULER_INCLUDED_ +#define _NBL_HLSL_SCHEDULER_INCLUDED_ #include "nbl/builtin/hlsl/scan/declarations.hlsl" #include "nbl/builtin/hlsl/scan/descriptors.hlsl" @@ -20,75 +20,88 @@ namespace scan template struct Scheduler { - - static Scheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) - { - Scheduler scheduler; - scheduler.params = params; - scheduler.schedParams = schedParams; - return scheduler; - } + static Scheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + Scheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } - template - bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() - { - - // This part ensures that each workgroup that was not chosen to run the next level - // will exit once all the work for the current level is finished - glsl::memoryBarrierBuffer(); - if(scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) - return false; - - if (workgroup::SubgroupContiguousIndex()==0u) + template + bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() { - uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level],1u); + + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if(scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + + if (workgroup::SubgroupContiguousIndex()==0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level],1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); } - uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); - // nothing really happens, we just check if there's a workgroup in the level to grab - return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); - } - template - bool markDone(NBL_REF_ARG(Accessor) accessor) - { - if (level==(params.topLevel)) // check if we reached the lastLevel + template + bool markDone(NBL_REF_ARG(Accessor) accessor) { - // not even sure this is needed - uberWorkgroupIndex = ~0u; - return true; + if (level==(params.topLevel)) // check if we reached the lastLevel + { + // not even sure this is needed + uberWorkgroupIndex = ~0u; + return true; + } + + uint32_t prevLevel = level; + if (workgroup::SubgroupContiguousIndex()==0u) + { + // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset + const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; + const uint32_t lastWorkgroupSetIndexForLevel = totalWorkgroupsPerLevel(level) / WorkgroupSize; + const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u); + //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? 
(WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + if (((uberWorkgroupIndex/WorkgroupSize) < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + { + level++; + } + } + level = workgroup::BroadcastFirst(level, accessor); + return level == 0 ? false : level == prevLevel; // on level 0 never exit early but on higher levels each workgroup is allowed one operation and exits except if promoted to next level } - uint32_t prevLevel = level; - if (workgroup::SubgroupContiguousIndex()==0u) + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) { - // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset - const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; - const uint32_t lastWorkgroupSetIndexForLevel = totalWorkgroupsPerLevel(level) / WorkgroupSize; - const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u); - //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) - if (((uberWorkgroupIndex/WorkgroupSize) < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) - { - level++; - } + return uberWorkgroupIndex; } - level = workgroup::BroadcastFirst(level, accessor); - return level == 0 ? false : level == prevLevel; // on level 0 never exit early but on higher levels each workgroup is allowed one operation and exits except if promoted to next level - } - uint32_t levelWorkgroupIndex(const in uint32_t level) - { - return uberWorkgroupIndex; - } - - uint32_t totalWorkgroupsPerLevel(const in uint32_t level) + uint32_t totalWorkgroupsPerLevel(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0 + ? schedParams.cumulativeWorkgroupCount[level] + : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u]; + } + + void resetWorkgroupsStartedBuffer() { - return level == 0 ? 
schedParams.cumulativeWorkgroupCount[level] : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u]; + // could do scanScratchBuf[0u].workgroupsStarted[SubgroupContiguousIndex()] = 0u but don't know how many invocations are live during this call + if(workgroup::SubgroupContiguousIndex() == 0u) + { + for(uint32_t i = 0; i < params.topLevel; i++) + { + scanScratchBuf[0u].workgroupsStarted[i] = 0u; + } + } } Parameters_t params; DefaultSchedulerParameters_t schedParams; uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex - uint16_t level; + uint32_t level; // 32 bit to stop warnings from level = workgroup::BroadcastFirst }; } diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 9319ca1306..3bebbd7305 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -6,23 +6,37 @@ #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined #include "nbl/builtin/hlsl/scan/declarations.hlsl" #include "nbl/builtin/hlsl/scan/scheduler.hlsl" +#include "nbl/builtin/hlsl/scan/scan_scheduler.hlsl" namespace nbl { namespace hlsl { namespace scan -{ +{ + /** + * Finds the index on a higher level in the reduce tree which the + * levelInvocationIndex needs to add to level 0 + */ + template + uint32_t2 findHigherLevelIndex(uint32_t level, uint32_t levelInvocationIndex) + { + const uint32_t WgSzLog2 = firstbithigh(WorkgroupSize); + uint32_t rightIndexNonInclusive = (levelInvocationIndex >> (WgSzLog2 * level)); // formula for finding the index. If it underflows it means that there's nothing to do for the level + uint32_t leftIndexInclusive = (rightIndexNonInclusive >> WgSzLog2) << WgSzLog2; + uint32_t add = (levelInvocationIndex & (WgSzLog2 * level - 1)) > 0 ? 1u : 0u; // if the index mod WorkgroupSize*level > 0 then we must add 1u + return uint32_t2(leftIndexInclusive, rightIndexNonInclusive); // return both the index and the potential underflow information to know we must do nothing + } + template void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); const uint32_t levelInvocationIndex = levelWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); - const uint32_t lastLevel = params.topLevel << 1u; // pseudoLevel is the level index going up until toplevel then back down - const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + const uint32_t pseudoLevel = isScan ? (params.topLevel-treeLevel) : treeLevel; + const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; // the lastElement array contains the lastElement's index hence the '<=' // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? @@ -38,24 +52,38 @@ namespace scan getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel); } - bool doReduce = isScan ? 
treeLevel < params.topLevel : treeLevel <= params.topLevel; - + bool doReduce = !isScan; if(doReduce) { data = workgroup::reduction::template __call(data,accessor); } - else if (!isExclusive && params.topLevel == 0u) - { - data = workgroup::inclusive_scan::template __call(data,accessor); - } - else if (treeLevel != params.topLevel) + else { - data = workgroup::inclusive_scan::template __call(data,accessor); - } - else - { - data = workgroup::exclusive_scan::template __call(data,accessor); + if (isExclusive) + { + data = workgroup::exclusive_scan::template __call(data,accessor); + } + else + { + data = workgroup::inclusive_scan::template __call(data,accessor); + } + + uint32_t reverseLevel = params.topLevel - treeLevel; + while(reverseLevel != ~0u) + { + Storage_t tempData = Binop::identity; + uint32_t2 idxRange = findHigherLevelIndex(reverseLevel--, levelInvocationIndex); + if(workgroup::SubgroupContiguousIndex() < idxRange.y - idxRange.x) + { + tempData = scanScratchBuf[0].data[params.temporaryStorageOffset[reverseLevel] + idxRange.x + workgroup::SubgroupContiguousIndex()]; + } + + // we could potentially do an inclusive scan of this part and cache it for other WGs + //tempData = workgroup::inclusive_scan::template __call(tempData,accessor); + data += workgroup::reduction::template __call(tempData,accessor); + } } + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); } @@ -65,21 +93,35 @@ namespace scan { const Parameters_t params = getParameters(); const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - Scheduler scheduler = Scheduler::create(params, schedulerParams); - // persistent workgroups - while (true) + + if(!isScan) { - if (!scheduler.getWork(accessor)) + Scheduler scheduler = Scheduler::create(params, schedulerParams); + // persistent workgroups + + while (true) { - return; - } + if (!scheduler.getWork(accessor)) + { + return; + } - virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); - accessor.workgroupExecutionAndMemoryBarrier(); - if(scheduler.markDone(accessor)) - { - return; - } + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + if(scheduler.markDone(accessor)) + { + return; + } + } + } + else + { + ScanScheduler scheduler = ScanScheduler::create(params, schedulerParams); + while(scheduler.getWork(accessor)) + { + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + } } } } diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index b066cbff15..492e8fe641 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -65,11 +65,17 @@ class CArithmeticOps : public core::IReferenceCounted // given already computed tables of lastElement indice s per level, number of levels, and storage offsets, tell us total auxillary buffer size needed inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) { - uint32_t uint_count = MaxLevels; // workgroup enumerator + uint32_t uint_count = 1u; // reduceResult field + uint_count += MaxLevels; // workgroup enumerator uint_count += temporaryStorageOffset[MaxLevels/2u-1u]; // last scratch offset - uint_count += lastElement[topLevel]+1u; // starting from the last storage offset, we also need slots equal to the top level's elementCount (add 1u because 0u 
based) + uint_count += lastElement[topLevel]+1u; // starting from the last temporary storage offset, we also need slots equal to the top level's elementCount (add 1u because 0u based) return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); } + + inline uint32_t getWorkgroupEnumeratorSize() + { + return MaxLevels * sizeof(uint32_t); + } }; // the default scheduler we provide works as described above in the big documentation block @@ -178,26 +184,22 @@ class CArithmeticOps : public core::IReferenceCounted } static inline void dispatchHelper( - IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* pipeline_layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, - const uint32_t srcBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* srcBufferBarriers, - const uint32_t dstBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* dstBufferBarriers + IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, + std::span bufferBarriers ) { - cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); - if (srcBufferBarrierCount) + cmdbuf->pushConstants(layout, asset::IShader::ESS_COMPUTE, 0u, sizeof(DefaultPushConstants), &pushConstants); + if (bufferBarriers.size() > 0) { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { srcBufferBarriers, 1} }; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { dstBufferBarriers, 1} }; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + for (uint32_t i = 0; i < bufferBarriers.size(); i++) + { + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, bufferBarriers[i]); + } } + cmdbuf->dispatch(dispatchInfo.wg_count, 1u, 1u); } - inline core::smart_refctd_ptr createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const + inline core::smart_refctd_ptr createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchElCount) const { auto system = m_device->getPhysicalDevice()->getSystem(); core::smart_refctd_ptr hlsl; @@ -266,7 +268,7 @@ class CArithmeticOps : public core::IReferenceCounted break; } - return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", m_workgroupSize, storageType, binop, scratchSz); + return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_EL_CNT %d\n", m_workgroupSize, storageType, binop, scratchElCount); } inline ILogicalDevice* getDevice() const {return m_device.get();} diff --git a/include/nbl/video/utilities/CReduce.h b/include/nbl/video/utilities/CReduce.h index 2b4e1a6374..0a3d896f2c 100644 --- a/include/nbl/video/utilities/CReduce.h +++ b/include/nbl/video/utilities/CReduce.h @@ -16,10 +16,10 @@ namespace nbl::video public: CReduce(core::smart_refctd_ptr&& device) : CReduce(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} CReduce(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : 
CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} - asset::ICPUShader* getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); - IGPUShader* getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); - IGPUComputePipeline* getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); - core::smart_refctd_ptr createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) const; + asset::ICPUShader* getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + IGPUShader* getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + IGPUComputePipeline* getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + core::smart_refctd_ptr createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) const; protected: ~CReduce() diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index bef690f50c..a6b2a6ca11 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -166,15 +166,13 @@ class CScanner final : public CArithmeticOps asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); IGPUComputePipeline* getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); - core::smart_refctd_ptr createShader(const char* shaderFile, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; + core::smart_refctd_ptr createShader(/*const bool indirect, */const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; protected: ~CScanner() { // all drop themselves automatically } - core::smart_refctd_ptr createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; - core::smart_refctd_ptr m_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr m_pipelines[EST_COUNT][EDT_COUNT][EO_COUNT]; diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index e3772b724b..5183f21242 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -314,6 +314,14 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/ballot.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") +#global scan +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/declarations.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/default_scheduler.hlsl") 
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/descriptors.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/direct.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scan_scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/virtual_workgroup.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") diff --git a/src/nbl/video/utilities/CReduce.cpp b/src/nbl/video/utilities/CReduce.cpp index bc9f745207..7a56839636 100644 --- a/src/nbl/video/utilities/CReduce.cpp +++ b/src/nbl/video/utilities/CReduce.cpp @@ -7,19 +7,19 @@ namespace nbl::video { -asset::ICPUShader* CReduce::getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +asset::ICPUShader* CReduce::getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) { if (!m_shaders[dataType][op]) - m_shaders[dataType][op] = createShader(dataType,op,scratchSz); + m_shaders[dataType][op] = createShader(dataType,op,scratchElCount); return m_shaders[dataType][op].get(); } -IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) { if (!m_specialized_shaders[dataType][op]) { - auto cpuShader = core::smart_refctd_ptr(getDefaultShader(dataType,op,scratchSz)); - cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/reduce.hlsl"); + auto cpuShader = core::smart_refctd_ptr(getDefaultShader(dataType,op,scratchElCount)); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); auto gpushader = m_device->createShader(cpuShader.get()); @@ -29,7 +29,7 @@ IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TY return m_specialized_shaders[dataType][op].get(); } -IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) { // ondemand if (!m_pipelines[dataType][op]) { @@ -37,7 +37,7 @@ IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TY params.layout = m_pipeline_layout.get(); // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") params.shader.entryPoint = "main"; - params.shader.shader = getDefaultSpecializedShader(dataType,op,scratchSz); + params.shader.shader = getDefaultSpecializedShader(dataType,op,scratchElCount); m_device->createComputePipelines( nullptr, { ¶ms,1 }, @@ -47,9 +47,9 @@ IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TY return m_pipelines[dataType][op].get(); } -core::smart_refctd_ptr CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const 
uint32_t scratchSz) const +core::smart_refctd_ptr CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) const { - core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); + core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchElCount); return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", "false", "false"); } diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index 3c6c4088a3..5d0e9dc039 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -6,7 +6,7 @@ namespace nbl::video asset::ICPUShader* CScanner::getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { if (!m_shaders[scanType][dataType][op]) - m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz); + m_shaders[scanType][dataType][op] = createShader(scanType,dataType,op,scratchSz); return m_shaders[scanType][dataType][op].get(); } @@ -43,7 +43,7 @@ IGPUComputePipeline* CScanner::getDefaultPipeline(const E_SCAN_TYPE scanType, co return m_pipelines[scanType][dataType][op].get(); } -core::smart_refctd_ptr CScanner::createShader(const char* shaderFile, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const +core::smart_refctd_ptr CScanner::createShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const { core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", scanType == EST_EXCLUSIVE ? 
"true" : "false", "true"); From dd2cc097d47d32bc94bc9279aa0477f683842674 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 7 Jul 2024 18:01:34 +0300 Subject: [PATCH 16/25] Remove unused pseudoLevel parameter --- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 66 +++++++++---------- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 8 +-- 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 93d383a7dd..2e2896cf37 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -33,48 +33,47 @@ void getData( NBL_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, - NBL_CONST_REF_ARG(uint32_t) treeLevel, - NBL_CONST_REF_ARG(uint32_t) pseudoLevel + NBL_CONST_REF_ARG(uint32_t) treeLevel ) { const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders uint32_t offset = levelInvocationIndex; - const bool notFirstOrLastLevel = bool(pseudoLevel); + const bool notFirstOrLastLevel = bool(treeLevel); if (notFirstOrLastLevel) - offset += params.temporaryStorageOffset[pseudoLevel-1u]; + offset += params.temporaryStorageOffset[treeLevel-1u]; - if (pseudoLevel!=treeLevel) // downsweep/scan - { - const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; - if (bool(levelWorkgroupIndex) && firstInvocationInGroup) - data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; - - if (notFirstOrLastLevel) - { - if (!firstInvocationInGroup) - data = scanScratchBuf[0].data[offset-1u]; - } - else - { - if(isExclusive) - { - if (!firstInvocationInGroup) - data += scanBuffer[offset-1u]; - } - else - { - data += scanBuffer[offset]; - } - } - } - else - { + //if (pseudoLevel!=treeLevel) // downsweep/scan + //{ + // const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; + // if (bool(levelWorkgroupIndex) && firstInvocationInGroup) + // data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[treeLevel]]; + // + // if (notFirstOrLastLevel) + // { + // if (!firstInvocationInGroup) + // data = scanScratchBuf[0].data[offset-1u]; + // } + // else + // { + // if(isExclusive) + // { + // if (!firstInvocationInGroup) + // data += scanBuffer[offset-1u]; + // } + // else + // { + // data += scanBuffer[offset]; + // } + // } + //} + //else + //{ if (notFirstOrLastLevel) data = scanScratchBuf[0].data[offset]; else data = scanBuffer[offset]; - } + //} } template @@ -83,7 +82,6 @@ void setData( NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_CONST_REF_ARG(uint32_t) treeLevel, - NBL_CONST_REF_ARG(uint32_t) pseudoLevel, NBL_CONST_REF_ARG(bool) inRange ) { @@ -101,9 +99,9 @@ void setData( scanScratchBuf[0u].reduceResult = data; } // The following only for isScan == true - else if (bool(pseudoLevel)) + else if (bool(treeLevel)) { - const uint32_t offset = params.temporaryStorageOffset[pseudoLevel-1u]; + const uint32_t offset = params.temporaryStorageOffset[treeLevel-1u]; scanScratchBuf[0].data[levelInvocationIndex+offset] = data; } else diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 3bebbd7305..f9bd38a9e3 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl 
@@ -34,9 +34,7 @@ namespace scan const Parameters_t params = getParameters(); const uint32_t levelInvocationIndex = levelWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); - // pseudoLevel is the level index going up until toplevel then back down - const uint32_t pseudoLevel = isScan ? (params.topLevel-treeLevel) : treeLevel; - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; // the lastElement array contains the lastElement's index hence the '<=' + const bool inRange = levelInvocationIndex <= params.lastElement[treeLevel]; // the lastElement array contains the lastElement's index hence the '<=' // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? @@ -49,7 +47,7 @@ namespace scan Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { - getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel); + getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel); } bool doReduce = !isScan; @@ -84,7 +82,7 @@ namespace scan } } - setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, inRange); } DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components From ce78fc09bf3569c2e794c5a68f7dcf113f27ce6c Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 7 Jul 2024 19:06:14 +0300 Subject: [PATCH 17/25] Fix workgroupFinishFlagsOffset[0] being twice the needed size --- include/nbl/video/utilities/CArithmeticOps.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index 492e8fe641..749faa204d 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -100,7 +100,7 @@ class CArithmeticOps : public core::IReferenceCounted std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); cumulativeWorkgroupCount[topLevel] = 1u; // the top level will always end up with 1 workgroup to do the final reduction for (auto i = 0u; i <= topLevel; i++) { - workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; // RECHECK: findMSB(511) == 8u !! Here we assume it's 9u !! + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize)) + 1; // RECHECK: findMSB(511) == 8u !! Here we assume it's 9u !! 
lastWorkgroupSetCountForLevel[i] = (cumulativeWorkgroupCount[i] - 1u) & (workgroupSize - 1u); } From 0d71686e221bec826741f663c0bbbaf7b10ecd84 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 21 Jul 2024 14:47:39 +0300 Subject: [PATCH 18/25] Revert .gitmodules pointing to examples fork --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index f4bc04a799..ea533a6736 100644 --- a/.gitmodules +++ b/.gitmodules @@ -69,7 +69,7 @@ branch = main [submodule "examples_tests"] path = examples_tests - url = git@github.com:kpentaris/Nabla-Examples-and-Tests.git + url = git@github.com:Devsh-Graphics-Programming/Nabla-Examples-and-Tests.git branch = master [submodule "3rdparty/dxc/dxc"] path = 3rdparty/dxc/dxc From 7f4fcd527423ac34ec5e18e9c09a514ef0ed18ba Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Tue, 25 Feb 2025 20:59:51 +0200 Subject: [PATCH 19/25] merge upstream Nable to local --- 3rdparty/Vulkan-Headers | 2 +- 3rdparty/dxc/dxc | 2 +- 3rdparty/glslang | 2 +- 3rdparty/imgui | 2 +- 3rdparty/imguizmo | 2 +- 3rdparty/openexr | 2 +- 3rdparty/shaderc | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers index 234c4b7370..2c823b7f27 160000 --- a/3rdparty/Vulkan-Headers +++ b/3rdparty/Vulkan-Headers @@ -1 +1 @@ -Subproject commit 234c4b7370a8ea3239a214c9e871e4b17c89f4ab +Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 8a281b7e34..a08b6cbeb1 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 8a281b7e34513b3c142e895b96c522810d585ad9 +Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb diff --git a/3rdparty/glslang b/3rdparty/glslang index ff26c2e995..f397c9b6e9 160000 --- a/3rdparty/glslang +++ b/3rdparty/glslang @@ -1 +1 @@ -Subproject commit ff26c2e995ae521cb9fbc902fb4d686a47e4eb53 +Subproject commit f397c9b6e90bc53aa2e9feaef1a9cdf20ca43298 diff --git a/3rdparty/imgui b/3rdparty/imgui index a29e9dba30..e489e40a85 160000 --- a/3rdparty/imgui +++ b/3rdparty/imgui @@ -1 +1 @@ -Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175 +Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e diff --git a/3rdparty/imguizmo b/3rdparty/imguizmo index b10e91756d..6f4b2197ef 160000 --- a/3rdparty/imguizmo +++ b/3rdparty/imguizmo @@ -1 +1 @@ -Subproject commit b10e91756d32395f5c1fefd417899b657ed7cb88 +Subproject commit 6f4b2197efd715d16b19775b00f36c6c6f5aacb6 diff --git a/3rdparty/openexr b/3rdparty/openexr index c8a74d9ac9..fca936a964 160000 --- a/3rdparty/openexr +++ b/3rdparty/openexr @@ -1 +1 @@ -Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd +Subproject commit fca936a964da5983daecdbed7cd249934701b41a diff --git a/3rdparty/shaderc b/3rdparty/shaderc index d2564ba598..e72186b66b 160000 --- a/3rdparty/shaderc +++ b/3rdparty/shaderc @@ -1 +1 @@ -Subproject commit d2564ba5989c9de1a76714b3e59ec60595e9be50 +Subproject commit e72186b66bb90ed06aaf15cbdc9a053581a0616b From 201636d50412e4dc685bec18f0f5fe34ab0b9ec2 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sat, 15 Mar 2025 20:48:22 +0200 Subject: [PATCH 20/25] update 3rdparty to point to master submodules --- 3rdparty/Vulkan-Headers | 2 +- 3rdparty/dxc/dxc | 2 +- 3rdparty/glslang | 2 +- 3rdparty/imgui | 2 +- 3rdparty/imguizmo | 2 +- 3rdparty/openexr | 2 +- 3rdparty/shaderc | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers 
index 2c823b7f27..234c4b7370 160000 --- a/3rdparty/Vulkan-Headers +++ b/3rdparty/Vulkan-Headers @@ -1 +1 @@ -Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb +Subproject commit 234c4b7370a8ea3239a214c9e871e4b17c89f4ab diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index a08b6cbeb1..b2e75826b7 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb +Subproject commit b2e75826b70d85d03686dd8a755ef477b4fa3807 diff --git a/3rdparty/glslang b/3rdparty/glslang index f397c9b6e9..ff26c2e995 160000 --- a/3rdparty/glslang +++ b/3rdparty/glslang @@ -1 +1 @@ -Subproject commit f397c9b6e90bc53aa2e9feaef1a9cdf20ca43298 +Subproject commit ff26c2e995ae521cb9fbc902fb4d686a47e4eb53 diff --git a/3rdparty/imgui b/3rdparty/imgui index e489e40a85..a29e9dba30 160000 --- a/3rdparty/imgui +++ b/3rdparty/imgui @@ -1 +1 @@ -Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e +Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175 diff --git a/3rdparty/imguizmo b/3rdparty/imguizmo index 6f4b2197ef..b10e91756d 160000 --- a/3rdparty/imguizmo +++ b/3rdparty/imguizmo @@ -1 +1 @@ -Subproject commit 6f4b2197efd715d16b19775b00f36c6c6f5aacb6 +Subproject commit b10e91756d32395f5c1fefd417899b657ed7cb88 diff --git a/3rdparty/openexr b/3rdparty/openexr index fca936a964..c8a74d9ac9 160000 --- a/3rdparty/openexr +++ b/3rdparty/openexr @@ -1 +1 @@ -Subproject commit fca936a964da5983daecdbed7cd249934701b41a +Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd diff --git a/3rdparty/shaderc b/3rdparty/shaderc index e72186b66b..d2564ba598 160000 --- a/3rdparty/shaderc +++ b/3rdparty/shaderc @@ -1 +1 @@ -Subproject commit e72186b66bb90ed06aaf15cbdc9a053581a0616b +Subproject commit d2564ba5989c9de1a76714b3e59ec60595e9be50 From 21bfa0feb3dbed4b4ca2745f3d067363d10f5151 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 16 Mar 2025 13:02:18 +0200 Subject: [PATCH 21/25] update the hlsl/scan module paths to CMakeLists --- src/nbl/builtin/CMakeLists.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index d3dccd1ffb..1bafe6a332 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -330,21 +330,21 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/surface_transform.h") #workgroup LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/arithmetic.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/ballot.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") -#global scan -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/declarations.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/default_scheduler.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/descriptors.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/direct.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scan_scheduler.hlsl") 
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scheduler.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/virtual_workgroup.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") +#global scan +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/declarations.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/default_scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/descriptors.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/direct.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/scan_scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/virtual_workgroup.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") From 0a3728ccc3525e9552b7072b42dc2ad03e4ebc67 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 16 Mar 2025 13:50:44 +0200 Subject: [PATCH 22/25] Update carithmeticops with proper enum paths --- include/nbl/video/utilities/CArithmeticOps.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index 749faa204d..c5d54234af 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -142,10 +142,12 @@ class CArithmeticOps : public core::IReferenceCounted CArithmeticOps(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) { assert(core::isPoT(m_workgroupSize)); - const asset::SPushConstantRange pc_range[] = {asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; + const asset::SPushConstantRange pc_range[] = {asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; const IGPUDescriptorSetLayout::SBinding bindings[2] = { - { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer - { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch + {.binding = 0u, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = video::IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1u, .immutableSamplers = nullptr }, + { .binding = 1u, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = video::IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1u, .immutableSamplers = nullptr } // scratch }; m_ds_layout = m_device->createDescriptorSetLayout(bindings); @@ -188,7 +190,7 @@ class CArithmeticOps : public core::IReferenceCounted std::span bufferBarriers ) { - cmdbuf->pushConstants(layout, asset::IShader::ESS_COMPUTE, 0u, sizeof(DefaultPushConstants), &pushConstants); + cmdbuf->pushConstants(layout, asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(DefaultPushConstants), &pushConstants); if (bufferBarriers.size() > 0) { for (uint32_t i = 0; i < bufferBarriers.size(); i++) @@ -219,7 +221,7 @@ class CArithmeticOps : public 
core::IReferenceCounted auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); - auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); // REVIEW: Probably combine the commons parts of CReduce and CScanner const char* storageType = nullptr; From 1c32528240d96a3d04d53a9f061d2f6416333f8f Mon Sep 17 00:00:00 2001 From: pentakon Date: Mon, 17 Mar 2025 15:06:30 +0200 Subject: [PATCH 23/25] merge submodules changes to branch --- 3rdparty/dxc/dxc | 2 +- examples_tests | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index b2e75826b7..8a281b7e34 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit b2e75826b70d85d03686dd8a755ef477b4fa3807 +Subproject commit 8a281b7e34513b3c142e895b96c522810d585ad9 diff --git a/examples_tests b/examples_tests index fb30e097e4..73b4a8ead9 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit fb30e097e4ad32217f8588df2a81a266acf4a045 +Subproject commit 73b4a8ead98daed4a731cb3ae79ec84396d0e53c From 82bd9a1a7f74a88b57c5ecf369ea325b697dfdfc Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sat, 19 Apr 2025 17:49:15 +0300 Subject: [PATCH 24/25] Fix issues with ICPUBuffer changes --- include/nbl/video/utilities/CArithmeticOps.h | 3 +-- src/nbl/video/utilities/CReduce.cpp | 2 +- src/nbl/video/utilities/CScanner.cpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index c5d54234af..ebeae08181 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -218,8 +218,7 @@ class CArithmeticOps : public core::IReferenceCounted // hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect_reduce.hlsl"); ?? 
hlsl = loadBuiltinData(shaderFile); } - - auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); + auto buffer = asset::ICPUBuffer::create({ {.size = hlsl->getSize()} }); memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); diff --git a/src/nbl/video/utilities/CReduce.cpp b/src/nbl/video/utilities/CReduce.cpp index 7a56839636..16b5cc5e29 100644 --- a/src/nbl/video/utilities/CReduce.cpp +++ b/src/nbl/video/utilities/CReduce.cpp @@ -20,7 +20,7 @@ IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TY { auto cpuShader = core::smart_refctd_ptr(getDefaultShader(dataType,op,scratchElCount)); cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); - cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); + cpuShader->setShaderStage(asset::IShader::E_SHADER_STAGE::ESS_COMPUTE); auto gpushader = m_device->createShader(cpuShader.get()); diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index 5d0e9dc039..3cdcc82ee7 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -16,7 +16,7 @@ IGPUShader* CScanner::getDefaultSpecializedShader(const E_SCAN_TYPE scanType, co { auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op,scratchSz)); cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); - cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); + cpuShader->setShaderStage(asset::IShader::E_SHADER_STAGE::ESS_COMPUTE); auto gpushader = m_device->createShader(cpuShader.get()); From 6e740b3d5e3fefd262ceb042b8cad9c49a4fce71 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Tue, 22 Apr 2025 13:52:05 +0300 Subject: [PATCH 25/25] Fix bad shared-mem accessor pattern in direct.hlsl and properly export symbols for CReduce and CScanner --- include/nbl/builtin/hlsl/scan/direct.hlsl | 4 ++-- include/nbl/video/utilities/CReduce.h | 2 +- include/nbl/video/utilities/CScanner.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl index e7bbcf7233..3075816521 100644 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ b/include/nbl/builtin/hlsl/scan/direct.hlsl @@ -20,9 +20,9 @@ groupshared uint32_t wgScratch[SharedScratchSz]; template struct WGScratchProxy { - uint32_t get(const uint32_t ix) + void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) { - return wgScratch[ix+offset]; + value = wgScratch[ix+offset]; } void set(const uint32_t ix, const uint32_t value) { diff --git a/include/nbl/video/utilities/CReduce.h b/include/nbl/video/utilities/CReduce.h index 0a3d896f2c..4a311aa738 100644 --- a/include/nbl/video/utilities/CReduce.h +++ b/include/nbl/video/utilities/CReduce.h @@ -10,7 +10,7 @@ namespace nbl::video { - class CReduce final : public CArithmeticOps + class NBL_API2 CReduce final : public CArithmeticOps { public: diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index a6b2a6ca11..d705113f1a 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -142,7 +142,7 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about Console devs get to bring a gun to a knife fight... 
**/ -class CScanner final : public CArithmeticOps +class NBL_API2 CScanner final : public CArithmeticOps { public: enum E_SCAN_TYPE : uint8_t