From d61a2aa623630f2bc697a32d29c217e908ea8f21 Mon Sep 17 00:00:00 2001 From: PentaKon Date: Wed, 31 Jan 2024 00:29:00 +0200 Subject: [PATCH 01/25] First commit for global scan implementation. Ported most of the code but not working yet --- .../nbl/builtin/hlsl/glsl_compat/core.hlsl | 5 + .../nbl/builtin/hlsl/scan/declarations.hlsl | 75 ++++++----- .../builtin/hlsl/scan/default_scheduler.hlsl | 106 +++++++--------- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 116 +++++++++++++++++- include/nbl/builtin/hlsl/scan/direct.hlsl | 53 ++++---- include/nbl/builtin/hlsl/scan/indirect.hlsl | 34 ++--- .../builtin/hlsl/scan/parameters_struct.hlsl | 30 ----- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 42 +++---- 8 files changed, 256 insertions(+), 205 deletions(-) delete mode 100644 include/nbl/builtin/hlsl/scan/parameters_struct.hlsl diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 3cb34d6f16..b08073beae 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -83,6 +83,11 @@ void tess_ctrl_barrier() { void memoryBarrierShared() { spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask); } + +void memoryBarrierBuffer() { + spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsUniformMemoryMask); +} + #endif } diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 2d2e66e66d..ec3e558dfc 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -1,66 +1,63 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ #define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ // REVIEW: Not sure if this file is needed in HLSL implementation -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#ifndef NBL_BUILTIN_MAX_SCAN_LEVELS +#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 +#endif -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ namespace nbl { namespace hlsl { namespace scan { + // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. 
(https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) + struct Parameters_t { + uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; + uint32_t topLevel; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; + } + Parameters_t getParameters(); -} -} -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ -#endif -#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ + struct DefaultSchedulerParameters_t + { + uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; + uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; + + }; + + DefaultSchedulerParameters_t getSchedulerParameters(); + template void getData( - inout Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel ); -} -} -} -#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -#endif -#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ template void setData( - in Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel, - in bool inRange + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange ); + } } } -#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -#endif #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 450368475d..67c62aadae 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -1,31 +1,13 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ #define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - struct DefaultSchedulerParameters_t - { - uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; - uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - - }; -} -} -} - -#ifdef __cplusplus -#undef uint -#else +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" namespace nbl { @@ -33,6 +15,8 @@ namespace hlsl { namespace scan { + +#ifdef HLSL namespace scheduler { /** @@ -58,17 +42,17 @@ namespace scheduler * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps * synchronizing between workgroups with while-loop spinning. 
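	 *
	 * A rough worked example (illustrative only, assuming a workgroup size of 512 so that workgroupSizeLog2 = 9) for elementCount = 300000:
	 *   lastElement              = { 299999, 585, 1 }            (each level is the previous one shifted right by workgroupSizeLog2)
	 *   topLevel                 = firstbithigh(299999)/9 = 18/9 = 2
	 *   workgroups per level     = 586, 2, 1                     (WorkgroupCount(level) = lastElement[level+1]+1, with a single workgroup at the top level)
	 *   cumulativeWorkgroupCount = { 586, 588, 589, 591, 1177 }  (running total over the upsweep, top and mirrored downsweep levels, assuming the downsweep repeats 2 and 586)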
*/ - void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) + void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) { #define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) + const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x); _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime? + _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; - for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_; + const uint32_t next = i+1; + _scanParams.lastElement[next] = _scanParams.lastElement[i]>>workgroupSizeLog2; i = next; } _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); @@ -130,34 +114,31 @@ namespace scheduler * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ - template - bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex) + template + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) { - ScratchAccessor sharedScratch; if(SubgroupContiguousIndex() == 0u) { - uint64_t original; - InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared - sharedScratch.set(SubgroupContiguousIndex(), original); + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + sharedScratch.set(0u, originalWGs); } else if (SubgroupContiguousIndex() == 1u) { - sharedScratch.set(SubgroupContiguousIndex(), 0u); + sharedScratch.set(1u, 0u); } - GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology? - - const uint globalWorkgroupIndex; // does every thread need to know? - sharedScratch.get(0u, globalWorkgroupIndex); - const uint lastLevel = topLevel<<1u; + sharedScratch.workgroupExecutionAndMemoryBarrier(); + + const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? + const uint32_t lastLevel = topLevel<<1u; if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) { - InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ? + sharedScratch.atomicAdd(1u, 1u); } - GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor? 
+ sharedScratch.workgroupExecutionAndMemoryBarrier(); - sharedScratch.get(1u, treeLevel); + treeLevel = sharedScratch.get(1u); if(treeLevel>lastLevel) - return true; + return true; localWorkgroupIndex = globalWorkgroupIndex; const bool dependentLevel = treeLevel != 0u; @@ -170,52 +151,55 @@ namespace scheduler uint dependentsCount = 1u; if(treeLevel <= topLevel) { - dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file? + dependentsCount = glsl::gl_WorkGroupSize().x; const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; if (lastWorkgroup) { - const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader + const Parameters_t scanParams = getParameters(); dependentsCount = scanParams.lastElement[treeLevel]+1u; if (treeLeveltopLevel) // !(prevLevel globallycoherent \ No newline at end of file +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +// coherent -> globallycoherent + +// (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +#ifndef SCRATCH_SIZE +#error "Must manually define SCRATCH_SIZE for now" +#endif + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scratch +{ + uint32_t workgroupsStarted; + uint32_t data[scratchElementCount]; +}; + +[[vk::binding(0 ,0)]] StructuredBuffer scanInputBuf; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model + +template +void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel +) +{ + const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders + + uint32_t offset = levelInvocationIndex; + const bool notFirstOrLastLevel = bool(pseudoLevel); + if (notFirstOrLastLevel) + offset += params.temporaryStorageOffset[pseudoLevel-1u]; + + if (pseudoLevel!=treeLevel) // downsweep + { + const bool firstInvocationInGroup = SubgroupContiguousIndex()==0u; + if (bool(localWorkgroupIndex) && firstInvocationInGroup) + data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; + + if (notFirstOrLastLevel) + { + if (!firstInvocationInGroup) + data = scanScratchBuf[0].data[offset-1u]; + } + else + { + if(isExclusive) + { + if (!firstInvocationInGroup) + data += scanInputBuf[offset-1u]; + } + else + { + data += scanInputBuf[offset]; + } + } + } + else + { + if (notFirstOrLastLevel) + data = scanScratchBuf[0].data[offset]; + else + data = scanInputBuf[offset]; + } +} + +template +void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange +) +{ + const Parameters_t params = getParameters(); + if (treeLevel - void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex) + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, 
NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); - const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1); - - const uint lastLevel = params.topLevel << 1u; - const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel]; + const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + SubgroupContiguousIndex(); + const bool lastInvocationInGroup = SubgroupContiguousIndex() == (gl_WorkGroupSize().x - 1); + const uint32_t lastLevel = params.topLevel << 1u; + const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; - Storage_t data = Binop::identity(); + Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); @@ -36,17 +34,19 @@ namespace scan if(treeLevel < params.topLevel) { - #error "Must also define some scratch accessor when calling operation()" - data = workgroup::reduction()(data); + data = workgroup::reduction::template __call(value,accessor); } - // REVIEW: missing _TYPE_ check and extra case here + else if (!isExclusive && params.topLevel == 0u) + { + data = workgroup::inclusive_scan::template __call(value,accessor); + } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan()(data); + data = workgroup::inclusive_scan::template __call(value,accessor); } else { - data = workgroup::exclusive_scan()(data); + data = workgroup::exclusive_scan::template __call(value,accessor); } setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); } @@ -54,8 +54,6 @@ namespace scan } } -#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored? -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" namespace nbl { namespace hlsl @@ -66,11 +64,13 @@ namespace scan void main() { const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint topLevel = getParameters().topLevel; + const uint32_t topLevel = getParameters().topLevel; // persistent workgroups while (true) { - uint treeLevel,localWorkgroupIndex; + // REVIEW: Need to create accessor here. + // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
+ uint32_t treeLevel,localWorkgroupIndex; if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) { return; @@ -84,9 +84,5 @@ namespace scan } } } -#endif - -#define _NBL_HLSL_SCAN_MAIN_DEFINED_ -#endif #endif \ No newline at end of file From 0eec0ee44b9661fd3317e4968a33b5d88fd372bb Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 12 Feb 2024 01:17:25 +0200 Subject: [PATCH 02/25] Global scan migration of code to new APIs --- .../builtin/hlsl/scan/default_scheduler.hlsl | 2 +- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 17 ++---- include/nbl/builtin/hlsl/scan/direct.hlsl | 49 ++++++++++++++- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 29 ++++----- include/nbl/video/utilities/CScanner.h | 34 +++++------ src/nbl/video/utilities/CScanner.cpp | 61 ++++++++++++++----- 6 files changed, 130 insertions(+), 62 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 67c62aadae..5ec112b065 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -115,7 +115,7 @@ namespace scheduler * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ template - bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) { if(SubgroupContiguousIndex() == 0u) { diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 0adee5dc32..169257e30a 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -10,11 +10,6 @@ // coherent -> globallycoherent -// (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... -#ifndef SCRATCH_SIZE -#error "Must manually define SCRATCH_SIZE for now" -#endif - namespace nbl { namespace hlsl @@ -22,14 +17,14 @@ namespace hlsl namespace scan { -template +template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... struct Scratch { uint32_t workgroupsStarted; uint32_t data[scratchElementCount]; }; -[[vk::binding(0 ,0)]] StructuredBuffer scanInputBuf; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) 
[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model template @@ -64,11 +59,11 @@ void getData( if(isExclusive) { if (!firstInvocationInGroup) - data += scanInputBuf[offset-1u]; + data += scanBuffer[offset-1u]; } else { - data += scanInputBuf[offset]; + data += scanBuffer[offset]; } } } @@ -77,7 +72,7 @@ void getData( if (notFirstOrLastLevel) data = scanScratchBuf[0].data[offset]; else - data = scanInputBuf[offset]; + data = scanBuffer[offset]; } } @@ -106,7 +101,7 @@ void setData( scanScratchBuf[0].data[levelInvocationIndex+offset] = data; } else - scanInputBuf[levelInvocationIndex] = data; + scanBuffer[levelInvocationIndex] = data; } } diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl index cb2e0ce078..76003d8e4a 100644 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ b/include/nbl/builtin/hlsl/scan/direct.hlsl @@ -1,9 +1,45 @@ // Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#pragma shader_stage(compute) + +#include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" #include "nbl/builtin/hlsl/scan/declarations.hlsl" +static const uint32_t SharedScratchSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; + +// TODO: Can we make it a static variable? +groupshared uint32_t wgScratch[SharedScratchSz]; + +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" + +template +struct WGScratchProxy +{ + uint32_t get(const uint32_t ix) + { + return wgScratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + wgScratch[ix+offset] = value; + } + + uint32_t atomicAdd(uint32_t ix, uint32_t val) + { + return glsl::atomicAdd(wgScratch[ix + offset], val); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; +static WGScratchProxy<0> accessor; + // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} @@ -16,6 +52,17 @@ struct ScanPushConstants [[vk::push_constant]] ScanPushConstants spc; +/** + * Required since we rely on SubgroupContiguousIndex instead of + * gl_LocalInvocationIndex which means to match the global index + * we can't use the gl_GlobalInvocationID but an index based on + * SubgroupContiguousIndex. 
+ */ +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + namespace nbl { namespace hlsl @@ -39,5 +86,5 @@ DefaultSchedulerParameters_t getSchedulerParameters() [numthreads(WORKGROUP_SIZE,1,1)] void main() { - nbl::hlsl::scan::main(); + nbl::hlsl::scan::main(accessor); } \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 3563f9546e..a8319f4366 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -15,7 +15,7 @@ namespace hlsl { namespace scan { - template + template void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); @@ -26,6 +26,9 @@ namespace scan const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel + // so that it exits after reaching the top? + Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { @@ -34,34 +37,26 @@ namespace scan if(treeLevel < params.topLevel) { - data = workgroup::reduction::template __call(value,accessor); + data = workgroup::reduction::template __call(data,accessor); } else if (!isExclusive && params.topLevel == 0u) { - data = workgroup::inclusive_scan::template __call(value,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan::template __call(value,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else { - data = workgroup::exclusive_scan::template __call(value,accessor); + data = workgroup::exclusive_scan::template __call(data,accessor); } setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); } -} -} -} -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - void main() + template + void main(NBL_REF_ARG(Accessor) accessor) { const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); const uint32_t topLevel = getParameters().topLevel; @@ -71,12 +66,12 @@ namespace scan // REVIEW: Need to create accessor here. // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
uint32_t treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) + if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) { return; } - virtualWorkgroup(treeLevel,localWorkgroupIndex); + virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); } diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 2ae67163eb..9f1f75b117 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -9,14 +9,12 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/IDescriptorSetCache.h" +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS & 0x1, "NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); namespace nbl::video { -#include "nbl/builtin/glsl/scan/parameters_struct.glsl" -#include "nbl/builtin/glsl/scan/default_scheduler.glsl" -static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS&0x1,"NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); - /** Utility class to help you perform the equivalent of `std::inclusive_scan` and `std::exclusive_scan` with data on the GPU. @@ -150,9 +148,9 @@ class CScanner final : public core::IReferenceCounted enum E_SCAN_TYPE : uint8_t { // computes output[n] = Sum_{i<=n}(input[i]) - EST_INCLUSIVE = _NBL_GLSL_SCAN_TYPE_INCLUSIVE_, + EST_INCLUSIVE = 0u, // computes output[n] = Sum_{i(getDefaultShader(scanType,dataType,op)); - cpuShader->setFilePathHint("nbl/builtin/glsl/scan/direct.comp"); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); auto gpushader = m_device->createShader(std::move(cpuShader)); m_specialized_shaders[scanType][dataType][op] = m_device->createSpecializedShader( gpushader.get(),{nullptr,nullptr,"main"}); - // , asset::IShader::ESS_COMPUTE, "nbl/builtin/glsl/scan/direct.comp" + // , asset::IShader::ESS_COMPUTE, "nbl/builtin/hlsl/scan/direct.hlsl" } return m_specialized_shaders[scanType][dataType][op].get(); } diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index ec31272358..6fb88074b6 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -3,11 +3,10 @@ using namespace nbl; using namespace video; -#if 0 // TODO: port core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const { auto system = m_device->getPhysicalDevice()->getSystem(); - core::smart_refctd_ptr glsl; + core::smart_refctd_ptr hlsl; { auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr { @@ -19,24 +18,59 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi }; if(indirect) - glsl = loadBuiltinData("nbl/builtin/glsl/scan/indirect.comp"); + hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect.hlsl"); else - glsl = loadBuiltinData("nbl/builtin/glsl/scan/direct.comp"); + hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/direct.hlsl"); } - auto buffer = core::make_smart_refctd_ptr(glsl->getSize()); - memcpy(buffer->getPointer(), glsl->getMappedPointer(), glsl->getSize()); - auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????"); + auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); + memcpy(buffer->getPointer(), hlsl->getMappedPointer(), 
hlsl->getSize()); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + + // (REVIEW): All of the below, that rely on enumerations, should probably be changed to take advantage of HLSL-CPP compatibility. + // Issue is that CScanner caches all shaders for all permutations of scanType-dataType-operator and it's not clear how to + // do this without enums const char* storageType = nullptr; switch (dataType) { case EDT_UINT: - storageType = "uint"; + storageType = "uint32_t"; break; case EDT_INT: - storageType = "int"; + storageType = "int32_t"; break; case EDT_FLOAT: - storageType = "float"; + storageType = "float32_t"; + break; + default: + assert(false); + break; + } + + bool isExclusive = scanType == EST_EXCLUSIVE; + + const char* binop = nullptr; + switch (op) + { + case EO_AND: + binop = "bit_and"; + break; + case EO_OR: + binop = "bit_or"; + break; + case EO_XOR: + binop = "bit_xor"; + break; + case EO_ADD: + binop = "plus"; + break; + case EO_MUL: + binop = "multiplies"; + break; + case EO_MAX: + binop = "maximum"; + break; + case EO_MIN: + binop = "minimum"; break; default: assert(false); @@ -45,8 +79,7 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi return asset::CGLSLCompiler::createOverridenCopy( cpushader.get(), - "#define _NBL_GLSL_WORKGROUP_SIZE_ %d\n#define _NBL_GLSL_WORKGROUP_SIZE_LOG2_ %d\n#define _NBL_GLSL_SCAN_TYPE_ %d\n#define _NBL_GLSL_SCAN_STORAGE_TYPE_ %s\n#define _NBL_GLSL_SCAN_BIN_OP_ %d\n", - m_workgroupSize,hlsl::findMSB(m_workgroupSize),uint32_t(scanType),storageType,uint32_t(op) + "#define WORKGROUP_SIZE %d\nconst bool isExclusive = %b\ntypedef storageType %s\n#define BINOP nbl::hlsl::%s\n", + m_workgroupSize,isExclusive,storageType,binop ); -} -#endif \ No newline at end of file +} \ No newline at end of file From 2df4ad7e374ab34c94b3e4dda279f33885d51758 Mon Sep 17 00:00:00 2001 From: PentaKon Date: Wed, 31 Jan 2024 00:29:00 +0200 Subject: [PATCH 03/25] First commit for global scan implementation. 
Ported most of the code but not working yet --- .../nbl/builtin/hlsl/glsl_compat/core.hlsl | 4 + .../nbl/builtin/hlsl/scan/declarations.hlsl | 75 ++++++----- .../builtin/hlsl/scan/default_scheduler.hlsl | 106 +++++++--------- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 116 +++++++++++++++++- include/nbl/builtin/hlsl/scan/direct.hlsl | 53 ++++---- include/nbl/builtin/hlsl/scan/indirect.hlsl | 34 ++--- .../builtin/hlsl/scan/parameters_struct.hlsl | 30 ----- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 42 +++---- 8 files changed, 255 insertions(+), 205 deletions(-) delete mode 100644 include/nbl/builtin/hlsl/scan/parameters_struct.hlsl diff --git a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl index 6f5d290dc3..faa45c8ed4 100644 --- a/include/nbl/builtin/hlsl/glsl_compat/core.hlsl +++ b/include/nbl/builtin/hlsl/glsl_compat/core.hlsl @@ -85,6 +85,10 @@ void memoryBarrierShared() { spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsWorkgroupMemoryMask); } +void memoryBarrierBuffer() { + spirv::memoryBarrier(spv::ScopeDevice, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsUniformMemoryMask); +} + namespace impl { diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 2d2e66e66d..ec3e558dfc 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -1,66 +1,63 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ #define _NBL_HLSL_SCAN_DECLARATIONS_INCLUDED_ // REVIEW: Not sure if this file is needed in HLSL implementation -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#ifndef NBL_BUILTIN_MAX_SCAN_LEVELS +#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 +#endif -#ifndef _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ namespace nbl { namespace hlsl { namespace scan { + // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. 
(https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) + struct Parameters_t { + uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; + uint32_t topLevel; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; + } + Parameters_t getParameters(); -} -} -} -#define _NBL_HLSL_SCAN_GET_PARAMETERS_DECLARED_ -#endif -#ifndef _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ + struct DefaultSchedulerParameters_t + { + uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; + uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; + + }; + + DefaultSchedulerParameters_t getSchedulerParameters(); + template void getData( - inout Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel ); -} -} -} -#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_ -#endif -#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ template void setData( - in Storage_t data, - in uint levelInvocationIndex, - in uint localWorkgroupIndex, - in uint treeLevel, - in uint pseudoLevel, - in bool inRange + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange ); + } } } -#define _NBL_HLSL_SCAN_SET_DATA_DECLARED_ -#endif #endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 450368475d..67c62aadae 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -1,31 +1,13 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + #ifndef _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ #define _NBL_HLSL_SCAN_DEFAULT_SCHEDULER_INCLUDED_ -#include "nbl/builtin/hlsl/scan/parameters_struct.hlsl" - -#ifdef __cplusplus -#define uint uint32_t -#endif - -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ - struct DefaultSchedulerParameters_t - { - uint finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; - uint cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - - }; -} -} -} - -#ifdef __cplusplus -#undef uint -#else +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" namespace nbl { @@ -33,6 +15,8 @@ namespace hlsl { namespace scan { + +#ifdef HLSL namespace scheduler { /** @@ -58,17 +42,17 @@ namespace scheduler * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps * synchronizing between workgroups with while-loop spinning. 
*/ - void computeParameters(in uint elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) + void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) { #define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) + const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x); _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0])/_NBL_HLSL_WORKGROUP_SIZE_LOG2_; - // REVIEW: _NBL_HLSL_WORKGROUP_SIZE_LOG2_ is defined in files that include THIS file. Why not query the API for workgroup size at runtime? + _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; - for (uint i=0; i>_NBL_HLSL_WORKGROUP_SIZE_LOG2_; + const uint32_t next = i+1; + _scanParams.lastElement[next] = _scanParams.lastElement[i]>>workgroupSizeLog2; i = next; } _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); @@ -130,34 +114,31 @@ namespace scheduler * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ - template - bool getWork(in DefaultSchedulerParameters_t params, in uint topLevel, out uint treeLevel, out uint localWorkgroupIndex) + template + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) { - ScratchAccessor sharedScratch; if(SubgroupContiguousIndex() == 0u) { - uint64_t original; - InterlockedAdd(scanScratch.workgroupsStarted, 1u, original); // REVIEW: Refactor InterlockedAdd with GLSL terminology? // TODO (PentaKon): Refactor this when the ScanScratch descriptor set is declared - sharedScratch.set(SubgroupContiguousIndex(), original); + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + sharedScratch.set(0u, originalWGs); } else if (SubgroupContiguousIndex() == 1u) { - sharedScratch.set(SubgroupContiguousIndex(), 0u); + sharedScratch.set(1u, 0u); } - GroupMemoryBarrierWithGroupSync(); // REVIEW: refactor this somewhere with GLSL terminology? - - const uint globalWorkgroupIndex; // does every thread need to know? - sharedScratch.get(0u, globalWorkgroupIndex); - const uint lastLevel = topLevel<<1u; + sharedScratch.workgroupExecutionAndMemoryBarrier(); + + const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? + const uint32_t lastLevel = topLevel<<1u; if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) { - InterlockedAdd(sharedScratch.get(1u, ?), 1u); // REVIEW: The way scratchaccessoradaptor is implemented (e.g. under subgroup/arithmetic_portability) doesn't allow for atomic ops on the scratch buffer. Should we ask for another implementation that overrides the [] operator ? + sharedScratch.atomicAdd(1u, 1u); } - GroupMemoryBarrierWithGroupSync(); // TODO (PentaKon): Possibly refactor? 
+ sharedScratch.workgroupExecutionAndMemoryBarrier(); - sharedScratch.get(1u, treeLevel); + treeLevel = sharedScratch.get(1u); if(treeLevel>lastLevel) - return true; + return true; localWorkgroupIndex = globalWorkgroupIndex; const bool dependentLevel = treeLevel != 0u; @@ -170,52 +151,55 @@ namespace scheduler uint dependentsCount = 1u; if(treeLevel <= topLevel) { - dependentsCount = _NBL_HLSL_WORKGROUP_SIZE_; // REVIEW: Defined in the files that include this file? + dependentsCount = glsl::gl_WorkGroupSize().x; const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; if (lastWorkgroup) { - const Parameters_t scanParams = getParameters(); // TODO (PentaKon): Undeclared as of now, this should return the Parameters_t from the push constants of (in)direct shader + const Parameters_t scanParams = getParameters(); dependentsCount = scanParams.lastElement[treeLevel]+1u; if (treeLeveltopLevel) // !(prevLevel globallycoherent \ No newline at end of file +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +// coherent -> globallycoherent + +// (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +#ifndef SCRATCH_SIZE +#error "Must manually define SCRATCH_SIZE for now" +#endif + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scratch +{ + uint32_t workgroupsStarted; + uint32_t data[scratchElementCount]; +}; + +[[vk::binding(0 ,0)]] StructuredBuffer scanInputBuf; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model + +template +void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel +) +{ + const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders + + uint32_t offset = levelInvocationIndex; + const bool notFirstOrLastLevel = bool(pseudoLevel); + if (notFirstOrLastLevel) + offset += params.temporaryStorageOffset[pseudoLevel-1u]; + + if (pseudoLevel!=treeLevel) // downsweep + { + const bool firstInvocationInGroup = SubgroupContiguousIndex()==0u; + if (bool(localWorkgroupIndex) && firstInvocationInGroup) + data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; + + if (notFirstOrLastLevel) + { + if (!firstInvocationInGroup) + data = scanScratchBuf[0].data[offset-1u]; + } + else + { + if(isExclusive) + { + if (!firstInvocationInGroup) + data += scanInputBuf[offset-1u]; + } + else + { + data += scanInputBuf[offset]; + } + } + } + else + { + if (notFirstOrLastLevel) + data = scanScratchBuf[0].data[offset]; + else + data = scanInputBuf[offset]; + } +} + +template +void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange +) +{ + const Parameters_t params = getParameters(); + if (treeLevel - void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex) + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, 
NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); - const uint levelInvocationIndex = localWorkgroupIndex * _NBL_HLSL_WORKGROUP_SIZE_ + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (_NBL_HLSL_WORKGROUP_SIZE_ - 1); - - const uint lastLevel = params.topLevel << 1u; - const uint pseudoLevel = levelInvocationIndex <= params.lastElement[pseudoLevel]; + const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + SubgroupContiguousIndex(); + const bool lastInvocationInGroup = SubgroupContiguousIndex() == (gl_WorkGroupSize().x - 1); + const uint32_t lastLevel = params.topLevel << 1u; + const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; - Storage_t data = Binop::identity(); + Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); @@ -36,17 +34,19 @@ namespace scan if(treeLevel < params.topLevel) { - #error "Must also define some scratch accessor when calling operation()" - data = workgroup::reduction()(data); + data = workgroup::reduction::template __call(value,accessor); } - // REVIEW: missing _TYPE_ check and extra case here + else if (!isExclusive && params.topLevel == 0u) + { + data = workgroup::inclusive_scan::template __call(value,accessor); + } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan()(data); + data = workgroup::inclusive_scan::template __call(value,accessor); } else { - data = workgroup::exclusive_scan()(data); + data = workgroup::exclusive_scan::template __call(value,accessor); } setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); } @@ -54,8 +54,6 @@ namespace scan } } -#ifndef _NBL_HLSL_SCAN_MAIN_DEFINED_ // TODO REVIEW: Are these needed, can this logic be refactored? -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" namespace nbl { namespace hlsl @@ -66,11 +64,13 @@ namespace scan void main() { const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint topLevel = getParameters().topLevel; + const uint32_t topLevel = getParameters().topLevel; // persistent workgroups while (true) { - uint treeLevel,localWorkgroupIndex; + // REVIEW: Need to create accessor here. + // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
+ uint32_t treeLevel,localWorkgroupIndex; if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) { return; @@ -84,9 +84,5 @@ namespace scan } } } -#endif - -#define _NBL_HLSL_SCAN_MAIN_DEFINED_ -#endif #endif \ No newline at end of file From 20a60f5fa4af48be81a511ed7db9ee5b6f106a89 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 12 Feb 2024 01:17:25 +0200 Subject: [PATCH 04/25] Global scan migration of code to new APIs --- .../builtin/hlsl/scan/default_scheduler.hlsl | 2 +- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 17 ++---- include/nbl/builtin/hlsl/scan/direct.hlsl | 49 ++++++++++++++- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 29 ++++----- include/nbl/video/utilities/CScanner.h | 34 +++++------ src/nbl/video/utilities/CScanner.cpp | 61 ++++++++++++++----- 6 files changed, 130 insertions(+), 62 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 67c62aadae..5ec112b065 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -115,7 +115,7 @@ namespace scheduler * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ template - bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) { if(SubgroupContiguousIndex() == 0u) { diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 0adee5dc32..169257e30a 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -10,11 +10,6 @@ // coherent -> globallycoherent -// (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... -#ifndef SCRATCH_SIZE -#error "Must manually define SCRATCH_SIZE for now" -#endif - namespace nbl { namespace hlsl @@ -22,14 +17,14 @@ namespace hlsl namespace scan { -template +template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... struct Scratch { uint32_t workgroupsStarted; uint32_t data[scratchElementCount]; }; -[[vk::binding(0 ,0)]] StructuredBuffer scanInputBuf; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) 
[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model template @@ -64,11 +59,11 @@ void getData( if(isExclusive) { if (!firstInvocationInGroup) - data += scanInputBuf[offset-1u]; + data += scanBuffer[offset-1u]; } else { - data += scanInputBuf[offset]; + data += scanBuffer[offset]; } } } @@ -77,7 +72,7 @@ void getData( if (notFirstOrLastLevel) data = scanScratchBuf[0].data[offset]; else - data = scanInputBuf[offset]; + data = scanBuffer[offset]; } } @@ -106,7 +101,7 @@ void setData( scanScratchBuf[0].data[levelInvocationIndex+offset] = data; } else - scanInputBuf[levelInvocationIndex] = data; + scanBuffer[levelInvocationIndex] = data; } } diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl index cb2e0ce078..76003d8e4a 100644 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ b/include/nbl/builtin/hlsl/scan/direct.hlsl @@ -1,9 +1,45 @@ // Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#pragma shader_stage(compute) + +#include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" #include "nbl/builtin/hlsl/scan/declarations.hlsl" +static const uint32_t SharedScratchSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; + +// TODO: Can we make it a static variable? +groupshared uint32_t wgScratch[SharedScratchSz]; + +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" + +template +struct WGScratchProxy +{ + uint32_t get(const uint32_t ix) + { + return wgScratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + wgScratch[ix+offset] = value; + } + + uint32_t atomicAdd(uint32_t ix, uint32_t val) + { + return glsl::atomicAdd(wgScratch[ix + offset], val); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; +static WGScratchProxy<0> accessor; + // https://github.com/microsoft/DirectXShaderCompiler/issues/6144 uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} @@ -16,6 +52,17 @@ struct ScanPushConstants [[vk::push_constant]] ScanPushConstants spc; +/** + * Required since we rely on SubgroupContiguousIndex instead of + * gl_LocalInvocationIndex which means to match the global index + * we can't use the gl_GlobalInvocationID but an index based on + * SubgroupContiguousIndex. 
+ */ +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + namespace nbl { namespace hlsl @@ -39,5 +86,5 @@ DefaultSchedulerParameters_t getSchedulerParameters() [numthreads(WORKGROUP_SIZE,1,1)] void main() { - nbl::hlsl::scan::main(); + nbl::hlsl::scan::main(accessor); } \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 3563f9546e..a8319f4366 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -15,7 +15,7 @@ namespace hlsl { namespace scan { - template + template void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); @@ -26,6 +26,9 @@ namespace scan const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel + // so that it exits after reaching the top? + Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { @@ -34,34 +37,26 @@ namespace scan if(treeLevel < params.topLevel) { - data = workgroup::reduction::template __call(value,accessor); + data = workgroup::reduction::template __call(data,accessor); } else if (!isExclusive && params.topLevel == 0u) { - data = workgroup::inclusive_scan::template __call(value,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan::template __call(value,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else { - data = workgroup::exclusive_scan::template __call(value,accessor); + data = workgroup::exclusive_scan::template __call(data,accessor); } setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); } -} -} -} -namespace nbl -{ -namespace hlsl -{ -namespace scan -{ DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - void main() + template + void main(NBL_REF_ARG(Accessor) accessor) { const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); const uint32_t topLevel = getParameters().topLevel; @@ -71,12 +66,12 @@ namespace scan // REVIEW: Need to create accessor here. // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
uint32_t treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams,topLevel,treeLevel,localWorkgroupIndex)) + if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) { return; } - virtualWorkgroup(treeLevel,localWorkgroupIndex); + virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); } diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 2ae67163eb..9f1f75b117 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -9,14 +9,12 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/IDescriptorSetCache.h" +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS & 0x1, "NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); namespace nbl::video { -#include "nbl/builtin/glsl/scan/parameters_struct.glsl" -#include "nbl/builtin/glsl/scan/default_scheduler.glsl" -static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS&0x1,"NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); - /** Utility class to help you perform the equivalent of `std::inclusive_scan` and `std::exclusive_scan` with data on the GPU. @@ -150,9 +148,9 @@ class CScanner final : public core::IReferenceCounted enum E_SCAN_TYPE : uint8_t { // computes output[n] = Sum_{i<=n}(input[i]) - EST_INCLUSIVE = _NBL_GLSL_SCAN_TYPE_INCLUSIVE_, + EST_INCLUSIVE = 0u, // computes output[n] = Sum_{i(getDefaultShader(scanType,dataType,op)); - cpuShader->setFilePathHint("nbl/builtin/glsl/scan/direct.comp"); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); auto gpushader = m_device->createShader(std::move(cpuShader)); m_specialized_shaders[scanType][dataType][op] = m_device->createSpecializedShader( gpushader.get(),{nullptr,nullptr,"main"}); - // , asset::IShader::ESS_COMPUTE, "nbl/builtin/glsl/scan/direct.comp" + // , asset::IShader::ESS_COMPUTE, "nbl/builtin/hlsl/scan/direct.hlsl" } return m_specialized_shaders[scanType][dataType][op].get(); } diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index ec31272358..6fb88074b6 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -3,11 +3,10 @@ using namespace nbl; using namespace video; -#if 0 // TODO: port core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const { auto system = m_device->getPhysicalDevice()->getSystem(); - core::smart_refctd_ptr glsl; + core::smart_refctd_ptr hlsl; { auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr { @@ -19,24 +18,59 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi }; if(indirect) - glsl = loadBuiltinData("nbl/builtin/glsl/scan/indirect.comp"); + hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect.hlsl"); else - glsl = loadBuiltinData("nbl/builtin/glsl/scan/direct.comp"); + hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/direct.hlsl"); } - auto buffer = core::make_smart_refctd_ptr(glsl->getSize()); - memcpy(buffer->getPointer(), glsl->getMappedPointer(), glsl->getSize()); - auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_GLSL, "????"); + auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); + memcpy(buffer->getPointer(), hlsl->getMappedPointer(), 
hlsl->getSize()); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + + // (REVIEW): All of the below, that rely on enumerations, should probably be changed to take advantage of HLSL-CPP compatibility. + // Issue is that CScanner caches all shaders for all permutations of scanType-dataType-operator and it's not clear how to + // do this without enums const char* storageType = nullptr; switch (dataType) { case EDT_UINT: - storageType = "uint"; + storageType = "uint32_t"; break; case EDT_INT: - storageType = "int"; + storageType = "int32_t"; break; case EDT_FLOAT: - storageType = "float"; + storageType = "float32_t"; + break; + default: + assert(false); + break; + } + + bool isExclusive = scanType == EST_EXCLUSIVE; + + const char* binop = nullptr; + switch (op) + { + case EO_AND: + binop = "bit_and"; + break; + case EO_OR: + binop = "bit_or"; + break; + case EO_XOR: + binop = "bit_xor"; + break; + case EO_ADD: + binop = "plus"; + break; + case EO_MUL: + binop = "multiplies"; + break; + case EO_MAX: + binop = "maximum"; + break; + case EO_MIN: + binop = "minimum"; break; default: assert(false); @@ -45,8 +79,7 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi return asset::CGLSLCompiler::createOverridenCopy( cpushader.get(), - "#define _NBL_GLSL_WORKGROUP_SIZE_ %d\n#define _NBL_GLSL_WORKGROUP_SIZE_LOG2_ %d\n#define _NBL_GLSL_SCAN_TYPE_ %d\n#define _NBL_GLSL_SCAN_STORAGE_TYPE_ %s\n#define _NBL_GLSL_SCAN_BIN_OP_ %d\n", - m_workgroupSize,hlsl::findMSB(m_workgroupSize),uint32_t(scanType),storageType,uint32_t(op) + "#define WORKGROUP_SIZE %d\nconst bool isExclusive = %b\ntypedef storageType %s\n#define BINOP nbl::hlsl::%s\n", + m_workgroupSize,isExclusive,storageType,binop ); -} -#endif \ No newline at end of file +} \ No newline at end of file From 7117cc31e061392025484d8d856baadbd0c28037 Mon Sep 17 00:00:00 2001 From: PentaKon Date: Sun, 31 Mar 2024 14:26:04 +0300 Subject: [PATCH 05/25] Update CScanner.h to new NBL API --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 2 +- include/nbl/video/utilities/CScanner.h | 49 +++++++++---------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index ec3e558dfc..4dbbf93d9a 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -24,7 +24,7 @@ namespace scan uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; uint32_t topLevel; uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; - } + }; Parameters_t getParameters(); diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 9f1f75b117..41dacf4328 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -141,7 +141,7 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about Console devs get to bring a gun to a knife fight... 
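
A rough usage sketch (illustrative only; it relies solely on the methods declared below, and the descriptor-set and barrier setup shown as comments is an assumption about the caller's code, not part of this class):

	// assumes `scanner` is a CScanner* and `cmdbuf` is a compute-capable IGPUCommandBuffer* in the recording state
	auto* pipeline = scanner->getDefaultPipeline(CScanner::EST_EXCLUSIVE,CScanner::EDT_UINT,CScanner::EO_ADD);
	cmdbuf->bindComputePipeline(pipeline);
	// write the scanned buffer to binding 0 and the scratch buffer to binding 1 of a descriptor set made from the
	// scanner's layout, bind it, fill the DefaultPushConstants and DispatchInfo for the element count, then record
	// the dispatch together with any required buffer barriers via dispatchHelper(...)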
**/ -#if 0 // legacy & port to HLSL +#if 1 // legacy & port to HLSL class CScanner final : public core::IReferenceCounted { public: @@ -267,15 +267,14 @@ class CScanner final : public core::IReferenceCounted CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) { assert(core::isPoT(m_workgroupSize)); - - const asset::SPushConstantRange pc_range = { asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants) }; + const asset::SPushConstantRange pc_range[] = {asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; const IGPUDescriptorSetLayout::SBinding bindings[2] = { { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch }; - m_ds_layout = m_device->createDescriptorSetLayout(bindings,bindings+sizeof(bindings)/sizeof(IGPUDescriptorSetLayout::SBinding)); - m_pipeline_layout = m_device->createPipelineLayout(&pc_range,&pc_range+1,core::smart_refctd_ptr(m_ds_layout)); + m_ds_layout = m_device->createDescriptorSetLayout(bindings); + m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); } // @@ -298,7 +297,7 @@ class CScanner final : public core::IReferenceCounted return m_shaders[scanType][dataType][op].get(); } // - inline IGPUSpecializedShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) + inline IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) { if (!m_specialized_shaders[scanType][dataType][op]) { @@ -306,11 +305,9 @@ class CScanner final : public core::IReferenceCounted cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); - auto gpushader = m_device->createShader(std::move(cpuShader)); + auto gpushader = m_device->createShader(cpuShader.get()); - m_specialized_shaders[scanType][dataType][op] = m_device->createSpecializedShader( - gpushader.get(),{nullptr,nullptr,"main"}); - // , asset::IShader::ESS_COMPUTE, "nbl/builtin/hlsl/scan/direct.hlsl" + m_specialized_shaders[scanType][dataType][op] = gpushader; } return m_specialized_shaders[scanType][dataType][op].get(); } @@ -319,11 +316,18 @@ class CScanner final : public core::IReferenceCounted inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) { // ondemand - if (!m_pipelines[scanType][dataType][op]) - m_pipelines[scanType][dataType][op] = m_device->createComputePipeline( - nullptr,core::smart_refctd_ptr(m_pipeline_layout), - core::smart_refctd_ptr(getDefaultSpecializedShader(scanType,dataType,op)) + if (!m_pipelines[scanType][dataType][op]) { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pipeline_layout.get(); + // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") + params.shader.entryPoint = "main"; + params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op); + + m_device->createComputePipelines( + nullptr, { ¶ms,1 }, + & m_pipelines[scanType][dataType][op] ); + } return 
m_pipelines[scanType][dataType][op].get(); } @@ -349,12 +353,7 @@ class CScanner final : public core::IReferenceCounted video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; for (auto i=0u; i<2u; i++) { - writes[i].dstSet = set; - writes[i].binding = i; - writes[i].arrayElement = 0u; - writes[i].count = 1u; - writes[i].descriptorType = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER; - writes[i].info = infos+i; + writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i }; } device->updateDescriptorSets(2, writes, 0u, nullptr); @@ -370,17 +369,13 @@ class CScanner final : public core::IReferenceCounted cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); if (srcBufferBarrierCount) { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {}; - info.bufBarrierCount = srcBufferBarrierCount; - info.bufBarriers = srcBufferBarriers; + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { srcBufferBarriers, 1} }; cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); } cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); if (srcBufferBarrierCount) { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {}; - info.bufBarrierCount = dstBufferBarrierCount; - info.bufBarriers = dstBufferBarriers; + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { dstBufferBarriers, 1} }; cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); } } @@ -400,7 +395,7 @@ class CScanner final : public core::IReferenceCounted core::smart_refctd_ptr m_ds_layout; core::smart_refctd_ptr m_pipeline_layout; core::smart_refctd_ptr m_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; - core::smart_refctd_ptr m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr m_pipelines[EST_COUNT][EDT_COUNT][EO_COUNT]; const uint32_t m_workgroupSize; }; From 51b4f741bb946343f70331ac37aa7006db5c328e Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 22 Apr 2024 00:12:34 +0300 Subject: [PATCH 06/25] Fix CScanner and related shaders to properly compile --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 2 +- .../builtin/hlsl/scan/default_scheduler.hlsl | 15 ++++++--------- include/nbl/builtin/hlsl/scan/descriptors.hlsl | 11 ++++++----- include/nbl/builtin/hlsl/scan/direct.hlsl | 8 +++++--- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 12 +++++------- include/nbl/video/utilities/CScanner.h | 18 +++++++++--------- include/nbl/video/utilities/IUtilities.h | 8 +++----- src/nbl/video/utilities/CScanner.cpp | 12 ++++++------ 8 files changed, 41 insertions(+), 45 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 4dbbf93d9a..4a3a473f74 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -37,7 +37,7 @@ namespace scan DefaultSchedulerParameters_t getSchedulerParameters(); - template + template void getData( NBL_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 5ec112b065..40d746d7c1 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -15,8 +15,6 @@ namespace hlsl { namespace scan { - -#ifdef HLSL namespace 
scheduler { /** @@ -115,14 +113,14 @@ namespace scheduler * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. */ template - bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) { - if(SubgroupContiguousIndex() == 0u) + if(workgroup::SubgroupContiguousIndex() == 0u) { uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); sharedScratch.set(0u, originalWGs); } - else if (SubgroupContiguousIndex() == 1u) + else if (workgroup::SubgroupContiguousIndex() == 1u) { sharedScratch.set(1u, 0u); } @@ -130,7 +128,7 @@ namespace scheduler const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? const uint32_t lastLevel = topLevel<<1u; - if (SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[SubgroupContiguousIndex()]) + if (workgroup::SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[workgroup::SubgroupContiguousIndex()]) { sharedScratch.atomicAdd(1u, 1u); } @@ -146,7 +144,7 @@ namespace scheduler { const uint prevLevel = treeLevel - 1u; localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; - if(SubgroupContiguousIndex() == 0u) + if(workgroup::SubgroupContiguousIndex() == 0u) { uint dependentsCount = 1u; if(treeLevel <= topLevel) @@ -179,7 +177,7 @@ namespace scheduler void markComplete(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex) { glsl::memoryBarrierBuffer(); - if (SubgroupContiguousIndex()==0u) + if (workgroup::SubgroupContiguousIndex()==0u) { uint32_t finishedFlagOffset = params.finishedFlagOffset[treeLevel]; if (treeLevel // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... struct Scratch { uint32_t workgroupsStarted; @@ -25,9 +25,9 @@ struct Scratch }; [[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) -[[vk::binding(1 ,0)]] RWStructuredBuffer globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model +[[vk::binding(1 ,0)]] RWStructuredBuffer /*globallycoherent (seems we can't use along with VMM)*/ scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model -template +template void getData( NBL_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, @@ -36,6 +36,7 @@ void getData( NBL_CONST_REF_ARG(uint32_t) pseudoLevel ) { + glsl::memoryBarrierBuffer(); // scanScratchBuf can't be declared as coherent due to VMM(?) 
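A note on the scratch layout that both descriptor bindings above feed into: scanScratchBuf holds a single Scratch struct, i.e. an atomic workgroupsStarted counter followed by one flat data[] array, where the per-level finish flags live at finishedFlagOffset[level] and each level's per-workgroup reductions start at temporaryStorageOffset[level]. A minimal addressing sketch, assuming the offsets produced by computeParameters (the helper name below is illustrative, not part of this patch):

    // slot where workgroup `wg` of level `lvl` publishes its reduction for the next level,
    // mirroring the indexing used by setData()/getData(); illustrative helper only
    uint32_t reductionSlot(NBL_CONST_REF_ARG(Parameters_t) params, const uint32_t lvl, const uint32_t wg)
    {
        return params.temporaryStorageOffset[lvl] + wg; // the offsets already point past the finish-flag region
    }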
const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders uint32_t offset = levelInvocationIndex; @@ -45,7 +46,7 @@ void getData( if (pseudoLevel!=treeLevel) // downsweep { - const bool firstInvocationInGroup = SubgroupContiguousIndex()==0u; + const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; if (bool(localWorkgroupIndex) && firstInvocationInGroup) data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; @@ -89,7 +90,7 @@ void setData( const Parameters_t params = getParameters(); if (treeLevel::value; +// ITEMS_PER_WG = WORKGROUP_SIZE +static const uint32_t SharedScratchSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; // TODO: Can we make it a static variable? groupshared uint32_t wgScratch[SharedScratchSz]; @@ -29,7 +31,7 @@ struct WGScratchProxy uint32_t atomicAdd(uint32_t ix, uint32_t val) { - return glsl::atomicAdd(wgScratch[ix + offset], val); + return nbl::hlsl::glsl::atomicAdd(wgScratch[ix + offset], val); } void workgroupExecutionAndMemoryBarrier() @@ -86,5 +88,5 @@ DefaultSchedulerParameters_t getSchedulerParameters() [numthreads(WORKGROUP_SIZE,1,1)] void main() { - nbl::hlsl::scan::main(accessor); + nbl::hlsl::scan::main, Storage_t, true, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor); } \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index a8319f4366..7c39dd98c0 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -2,12 +2,10 @@ #define _NBL_HLSL_SCAN_VIRTUAL_WORKGROUP_INCLUDED_ // TODO (PentaKon): Decide if these are needed once we have a clearer picture of the refactor -#include "nbl/builtin/hlsl/limits/numeric.hlsl" -#include "nbl/builtin/hlsl/math/typeless_arithmetic.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined #include "nbl/builtin/hlsl/scan/declarations.hlsl" #include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" -#include "nbl/builtin/hlsl/binops.hlsl" namespace nbl { @@ -19,8 +17,8 @@ namespace scan void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); - const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + SubgroupContiguousIndex(); - const bool lastInvocationInGroup = SubgroupContiguousIndex() == (gl_WorkGroupSize().x - 1); + const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); + const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex() == (glsl::gl_WorkGroupSize().x - 1); const uint32_t lastLevel = params.topLevel << 1u; const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; @@ -29,10 +27,10 @@ namespace scan // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? - Storage_t data = Binop::identity(); // REVIEW: replace Storage_t with Binop::type_t? + Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? 
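For context, WORKGROUP_SIZE, Storage_t and BINOP are not defined anywhere in these headers; the host injects them when it compiles direct.hlsl/indirect.hlsl (see the createOverridenCopy call in CScanner.cpp further down in this patch). For one permutation the injected preamble would look roughly as follows; the concrete values are made up for illustration:

    #define WORKGROUP_SIZE 256
    typedef uint32_t Storage_t;    // EDT_UINT
    #define BINOP nbl::hlsl::plus  // EO_ADD
    #define SCRATCH_SZ 1280        // scratch element count supplied by the host (illustrative value)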
if(inRange) { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); + getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); } if(treeLevel < params.topLevel) diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 41dacf4328..ff2250e544 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -284,24 +284,24 @@ class CScanner final : public core::IReferenceCounted inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } // You need to override this shader with your own defintion of `nbl_glsl_scan_getIndirectElementCount` for it to even compile, so we always give you a new shader - core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const + core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const { - return createShader(true,scanType,dataType,op); + return createShader(true,scanType,dataType,op,scratchSz); } // - inline asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) + inline asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { if (!m_shaders[scanType][dataType][op]) - m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op); + m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz); return m_shaders[scanType][dataType][op].get(); } // - inline IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) + inline IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { if (!m_specialized_shaders[scanType][dataType][op]) { - auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op)); + auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op,scratchSz)); cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); @@ -313,7 +313,7 @@ class CScanner final : public core::IReferenceCounted } // - inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) + inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { // ondemand if (!m_pipelines[scanType][dataType][op]) { @@ -321,7 +321,7 @@ class CScanner final : public core::IReferenceCounted params.layout = m_pipeline_layout.get(); // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") params.shader.entryPoint = "main"; - params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op); + params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op, scratchSz); m_device->createComputePipelines( nullptr, { ¶ms,1 }, @@ -388,7 +388,7 @@ class CScanner final : public core::IReferenceCounted // all drop themselves automatically } - core::smart_refctd_ptr createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const; + core::smart_refctd_ptr 
createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; core::smart_refctd_ptr m_device; diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 23e609e14c..73304645d1 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -134,9 +134,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted m_propertyPoolHandler = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device)); // smaller workgroups fill occupancy gaps better, especially on new Nvidia GPUs, but we don't want too small workgroups on mobile // TODO: investigate whether we need to clamp against 256u instead of 128u on mobile +#endif const auto scan_workgroup_size = core::max(core::roundDownToPoT(limits.maxWorkgroupSize[0]) >> 1u, 128u); m_scanner = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device), scan_workgroup_size); -#endif } inline ~IUtilities() @@ -162,13 +162,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted { return m_propertyPoolHandler.get(); } - - //! +#endif virtual CScanner* getDefaultScanner() const { return m_scanner.get(); } -#endif //! This function provides some guards against streamingBuffer fragmentation or allocation failure static uint32_t getAllocationSizeForStreamingBuffer(const size_t size, const uint64_t alignment, uint32_t maxFreeBlock, const uint32_t optimalTransferAtom) { @@ -613,8 +611,8 @@ class NBL_API2 IUtilities : public core::IReferenceCounted #if 0 // TODO: port core::smart_refctd_ptr m_propertyPoolHandler; - core::smart_refctd_ptr m_scanner; #endif + core::smart_refctd_ptr m_scanner; }; class ImageRegionIterator diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index 6fb88074b6..06dfd076e3 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -3,7 +3,7 @@ using namespace nbl; using namespace video; -core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const +core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const { auto system = m_device->getPhysicalDevice()->getSystem(); core::smart_refctd_ptr hlsl; @@ -46,7 +46,7 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi break; } - bool isExclusive = scanType == EST_EXCLUSIVE; + const char* isExclusive = scanType == EST_EXCLUSIVE ? 
"true" : "false"; const char* binop = nullptr; switch (op) @@ -76,10 +76,10 @@ core::smart_refctd_ptr CScanner::createShader(const bool indi assert(false); break; } - - return asset::CGLSLCompiler::createOverridenCopy( + + return asset::CHLSLCompiler::createOverridenCopy( cpushader.get(), - "#define WORKGROUP_SIZE %d\nconst bool isExclusive = %b\ntypedef storageType %s\n#define BINOP nbl::hlsl::%s\n", - m_workgroupSize,isExclusive,storageType,binop + "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", + m_workgroupSize, storageType, binop, scratchSz ); } \ No newline at end of file From 3d29bc3f2a3f13f95452987a04ecf0bafc34bfaf Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sat, 4 May 2024 17:53:48 +0300 Subject: [PATCH 07/25] Update formatting and fix scratch buffer size --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 48 +-- .../builtin/hlsl/scan/default_scheduler.hlsl | 324 +++++++++--------- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 76 ++-- include/nbl/builtin/hlsl/scan/direct.hlsl | 28 +- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 88 ++--- 5 files changed, 282 insertions(+), 282 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 4a3a473f74..3dafd0b1b6 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -20,13 +20,13 @@ namespace hlsl namespace scan { // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) - struct Parameters_t { - uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; - uint32_t topLevel; - uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; - }; + struct Parameters_t { + uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; + uint32_t topLevel; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; + }; - Parameters_t getParameters(); + Parameters_t getParameters(); struct DefaultSchedulerParameters_t { @@ -37,24 +37,24 @@ namespace scan DefaultSchedulerParameters_t getSchedulerParameters(); - template - void getData( - NBL_REF_ARG(Storage_t) data, - NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, - NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, - NBL_CONST_REF_ARG(uint32_t) treeLevel, - NBL_CONST_REF_ARG(uint32_t) pseudoLevel - ); - - template - void setData( - NBL_CONST_REF_ARG(Storage_t) data, - NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, - NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, - NBL_CONST_REF_ARG(uint32_t) treeLevel, - NBL_CONST_REF_ARG(uint32_t) pseudoLevel, - NBL_CONST_REF_ARG(bool) inRange - ); + template + void getData( + NBL_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel + ); + + template + void setData( + NBL_CONST_REF_ARG(Storage_t) data, + NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, + NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) treeLevel, + NBL_CONST_REF_ARG(uint32_t) pseudoLevel, + NBL_CONST_REF_ARG(bool) inRange + ); } } diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 40d746d7c1..71dbc152aa 100644 --- 
a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -17,181 +17,181 @@ namespace scan { namespace scheduler { - /** - * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation - * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions - * that are used for synchronization etc. - * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is - * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of - * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and - * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will - * have an array of size 40. Only a single workgroup will be needed to work on that. After that array is scanned, we use the results - * in the downsweep phase of Blelloch Scan. - * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. - * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. - * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the - * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. - * - * Parameter meaning: - * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer - * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) - * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. It is used when storing the REDUCTION result of each workgroup scan - * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) - * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps - * synchronizing between workgroups with while-loop spinning. - */ - void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) - { + /** + * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation + * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions + * that are used for synchronization etc. + * Remember that CScanner does a Blelloch Scan which works in levels. In each level of the Blelloch scan the array of elements is + * broken down into sets of size=WorkgroupSize and each set is scanned using Hillis & Steele (aka Stone-Kogge adder). The result of + * the scan is provided as an array element for the next level of the Blelloch Scan. This means that if we have 10000 elements and + * WorkgroupSize=250, we will break the array into 40 sets and take their reduction results. The next level of the Blelloch Scan will + * have an array of size 40. Only a single workgroup will be needed to work on that. 
After that array is scanned, we use the results + * in the downsweep phase of Blelloch Scan. + * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount. + * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase. + * The intermediate results are stored in a scratch buffer. That buffer's size is is the sum of the element-array size for all the + * Blelloch levels. Using the previous example, the scratch size should be 10000 + 40. + * + * Parameter meaning: + * |> lastElement - the index of the last element of each Blelloch level in the scratch buffer + * |> topLevel - the top level the Blelloch Scan will have (this depends on the elementCount and the WorkgroupSize) + * |> temporaryStorageOffset - an offset array for each level of the Blelloch Scan. It is used when storing the REDUCTION result of each workgroup scan + * |> cumulativeWorkgroupCount - the sum-scan of all the workgroups that will need to be launched for each level of the Blelloch Scan (both upsweep and downsweep) + * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps + * synchronizing between workgroups with while-loop spinning. + */ + void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams) + { #define WorkgroupCount(Level) (_scanParams.lastElement[Level+1]+1u) const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x); - _scanParams.lastElement[0] = elementCount-1u; - _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; - - for (uint32_t i=0; i>workgroupSizeLog2; - i = next; - } - _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); - _schedulerParams.finishedFlagOffset[0] = 0u; - switch(_scanParams.topLevel) - { - case 1u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = 1u; - - _scanParams.temporaryStorageOffset[0] = 2u; - break; - case 2u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; - _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; - // climb down - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - break; - case 3u: - _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[3] = 
_schedulerParams.cumulativeWorkgroupCount[2]+1u; - _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); - _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); - _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); - // climb up - _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); - _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); - _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; - // climb down - _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; - _schedulerParams.finishedFlagOffset[5] = _schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); - - _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); - _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); - _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); - break; - default: - break; + _scanParams.lastElement[0] = elementCount-1u; + _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; + + for (uint32_t i=0; i>workgroupSizeLog2; + i = next; + } + _schedulerParams.cumulativeWorkgroupCount[0] = WorkgroupCount(0); + _schedulerParams.finishedFlagOffset[0] = 0u; + switch(_scanParams.topLevel) + { + case 1u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+1u; + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = 1u; + + _scanParams.temporaryStorageOffset[0] = 2u; + break; + case 2u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u; + _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); + _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u; + // climb down + _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u; + + _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1); + _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); + break; + case 3u: + _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2); + _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u; + _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(2); + _schedulerParams.cumulativeWorkgroupCount[5] = _schedulerParams.cumulativeWorkgroupCount[4]+WorkgroupCount(1); + _schedulerParams.cumulativeWorkgroupCount[6] = _schedulerParams.cumulativeWorkgroupCount[5]+WorkgroupCount(0); + // climb up + _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1); + 
_schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+WorkgroupCount(2); + _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[2]+1u; + // climb down + _schedulerParams.finishedFlagOffset[4] = _schedulerParams.finishedFlagOffset[2]+2u; + _schedulerParams.finishedFlagOffset[5] = _schedulerParams.finishedFlagOffset[4]+WorkgroupCount(2); + + _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[5]+WorkgroupCount(1); + _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0); + _scanParams.temporaryStorageOffset[2] = _scanParams.temporaryStorageOffset[1]+WorkgroupCount(1); + break; + default: + break; #if NBL_BUILTIN_MAX_SCAN_LEVELS>7 #error "Switch needs more cases" #endif - } + } #undef WorkgroupCount - } - - /** - * treeLevel - the current level in the Blelloch Scan - * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. - * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. - */ - template - bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) - { - if(workgroup::SubgroupContiguousIndex() == 0u) - { - uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + } + + /** + * treeLevel - the current level in the Blelloch Scan + * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. + * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. + */ + template + bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) + { + if(workgroup::SubgroupContiguousIndex() == 0u) + { + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); sharedScratch.set(0u, originalWGs); - } - else if (workgroup::SubgroupContiguousIndex() == 1u) - { - sharedScratch.set(1u, 0u); - } - sharedScratch.workgroupExecutionAndMemoryBarrier(); + } + else if (workgroup::SubgroupContiguousIndex() == 1u) + { + sharedScratch.set(1u, 0u); + } + sharedScratch.workgroupExecutionAndMemoryBarrier(); - const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? - const uint32_t lastLevel = topLevel<<1u; - if (workgroup::SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[workgroup::SubgroupContiguousIndex()]) - { + const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? 
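To make the computeParameters bookkeeping above concrete, a worked example assuming 10000 elements and a 256-wide workgroup (hand-derived for illustration, not output of the code):

    // lastElement[0] = 9999, workgroupSizeLog2 = 8  ->  topLevel = firstbithigh(9999)/8 = 13/8 = 1
    // lastElement[1] = 9999 >> 8 = 39                ->  WorkgroupCount(0) = 40, plus a single top-level workgroup
    // cumulativeWorkgroupCount = { 40, 41, 81 }      //  40 upsweep WGs, 1 top-level WG, 40 downsweep WGs
    // finishedFlagOffset       = { 0, 1 }            //  flag slots at the start of the scratch data[]
    // temporaryStorageOffset[0] = 2                  //  the 40 per-workgroup reductions start right after
    // i.e. 81 virtual workgroups in total service one 10k-element scan.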
+ const uint32_t lastLevel = topLevel<<1u; + if (workgroup::SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[workgroup::SubgroupContiguousIndex()]) + { sharedScratch.atomicAdd(1u, 1u); - } - sharedScratch.workgroupExecutionAndMemoryBarrier(); - - treeLevel = sharedScratch.get(1u); - if(treeLevel>lastLevel) + } + sharedScratch.workgroupExecutionAndMemoryBarrier(); + + treeLevel = sharedScratch.get(1u); + if(treeLevel>lastLevel) return true; - - localWorkgroupIndex = globalWorkgroupIndex; - const bool dependentLevel = treeLevel != 0u; - if(dependentLevel) - { - const uint prevLevel = treeLevel - 1u; - localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel]; - if(workgroup::SubgroupContiguousIndex() == 0u) - { - uint dependentsCount = 1u; - if(treeLevel <= topLevel) - { - dependentsCount = glsl::gl_WorkGroupSize().x; - const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel]; - if (lastWorkgroup) - { - const Parameters_t scanParams = getParameters(); - dependentsCount = scanParams.lastElement[treeLevel]+1u; - if (treeLeveltopLevel) // !(prevLeveltopLevel) // !(prevLevel // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... struct Scratch { uint32_t workgroupsStarted; @@ -42,21 +42,21 @@ void getData( uint32_t offset = levelInvocationIndex; const bool notFirstOrLastLevel = bool(pseudoLevel); if (notFirstOrLastLevel) - offset += params.temporaryStorageOffset[pseudoLevel-1u]; + offset += params.temporaryStorageOffset[pseudoLevel-1u]; if (pseudoLevel!=treeLevel) // downsweep - { - const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; - if (bool(localWorkgroupIndex) && firstInvocationInGroup) - data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; + { + const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; + if (bool(localWorkgroupIndex) && firstInvocationInGroup) + data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; - if (notFirstOrLastLevel) - { - if (!firstInvocationInGroup) - data = scanScratchBuf[0].data[offset-1u]; - } - else - { + if (notFirstOrLastLevel) + { + if (!firstInvocationInGroup) + data = scanScratchBuf[0].data[offset-1u]; + } + else + { if(isExclusive) { if (!firstInvocationInGroup) @@ -66,15 +66,15 @@ void getData( { data += scanBuffer[offset]; } - } - } - else - { - if (notFirstOrLastLevel) - data = scanScratchBuf[0].data[offset]; - else - data = scanBuffer[offset]; - } + } + } + else + { + if (notFirstOrLastLevel) + data = scanScratchBuf[0].data[offset]; + else + data = scanBuffer[offset]; + } } template @@ -88,22 +88,22 @@ void setData( ) { const Parameters_t params = getParameters(); - if (treeLevel struct WGScratchProxy { - uint32_t get(const uint32_t ix) - { - return wgScratch[ix+offset]; - } - void set(const uint32_t ix, const uint32_t value) - { - wgScratch[ix+offset] = value; - } + uint32_t get(const uint32_t ix) + { + return wgScratch[ix+offset]; + } + void set(const uint32_t ix, const uint32_t value) + { + wgScratch[ix+offset] = value; + } uint32_t atomicAdd(uint32_t ix, uint32_t val) { return nbl::hlsl::glsl::atomicAdd(wgScratch[ix + offset], val); } - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); 
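WGScratchProxy above is one concrete accessor; the scheduler and the workgroup reduce/scan primitives only rely on this informal shape, inferred from the code in this series rather than a declared interface:

    // illustrative accessor shape over groupshared scratch (not a real type in the codebase)
    struct AccessorShape
    {
        uint32_t get(const uint32_t ix);                           // read shared scratch
        void set(const uint32_t ix, const uint32_t value);         // write shared scratch
        uint32_t atomicAdd(const uint32_t ix, const uint32_t val); // atomics on shared scratch
        void workgroupExecutionAndMemoryBarrier();                 // glsl::barrier()
    };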
- //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } }; static WGScratchProxy<0> accessor; @@ -62,7 +62,7 @@ ScanPushConstants spc; */ uint32_t globalIndex() { - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); } namespace nbl diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 7c39dd98c0..8425b096a3 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -13,67 +13,67 @@ namespace hlsl { namespace scan { - template - void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) - { - const Parameters_t params = getParameters(); - const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); - const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex() == (glsl::gl_WorkGroupSize().x - 1); + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) + { + const Parameters_t params = getParameters(); + const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); + const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex() == (glsl::gl_WorkGroupSize().x - 1); - const uint32_t lastLevel = params.topLevel << 1u; - const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + const uint32_t lastLevel = params.topLevel << 1u; + const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; + const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? - Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? - if(inRange) - { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); - } + Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? 
+ if(inRange) + { + getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); + } - if(treeLevel < params.topLevel) - { + if(treeLevel < params.topLevel) + { data = workgroup::reduction::template __call(data,accessor); - } + } else if (!isExclusive && params.topLevel == 0u) { data = workgroup::inclusive_scan::template __call(data,accessor); } - else if (treeLevel != params.topLevel) - { - data = workgroup::inclusive_scan::template __call(data,accessor); - } - else - { - data = workgroup::exclusive_scan::template __call(data,accessor); - } - setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); - } + else if (treeLevel != params.topLevel) + { + data = workgroup::inclusive_scan::template __call(data,accessor); + } + else + { + data = workgroup::exclusive_scan::template __call(data,accessor); + } + setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); + } - DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - template + DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components + template void main(NBL_REF_ARG(Accessor) accessor) - { - const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint32_t topLevel = getParameters().topLevel; - // persistent workgroups - while (true) - { + { + const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); + const uint32_t topLevel = getParameters().topLevel; + // persistent workgroups + while (true) + { // REVIEW: Need to create accessor here. // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? 
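A quick map of which workgroup primitive virtualWorkgroup above selects per virtual level, assuming an exclusive scan with topLevel == 2 (hand-derived for illustration):

    // treeLevel  : 0          1          2               3               4     (lastLevel = topLevel<<1 = 4)
    // pseudoLevel: 0          1          2               1               0
    // operation  : reduction  reduction  exclusive_scan  inclusive_scan  inclusive_scan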
- uint32_t treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) - { - return; - } + uint32_t treeLevel,localWorkgroupIndex; + if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) + { + return; + } - virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); + virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); - scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); - } - } + scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); + } + } } } } From af438bc619c72eede0ca42f7932028df9f94648e Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Fri, 10 May 2024 14:38:23 +0300 Subject: [PATCH 08/25] Intermediary update of code before re-implementation --- .../builtin/hlsl/scan/default_scheduler.hlsl | 30 ++++++++----------- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 1 - .../builtin/hlsl/scan/virtual_workgroup.hlsl | 7 ++++- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 71dbc152aa..d240a271a1 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -115,24 +115,20 @@ namespace scheduler template bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) { - if(workgroup::SubgroupContiguousIndex() == 0u) - { - uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); - sharedScratch.set(0u, originalWGs); - } - else if (workgroup::SubgroupContiguousIndex() == 1u) - { - sharedScratch.set(1u, 0u); - } - sharedScratch.workgroupExecutionAndMemoryBarrier(); - - const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); // does every thread need to know? const uint32_t lastLevel = topLevel<<1u; - if (workgroup::SubgroupContiguousIndex()<=lastLevel && globalWorkgroupIndex>=params.cumulativeWorkgroupCount[workgroup::SubgroupContiguousIndex()]) - { - sharedScratch.atomicAdd(1u, 1u); - } - sharedScratch.workgroupExecutionAndMemoryBarrier(); + if(workgroup::SubgroupContiguousIndex() == 0u) + { + uint32_t originalWGs = glsl::atomicAdd(scanScratchBuf[0].workgroupsStarted, 1u); + sharedScratch.set(0u, originalWGs); + treeLevel = 0; + while (originalWGs>=params.cumulativeWorkgroupCount[treeLevel]) { // doesn't work for now because PushConstant arrays can only be accessed by dynamically uniform indices + treeLevel++; + } + sharedScratch.set(1u, treeLevel); + } + sharedScratch.workgroupExecutionAndMemoryBarrier(); + const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); + treeLevel = sharedScratch.get(1u); treeLevel = sharedScratch.get(1u); if(treeLevel>lastLevel) diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 9aaa672d3b..10572e984a 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -36,7 +36,6 @@ void getData( NBL_CONST_REF_ARG(uint32_t) pseudoLevel ) { - glsl::memoryBarrierBuffer(); // scanScratchBuf can't be declared as coherent due to VMM(?) 
const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders uint32_t offset = levelInvocationIndex; diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 8425b096a3..97b76035af 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -26,7 +26,12 @@ namespace scan // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? - + + // Seems that even though it's a mem barrier is must NOT diverge! + // This was called inside getData but due to inRange it can possibly diverge + // so it must be called outside! Not yet sure why a mem barrier must be uniformly called. + glsl::memoryBarrierBuffer(); // scanScratchBuf can't be declared as coherent due to VMM(?) + Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { From 61d480678a3801b314bf1bdb5c28952157f4e22e Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 20 May 2024 03:16:23 +0300 Subject: [PATCH 09/25] Initial implementation of global reduce --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 3 +- .../builtin/hlsl/scan/default_scheduler.hlsl | 3 +- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 14 ++-- include/nbl/builtin/hlsl/scan/scheduler.hlsl | 80 +++++++++++++++++++ .../builtin/hlsl/scan/virtual_workgroup.hlsl | 40 +++++----- .../nbl/builtin/hlsl/workgroup/broadcast.hlsl | 2 +- include/nbl/video/utilities/CScanner.h | 33 ++++++-- 7 files changed, 137 insertions(+), 38 deletions(-) create mode 100644 include/nbl/builtin/hlsl/scan/scheduler.hlsl diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 3dafd0b1b6..699d345f15 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -30,8 +30,9 @@ namespace scan struct DefaultSchedulerParameters_t { - uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1]; uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; + uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_SCAN_LEVELS]; + uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_SCAN_LEVELS]; }; diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index d240a271a1..4ffe926a4b 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -110,7 +110,7 @@ namespace scheduler /** * treeLevel - the current level in the Blelloch Scan * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch. - * For example, if we have dispatched 10 workgroups and we the virtual workgroup number is 35, then the localWorkgroupIndex should be 5. + * For example, if we have dispatched 10 workgroups and we the virtu al workgroup number is 35, then the localWorkgroupIndex should be 5. 
*/ template bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch) @@ -128,7 +128,6 @@ namespace scheduler } sharedScratch.workgroupExecutionAndMemoryBarrier(); const uint32_t globalWorkgroupIndex = sharedScratch.get(0u); - treeLevel = sharedScratch.get(1u); treeLevel = sharedScratch.get(1u); if(treeLevel>lastLevel) diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 10572e984a..60ea6a771a 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -17,21 +17,21 @@ namespace hlsl namespace scan { -template // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though... +template struct Scratch { uint32_t workgroupsStarted; uint32_t data[scratchElementCount]; }; -[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) +[[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) [[vk::binding(1 ,0)]] RWStructuredBuffer /*globallycoherent (seems we can't use along with VMM)*/ scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with Vulkan Mem Model template void getData( NBL_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, - NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) pseudoLevel ) @@ -46,8 +46,8 @@ void getData( if (pseudoLevel!=treeLevel) // downsweep { const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; - if (bool(localWorkgroupIndex) && firstInvocationInGroup) - data = scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; + if (bool(levelWorkgroupIndex) && firstInvocationInGroup) + data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; if (notFirstOrLastLevel) { @@ -80,7 +80,7 @@ template void setData( NBL_CONST_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, - NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, + NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) pseudoLevel, NBL_CONST_REF_ARG(bool) inRange @@ -91,7 +91,7 @@ void setData( { const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex()==(glsl::gl_WorkGroupSize().x-1); if (lastInvocationInGroup) - scanScratchBuf[0].data[localWorkgroupIndex+params.temporaryStorageOffset[treeLevel]] = data; + scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[treeLevel]] = data; } else if (inRange) { diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl new file mode 100644 index 0000000000..9aad4f0121 --- /dev/null +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -0,0 +1,80 @@ +// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ +#define _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" +#include "nbl/builtin/hlsl/scan/descriptors.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace scan +{ + +template +struct Scheduler +{ + + static Scheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + Scheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } + + template + bool getWork(NBL_REF_ARG(Accessor) accessor) + { + if (workgroup::SubgroupContiguousIndex()==0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted,1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex <= schedParams.cumulativeWorkgroupCount[level] - 1u; + } + + template + void markDone(NBL_REF_ARG(Accessor) accessor) + { + if (level==(params.topLevel)) // check if we reached the lastLevel + { + uberWorkgroupIndex = ~0u; + return; + } + + if (workgroup::SubgroupContiguousIndex()==0u) + { + // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset + const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; + const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u) + 1u; + if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + { + level++; + } + } + level = workgroup::BroadcastFirst(level, accessor); + } + + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0u ? 
uberWorkgroupIndex : (uberWorkgroupIndex - schedParams.cumulativeWorkgroupCount[level-1u]); + } + + Parameters_t params; + DefaultSchedulerParameters_t schedParams; + uint32_t uberWorkgroupIndex; + uint16_t level; +}; + +} +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 97b76035af..638593ae42 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -5,7 +5,7 @@ #include "nbl/builtin/hlsl/functional.hlsl" #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined #include "nbl/builtin/hlsl/scan/declarations.hlsl" -#include "nbl/builtin/hlsl/scan/default_scheduler.hlsl" +#include "nbl/builtin/hlsl/scan/scheduler.hlsl" namespace nbl { @@ -13,14 +13,14 @@ namespace hlsl { namespace scan { - template - void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) + template + void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); - const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); - const bool lastInvocationInGroup = workgroup::SubgroupContiguousIndex() == (glsl::gl_WorkGroupSize().x - 1); - + const uint32_t levelInvocationIndex = levelWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); const uint32_t lastLevel = params.topLevel << 1u; + + // pseudoLevel is the level index going up until toplevel then back down const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; @@ -35,48 +35,46 @@ namespace scan Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? 
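 // Illustration (not part of the patch, assuming an example topLevel of 2): lastLevel = topLevel<<1 = 4
 // and the treeLevel -> pseudoLevel fold walks back down past the top of the reduction tree:
 //   treeLevel   : 0 1 2 3 4
 //   pseudoLevel : 0 1 2 1 0
 // so each downsweep level reuses the temporaryStorageOffset slot of its mirrored upsweep level in getData/setData.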
if(inRange) { - getData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel); + getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel); } if(treeLevel < params.topLevel) { - data = workgroup::reduction::template __call(data,accessor); + data = workgroup::reduction::template __call(data,accessor); } else if (!isExclusive && params.topLevel == 0u) { - data = workgroup::inclusive_scan::template __call(data,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else if (treeLevel != params.topLevel) { - data = workgroup::inclusive_scan::template __call(data,accessor); + data = workgroup::inclusive_scan::template __call(data,accessor); } else { - data = workgroup::exclusive_scan::template __call(data,accessor); + data = workgroup::exclusive_scan::template __call(data,accessor); } - setData(data, levelInvocationIndex, localWorkgroupIndex, treeLevel, pseudoLevel, inRange); + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); } DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - template + template void main(NBL_REF_ARG(Accessor) accessor) { + const Parameters_t params = getParameters(); const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - const uint32_t topLevel = getParameters().topLevel; + Scheduler scheduler = Scheduler::create(params, schedulerParams); // persistent workgroups while (true) { - // REVIEW: Need to create accessor here. - // REVIEW: Regarding ItemsPerWG this must probably be calculated after each getWork call? - uint32_t treeLevel,localWorkgroupIndex; - if (scheduler::getWork(schedulerParams, topLevel, treeLevel, localWorkgroupIndex, accessor)) + if (!scheduler.getWork(accessor)) { return; } - virtualWorkgroup(treeLevel, localWorkgroupIndex, accessor); - - scheduler::markComplete(schedulerParams,topLevel,treeLevel,localWorkgroupIndex); + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + scheduler.markDone(accessor); } } } diff --git a/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl b/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl index 97ef1f7dd0..ac5c889ad0 100644 --- a/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl +++ b/include/nbl/builtin/hlsl/workgroup/broadcast.hlsl @@ -35,7 +35,7 @@ T Broadcast(NBL_CONST_REF_ARG(T) val, NBL_REF_ARG(Accessor) accessor, const uint template T BroadcastFirst(NBL_CONST_REF_ARG(T) val, NBL_REF_ARG(Accessor) accessor) { - return Broadcast(val,0); + return Broadcast(val, accessor, 0); } } diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index ff2250e544..1868d14f77 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -212,14 +212,25 @@ class CScanner final : public core::IReferenceCounted { SchedulerParameters() { - std::fill_n(finishedFlagOffset,Parameters::MaxScanLevels-1,0u); + //std::fill_n(finishedFlagOffset,Parameters::MaxScanLevels-1,0u); std::fill_n(cumulativeWorkgroupCount,Parameters::MaxScanLevels,0u); + std::fill_n(workgroupFinishFlagsOffset, Parameters::MaxScanLevels, 0u); + std::fill_n(lastWorkgroupSetCountForLevel,Parameters::MaxScanLevels,0u); } // given the number of elements and workgroup size, figure out how many atomics we need // also account for the fact that we will want to use the same scratch buffer both 
for the // scheduler's atomics and the aux data storage SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() { + /*For 696969 + We will launch 1362 WGs + CumulativeWGCountPerLvl will be : + [1362 + 3 + 1] + WGFinFlagOFfsets + inclusive_scan([0, roundUp(1362 / 512), roundUp(3 / 512)]) + LastWorkgroupSetCountForLevel + [1362 & (WGSZ - 1), 3 & (WGSZ - 1), 1 & (WGSZ - 1)]*/ + outScanParams = Parameters(_elementCount,workgroupSize); const auto topLevel = outScanParams.topLevel; @@ -228,16 +239,24 @@ class CScanner final : public core::IReferenceCounted cumulativeWorkgroupCount[i] += 1u; std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); - std::copy_n(cumulativeWorkgroupCount+1u,topLevel,finishedFlagOffset); + for (auto i = 0u; i <= topLevel; i++) { + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; + lastWorkgroupSetCountForLevel[i] = cumulativeWorkgroupCount[i] & (workgroupSize - 1u); + } + + /*std::copy_n(cumulativeWorkgroupCount+1u,topLevel,finishedFlagOffset); std::copy_n(cumulativeWorkgroupCount+topLevel,topLevel,finishedFlagOffset+topLevel); const auto finishedFlagCount = sizeof(finishedFlagOffset)/sizeof(uint32_t); const auto finishedFlagsSize = std::accumulate(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,0u); - std::exclusive_scan(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,finishedFlagOffset,0u); + std::exclusive_scan(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,finishedFlagOffset,0u);*/ + + const auto wgFinishedFlagsSize = std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxScanLevels, 0u); + std::exclusive_scan(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxScanLevels, workgroupFinishFlagsOffset, 0u); for (auto i=0u; icreateDescriptorSetLayout(bindings); + assert(m_ds_layout && "CScanner Descriptor Set Layout was not created successfully"); m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); + assert(m_pipeline_layout && "CScanner Pipeline Layout was not created successfully"); } // From 44edb59cccd76edd78ae335f9c052af62f71ca1a Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 20 May 2024 04:41:16 +0300 Subject: [PATCH 10/25] Fix issues when WG > 1 --- include/nbl/builtin/hlsl/scan/scheduler.hlsl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl index 9aad4f0121..28f3f31d09 100644 --- a/include/nbl/builtin/hlsl/scan/scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -32,6 +32,11 @@ struct Scheduler template bool getWork(NBL_REF_ARG(Accessor) accessor) { + +glsl::memoryBarrierBuffer(); +if(scanScratchBuf[0u].workgroupsStarted >= schedParams.cumulativeWorkgroupCount[level]) + return false; + if (workgroup::SubgroupContiguousIndex()==0u) { uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted,1u); @@ -54,8 +59,10 @@ struct Scheduler { // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; + const uint32_t lastWorkgroupSetIndexForLevel = (level == 0u ? 
schedParams.cumulativeWorkgroupCount[level] : (schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u])) / WorkgroupSize; const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u) + 1u; - if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + if ((uberWorkgroupIndex < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) { level++; } From cdcc5d0ea55472a682d6dcd8095a6404f97e09d1 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 2 Jun 2024 12:58:49 +0300 Subject: [PATCH 11/25] Change .gitmodules examples to point to fork --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 20441ba171..478f7fa2f4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -69,7 +69,7 @@ branch = main [submodule "examples_tests"] path = examples_tests - url = git@github.com:Devsh-Graphics-Programming/Nabla-Examples-and-Tests.git + url = git@github.com:kpentaris/Nabla-Examples-and-Tests.git [submodule "3rdparty/dxc/dxc"] path = 3rdparty/dxc/dxc url = git@github.com:Devsh-Graphics-Programming/DirectXShaderCompiler.git From 1565fb82090995957b466accdba902fdd7950428 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 2 Jun 2024 13:06:07 +0300 Subject: [PATCH 12/25] update .gitmodules to also point to branch for examples submodule --- .gitmodules | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitmodules b/.gitmodules index 478f7fa2f4..3a54465877 100644 --- a/.gitmodules +++ b/.gitmodules @@ -70,6 +70,7 @@ [submodule "examples_tests"] path = examples_tests url = git@github.com:kpentaris/Nabla-Examples-and-Tests.git + branch = master [submodule "3rdparty/dxc/dxc"] path = 3rdparty/dxc/dxc url = git@github.com:Devsh-Graphics-Programming/DirectXShaderCompiler.git From 734e84a6aa805365318af9cadf04f41e22dc979b Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 10 Jun 2024 01:58:44 +0300 Subject: [PATCH 13/25] CReduce implementation --- .../nbl/builtin/hlsl/scan/declarations.hlsl | 16 +- .../builtin/hlsl/scan/default_scheduler.hlsl | 4 +- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 10 +- include/nbl/builtin/hlsl/scan/direct.hlsl | 2 +- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 12 +- include/nbl/video/utilities/CArithmeticOps.h | 286 ++++++++++++++++++ include/nbl/video/utilities/CReduce.h | 36 +++ include/nbl/video/utilities/CScanner.h | 270 +---------------- include/nbl/video/utilities/IUtilities.h | 8 + src/nbl/CMakeLists.txt | 1 + src/nbl/video/utilities/CReduce.cpp | 56 ++++ src/nbl/video/utilities/CScanner.cpp | 119 +++----- 12 files changed, 470 insertions(+), 350 deletions(-) create mode 100644 include/nbl/video/utilities/CArithmeticOps.h create mode 100644 include/nbl/video/utilities/CReduce.h create mode 100644 src/nbl/video/utilities/CReduce.cpp diff --git a/include/nbl/builtin/hlsl/scan/declarations.hlsl b/include/nbl/builtin/hlsl/scan/declarations.hlsl index 699d345f15..6e6c0f5563 100644 --- a/include/nbl/builtin/hlsl/scan/declarations.hlsl +++ b/include/nbl/builtin/hlsl/scan/declarations.hlsl @@ -9,8 +9,8 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" -#ifndef 
NBL_BUILTIN_MAX_SCAN_LEVELS -#define NBL_BUILTIN_MAX_SCAN_LEVELS 7 +#ifndef NBL_BUILTIN_MAX_LEVELS +#define NBL_BUILTIN_MAX_LEVELS 7 #endif namespace nbl @@ -21,18 +21,18 @@ namespace scan { // REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules) struct Parameters_t { - uint32_t lastElement[NBL_BUILTIN_MAX_SCAN_LEVELS/2+1]; + uint32_t lastElement[NBL_BUILTIN_MAX_LEVELS/2+1]; uint32_t topLevel; - uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2]; + uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_LEVELS/2]; }; Parameters_t getParameters(); struct DefaultSchedulerParameters_t { - uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS]; - uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_SCAN_LEVELS]; - uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_SCAN_LEVELS]; + uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_LEVELS]; + uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_LEVELS]; + uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_LEVELS]; }; @@ -47,7 +47,7 @@ namespace scan NBL_CONST_REF_ARG(uint32_t) pseudoLevel ); - template + template void setData( NBL_CONST_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, diff --git a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl index 4ffe926a4b..7f341b4cc2 100644 --- a/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/default_scheduler.hlsl @@ -47,7 +47,7 @@ namespace scheduler _scanParams.lastElement[0] = elementCount-1u; _scanParams.topLevel = firstbithigh(_scanParams.lastElement[0]) / workgroupSizeLog2; - for (uint32_t i=0; i>workgroupSizeLog2; @@ -100,7 +100,7 @@ namespace scheduler break; default: break; -#if NBL_BUILTIN_MAX_SCAN_LEVELS>7 +#if NBL_BUILTIN_MAX_LEVELS>7 #error "Switch needs more cases" #endif } diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 60ea6a771a..c46b2d38b3 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -76,7 +76,7 @@ void getData( } } -template +template void setData( NBL_CONST_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, @@ -95,13 +95,19 @@ void setData( } else if (inRange) { - if (bool(pseudoLevel)) + if (!isScan && treeLevel == params.topLevel) + { + scanBuffer[levelInvocationIndex] = data; + } + else if (bool(pseudoLevel)) { const uint32_t offset = params.temporaryStorageOffset[pseudoLevel-1u]; scanScratchBuf[0].data[levelInvocationIndex+offset] = data; } else + { scanBuffer[levelInvocationIndex] = data; + } } } diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl index f7a6d0fe5b..e7bbcf7233 100644 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ b/include/nbl/builtin/hlsl/scan/direct.hlsl @@ -88,5 +88,5 @@ DefaultSchedulerParameters_t getSchedulerParameters() [numthreads(WORKGROUP_SIZE,1,1)] void main() { - nbl::hlsl::scan::main, Storage_t, true, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor); + nbl::hlsl::scan::main, Storage_t, IS_SCAN, IS_EXCLUSIVE, uint16_t(WORKGROUP_SIZE), WGScratchProxy<0> >(accessor); } \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 
638593ae42..72434d2286 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -13,7 +13,7 @@ namespace hlsl { namespace scan { - template + template void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); @@ -38,7 +38,9 @@ namespace scan getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel); } - if(treeLevel < params.topLevel) + bool doReduce = isScan ? treeLevel < params.topLevel : treeLevel <= params.topLevel; + + if(doReduce) { data = workgroup::reduction::template __call(data,accessor); } @@ -54,11 +56,11 @@ namespace scan { data = workgroup::exclusive_scan::template __call(data,accessor); } - setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); } DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components - template + template void main(NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); @@ -72,7 +74,7 @@ namespace scan return; } - virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); accessor.workgroupExecutionAndMemoryBarrier(); scheduler.markDone(accessor); } diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h new file mode 100644 index 0000000000..c57f18839d --- /dev/null +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -0,0 +1,286 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_VIDEO_C_ARITHMETIC_OPS_H_INCLUDED_ +#define _NBL_VIDEO_C_ARITHMETIC_OPS_H_INCLUDED_ + + +#include "nbl/video/IPhysicalDevice.h" +#include "nbl/video/utilities/IDescriptorSetCache.h" + +#include "nbl/builtin/hlsl/scan/declarations.hlsl" + + +namespace nbl::video +{ +class CArithmeticOps : public core::IReferenceCounted +{ + public: + // Only 4 byte wide data types supported due to need to trade the via shared memory, + // different combinations of data type and operator have different identity elements. 
+ // `EDT_INT` and `EO_MIN` will have `INT_MAX` as identity, while `EDT_UINT` would have `UINT_MAX` + enum E_DATA_TYPE : uint8_t + { + EDT_UINT=0u, + EDT_INT, + EDT_FLOAT, + EDT_COUNT + }; + enum E_OPERATOR : uint8_t + { + EO_AND = 0u, + EO_XOR, + EO_OR, + EO_ADD, + EO_MUL, + EO_MIN, + EO_MAX, + EO_COUNT + }; + + // This struct is only for managing where to store intermediate results of the scans + struct Parameters : nbl::hlsl::scan::Parameters_t // this struct and its methods are also available in HLSL + { + static inline constexpr uint32_t MaxLevels = NBL_BUILTIN_MAX_LEVELS; + + Parameters() + { + std::fill_n(lastElement,MaxLevels/2+1,0u); + std::fill_n(temporaryStorageOffset,MaxLevels/2,0u); + } + // build the constant tables for each level given the number of elements to scan and workgroupSize + Parameters(const uint32_t _elementCount, const uint32_t workgroupSize) : Parameters() + { + assert(_elementCount!=0u && "Input element count can't be 0!"); + const auto maxReductionLog2 = hlsl::findMSB(workgroupSize)*(MaxLevels/2u+1u); + assert(maxReductionLog2>=32u||((_elementCount-1u)>>maxReductionLog2)==0u && "Can't scan this many elements with such small workgroups!"); + + lastElement[0u] = _elementCount-1u; + for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) + temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = lastElement[topLevel]/workgroupSize; + + std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); + } + // given already computed tables of lastElement indices per level, number of levels, and storage offsets, tell us total auxillary buffer size needed + inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) + { + uint32_t uint_count = 1u; // workgroup enumerator + uint_count += temporaryStorageOffset[MaxLevels/2u-1u]; // last scratch offset + uint_count += lastElement[topLevel]+1u; // and its size + return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); + } + }; + + // the default scheduler we provide works as described above in the big documentation block + struct SchedulerParameters : nbl::hlsl::scan::DefaultSchedulerParameters_t // this struct and its methods are also available in HLSL + { + SchedulerParameters() + { + std::fill_n(cumulativeWorkgroupCount,Parameters::MaxLevels,0u); + std::fill_n(workgroupFinishFlagsOffset, Parameters::MaxLevels, 0u); + std::fill_n(lastWorkgroupSetCountForLevel,Parameters::MaxLevels,0u); + } + + // given the number of elements and workgroup size, figure out how many atomics we need + // also account for the fact that we will want to use the same scratch buffer both for the + // scheduler's atomics and the aux data storage + SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() + { + outScanParams = Parameters(_elementCount,workgroupSize); + const auto topLevel = outScanParams.topLevel; + + std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); + for (auto i=0u; i<=topLevel; i++) + cumulativeWorkgroupCount[i] += 1u; + std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); + + for (auto i = 0u; i <= topLevel; i++) { + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; + lastWorkgroupSetCountForLevel[i] = cumulativeWorkgroupCount[i] & (workgroupSize - 1u); + } + + const auto wgFinishedFlagsSize = 
std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, 0u); + std::exclusive_scan(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, workgroupFinishFlagsOffset, 0u); + for (auto i=0u; i&& device) : CArithmeticOps(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + + CArithmeticOps(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) + { + assert(core::isPoT(m_workgroupSize)); + const asset::SPushConstantRange pc_range[] = {asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; + const IGPUDescriptorSetLayout::SBinding bindings[2] = { + { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer + { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch + }; + + m_ds_layout = m_device->createDescriptorSetLayout(bindings); + assert(m_ds_layout && "CArithmeticOps Descriptor Set Layout was not created successfully"); + m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); + assert(m_pipeline_layout && "CArithmeticOps Pipeline Layout was not created successfully"); + } + + inline auto getDefaultDescriptorSetLayout() const { return m_ds_layout.get(); } + + inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } + + inline uint32_t getWorkgroupSize() const {return m_workgroupSize;} + + inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo) + { + pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize); + dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize); + } + + static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange& input_range, const asset::SBufferRange& scratch_range) + { + IGPUDescriptorSet::SDescriptorInfo infos[2]; + infos[0].desc = input_range.buffer; + infos[0].info.buffer = {input_range.offset,input_range.size}; + infos[1].desc = scratch_range.buffer; + infos[1].info.buffer = {scratch_range.offset,scratch_range.size}; + + video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; + for (auto i=0u; i<2u; i++) + { + writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i }; + } + + device->updateDescriptorSets(2, writes, 0u, nullptr); + } + + static inline void dispatchHelper( + IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* pipeline_layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, + const uint32_t srcBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* srcBufferBarriers, + const uint32_t dstBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* dstBufferBarriers + ) + { + cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); + if (srcBufferBarrierCount) + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { srcBufferBarriers, 1} }; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + } + cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); + if 
(srcBufferBarrierCount) + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { dstBufferBarriers, 1} }; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + } + } + + inline core::smart_refctd_ptr createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const + { + auto system = m_device->getPhysicalDevice()->getSystem(); + core::smart_refctd_ptr hlsl; + { + auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr + { + nbl::system::ISystem::future_t> future; + system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE); + if (future.wait()) + return future.copy(); + return nullptr; + }; + + // hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect_reduce.hlsl"); ?? + hlsl = loadBuiltinData(shaderFile); + } + + auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); + memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + + // REVIEW: Probably combine the commons parts of CReduce and CScanner + const char* storageType = nullptr; + switch (dataType) + { + case EDT_UINT: + storageType = "uint32_t"; + break; + case EDT_INT: + storageType = "int32_t"; + break; + case EDT_FLOAT: + storageType = "float32_t"; + break; + default: + assert(false); + break; + } + + const char* binop = nullptr; + switch (op) + { + case EO_AND: + binop = "bit_and"; + break; + case EO_OR: + binop = "bit_or"; + break; + case EO_XOR: + binop = "bit_xor"; + break; + case EO_ADD: + binop = "plus"; + break; + case EO_MUL: + binop = "multiplies"; + break; + case EO_MAX: + binop = "maximum"; + break; + case EO_MIN: + binop = "minimum"; + break; + default: + assert(false); + break; + } + + return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", m_workgroupSize, storageType, binop, scratchSz); + } + + inline ILogicalDevice* getDevice() const {return m_device.get();} + protected: + // REVIEW: Does it need an empty destructor? + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_ds_layout; + core::smart_refctd_ptr m_pipeline_layout; + + const uint32_t m_workgroupSize; +}; + +} + +#endif \ No newline at end of file diff --git a/include/nbl/video/utilities/CReduce.h b/include/nbl/video/utilities/CReduce.h new file mode 100644 index 0000000000..2b4e1a6374 --- /dev/null +++ b/include/nbl/video/utilities/CReduce.h @@ -0,0 +1,36 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_VIDEO_C_REDUCE_H_INCLUDED_ +#define _NBL_VIDEO_C_REDUCE_H_INCLUDED_ + +#include "nbl/video/utilities/CArithmeticOps.h" + +namespace nbl::video +{ + + class CReduce final : public CArithmeticOps + { + + public: + CReduce(core::smart_refctd_ptr&& device) : CReduce(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + CReduce(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} + asset::ICPUShader* getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); + IGPUShader* getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); + IGPUComputePipeline* getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); + core::smart_refctd_ptr createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) const; + + protected: + ~CReduce() + { + // all drop themselves automatically + } + + core::smart_refctd_ptr m_shaders[EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EDT_COUNT][EO_COUNT]; + core::smart_refctd_ptr m_pipelines[EDT_COUNT][EO_COUNT]; + }; +} + +#endif \ No newline at end of file diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index 1868d14f77..bef690f50c 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -8,9 +8,10 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/utilities/IDescriptorSetCache.h" +#include "nbl/video/utilities/CArithmeticOps.h" #include "nbl/builtin/hlsl/scan/declarations.hlsl" -static_assert(NBL_BUILTIN_MAX_SCAN_LEVELS & 0x1, "NBL_BUILTIN_MAX_SCAN_LEVELS must be odd!"); +static_assert(NBL_BUILTIN_MAX_LEVELS & 0x1, "NBL_BUILTIN_MAX_LEVELS must be odd!"); namespace nbl::video { @@ -141,8 +142,7 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about Console devs get to bring a gun to a knife fight... **/ -#if 1 // legacy & port to HLSL -class CScanner final : public core::IReferenceCounted +class CScanner final : public CArithmeticOps { public: enum E_SCAN_TYPE : uint8_t @@ -153,256 +153,20 @@ class CScanner final : public core::IReferenceCounted EST_EXCLUSIVE, EST_COUNT }; - // Only 4 byte wide data types supported due to need to trade the via shared memory, - // different combinations of data type and operator have different identity elements. 
- // `EDT_INT` and `EO_MIN` will have `INT_MAX` as identity, while `EDT_UINT` would have `UINT_MAX` - enum E_DATA_TYPE : uint8_t - { - EDT_UINT=0u, - EDT_INT, - EDT_FLOAT, - EDT_COUNT - }; - enum E_OPERATOR : uint8_t - { - EO_AND = 0u, - EO_XOR, - EO_OR, - EO_ADD, - EO_MUL, - EO_MIN, - EO_MAX, - EO_COUNT - }; - - // This struct is only for managing where to store intermediate results of the scans - struct Parameters : nbl::hlsl::scan::Parameters_t // this struct and its methods are also available in GLSL so you can launch indirect dispatches - { - static inline constexpr uint32_t MaxScanLevels = NBL_BUILTIN_MAX_SCAN_LEVELS; - - Parameters() - { - std::fill_n(lastElement,MaxScanLevels/2+1,0u); - std::fill_n(temporaryStorageOffset,MaxScanLevels/2,0u); - } - // build the constant tables for each level given the number of elements to scan and workgroupSize - Parameters(const uint32_t _elementCount, const uint32_t workgroupSize) : Parameters() - { - assert(_elementCount!=0u && "Input element count can't be 0!"); - const auto maxReductionLog2 = hlsl::findMSB(workgroupSize)*(MaxScanLevels/2u+1u); - assert(maxReductionLog2>=32u||((_elementCount-1u)>>maxReductionLog2)==0u && "Can't scan this many elements with such small workgroups!"); - - lastElement[0u] = _elementCount-1u; - for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) - temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = lastElement[topLevel]/workgroupSize; - - std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); - } - // given already computed tables of lastElement indices per level, number of levels, and storage offsets, tell us total auxillary buffer size needed - inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) - { - uint32_t uint_count = 1u; // workgroup enumerator - uint_count += temporaryStorageOffset[MaxScanLevels/2u-1u]; // last scratch offset - uint_count += lastElement[topLevel]+1u; // and its size - return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); - } - }; - // the default scheduler we provide works as described above in the big documentation block - struct SchedulerParameters : nbl::hlsl::scan::DefaultSchedulerParameters_t // this struct and its methods are also available in GLSL so you can launch indirect dispatches - { - SchedulerParameters() - { - //std::fill_n(finishedFlagOffset,Parameters::MaxScanLevels-1,0u); - std::fill_n(cumulativeWorkgroupCount,Parameters::MaxScanLevels,0u); - std::fill_n(workgroupFinishFlagsOffset, Parameters::MaxScanLevels, 0u); - std::fill_n(lastWorkgroupSetCountForLevel,Parameters::MaxScanLevels,0u); - } - // given the number of elements and workgroup size, figure out how many atomics we need - // also account for the fact that we will want to use the same scratch buffer both for the - // scheduler's atomics and the aux data storage - SchedulerParameters(Parameters& outScanParams, const uint32_t _elementCount, const uint32_t workgroupSize) : SchedulerParameters() - { - /*For 696969 - We will launch 1362 WGs - CumulativeWGCountPerLvl will be : - [1362 + 3 + 1] - WGFinFlagOFfsets - inclusive_scan([0, roundUp(1362 / 512), roundUp(3 / 512)]) - LastWorkgroupSetCountForLevel - [1362 & (WGSZ - 1), 3 & (WGSZ - 1), 1 & (WGSZ - 1)]*/ - - outScanParams = Parameters(_elementCount,workgroupSize); - const auto topLevel = outScanParams.topLevel; - - std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); - for (auto i=0u; i<=topLevel; i++) - 
cumulativeWorkgroupCount[i] += 1u; - std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); - - for (auto i = 0u; i <= topLevel; i++) { - workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; - lastWorkgroupSetCountForLevel[i] = cumulativeWorkgroupCount[i] & (workgroupSize - 1u); - } - - /*std::copy_n(cumulativeWorkgroupCount+1u,topLevel,finishedFlagOffset); - std::copy_n(cumulativeWorkgroupCount+topLevel,topLevel,finishedFlagOffset+topLevel); - - const auto finishedFlagCount = sizeof(finishedFlagOffset)/sizeof(uint32_t); - const auto finishedFlagsSize = std::accumulate(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,0u); - std::exclusive_scan(finishedFlagOffset,finishedFlagOffset+finishedFlagCount,finishedFlagOffset,0u);*/ - - const auto wgFinishedFlagsSize = std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxScanLevels, 0u); - std::exclusive_scan(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxScanLevels, workgroupFinishFlagsOffset, 0u); - for (auto i=0u; i&& device) : CScanner(std::move(device),core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} - // - CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) - { - assert(core::isPoT(m_workgroupSize)); - const asset::SPushConstantRange pc_range[] = {asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; - const IGPUDescriptorSetLayout::SBinding bindings[2] = { - { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer - { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch - }; - - m_ds_layout = m_device->createDescriptorSetLayout(bindings); - assert(m_ds_layout && "CScanner Descriptor Set Layout was not created successfully"); - m_pipeline_layout = m_device->createPipelineLayout({ pc_range }, core::smart_refctd_ptr(m_ds_layout)); - assert(m_pipeline_layout && "CScanner Pipeline Layout was not created successfully"); - } - - // - inline auto getDefaultDescriptorSetLayout() const { return m_ds_layout.get(); } - - // - inline auto getDefaultPipelineLayout() const { return m_pipeline_layout.get(); } - + // You need to override this shader with your own defintion of `nbl_glsl_scan_getIndirectElementCount` for it to even compile, so we always give you a new shader - core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const - { - return createShader(true,scanType,dataType,op,scratchSz); - } + //core::smart_refctd_ptr getIndirectShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const + //{ + // return createShader(true,scanType,dataType,op,scratchSz); + //} // - inline asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) - { - if (!m_shaders[scanType][dataType][op]) - m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz); - return m_shaders[scanType][dataType][op].get(); - } - // - inline IGPUShader* 
getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) - { - if (!m_specialized_shaders[scanType][dataType][op]) - { - auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op,scratchSz)); - cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); - cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); - - auto gpushader = m_device->createShader(cpuShader.get()); - - m_specialized_shaders[scanType][dataType][op] = gpushader; - } - return m_specialized_shaders[scanType][dataType][op].get(); - } - - // - inline auto getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) - { - // ondemand - if (!m_pipelines[scanType][dataType][op]) { - IGPUComputePipeline::SCreationParams params = {}; - params.layout = m_pipeline_layout.get(); - // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") - params.shader.entryPoint = "main"; - params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op, scratchSz); - - m_device->createComputePipelines( - nullptr, { ¶ms,1 }, - & m_pipelines[scanType][dataType][op] - ); - } - return m_pipelines[scanType][dataType][op].get(); - } - - // - inline uint32_t getWorkgroupSize() const {return m_workgroupSize;} - - // - inline void buildParameters(const uint32_t elementCount, DefaultPushConstants& pushConstants, DispatchInfo& dispatchInfo) - { - pushConstants.schedulerParams = SchedulerParameters(pushConstants.scanParams,elementCount,m_workgroupSize); - dispatchInfo = DispatchInfo(m_device->getPhysicalDevice()->getLimits(),elementCount,m_workgroupSize); - } - - // - static inline void updateDescriptorSet(ILogicalDevice* device, IGPUDescriptorSet* set, const asset::SBufferRange& input_range, const asset::SBufferRange& scratch_range) - { - IGPUDescriptorSet::SDescriptorInfo infos[2]; - infos[0].desc = input_range.buffer; - infos[0].info.buffer = {input_range.offset,input_range.size}; - infos[1].desc = scratch_range.buffer; - infos[1].info.buffer = {scratch_range.offset,scratch_range.size}; - - video::IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (auto i=0u; i<2u; i++) - { - writes[i] = { .dstSet = set,.binding = i,.arrayElement = 0u,.count = 1u,.info = infos+i }; - } - - device->updateDescriptorSets(2, writes, 0u, nullptr); - } - - // Half and sizeof(uint32_t) of the scratch buffer need to be cleared to 0s - static inline void dispatchHelper( - IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* pipeline_layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, - const uint32_t srcBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* srcBufferBarriers, - const uint32_t dstBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* dstBufferBarriers - ) - { - cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { srcBufferBarriers, 1} }; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { dstBufferBarriers, 1} }; - 
cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - } - - inline ILogicalDevice* getDevice() const {return m_device.get();} - + CScanner(core::smart_refctd_ptr&& device) : CScanner(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} + CScanner(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} + asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + IGPUComputePipeline* getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); + core::smart_refctd_ptr createShader(const char* shaderFile, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; protected: ~CScanner() { @@ -411,16 +175,10 @@ class CScanner final : public core::IReferenceCounted core::smart_refctd_ptr createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; - - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_ds_layout; - core::smart_refctd_ptr m_pipeline_layout; core::smart_refctd_ptr m_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr m_pipelines[EST_COUNT][EDT_COUNT][EO_COUNT]; - const uint32_t m_workgroupSize; }; -#endif } #endif diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index baf821f7a2..321058cf50 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -11,6 +11,7 @@ #include "nbl/video/alloc/StreamingTransientDataBuffer.h" #include "nbl/video/utilities/SIntendedSubmitInfo.h" #include "nbl/video/utilities/CPropertyPoolHandler.h" +#include "nbl/video/utilities/CReduce.h" #include "nbl/video/utilities/CScanner.h" #include "nbl/video/utilities/CComputeBlit.h" @@ -134,6 +135,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted // TODO: investigate whether we need to clamp against 256u instead of 128u on mobile #endif const auto scan_workgroup_size = core::max(core::roundDownToPoT(limits.maxWorkgroupSize[0]) >> 1u, 128u); + m_reducer = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device), scan_workgroup_size); m_scanner = core::make_smart_refctd_ptr(core::smart_refctd_ptr(m_device), scan_workgroup_size); } @@ -161,6 +163,11 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return m_propertyPoolHandler.get(); } #endif + virtual CReduce* getDefaultReducer() const + { + return m_reducer.get(); + } + virtual CScanner* getDefaultScanner() const { return m_scanner.get(); @@ -721,6 +728,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted #if 0 // TODO: port core::smart_refctd_ptr m_propertyPoolHandler; #endif + core::smart_refctd_ptr m_reducer; core::smart_refctd_ptr m_scanner; }; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index b04188d243..04d9b654a6 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -266,6 +266,7 @@ set(NBL_VIDEO_SOURCES ${NBL_ROOT_PATH}/src/nbl/video/utilities/IPropertyPool.cpp 
${NBL_ROOT_PATH}/src/nbl/video/utilities/IUtilities.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/CPropertyPoolHandler.cpp + ${NBL_ROOT_PATH}/src/nbl/video/utilities/CReduce.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/CScanner.cpp ${NBL_ROOT_PATH}/src/nbl/video/utilities/CComputeBlit.cpp diff --git a/src/nbl/video/utilities/CReduce.cpp b/src/nbl/video/utilities/CReduce.cpp new file mode 100644 index 0000000000..bc9f745207 --- /dev/null +++ b/src/nbl/video/utilities/CReduce.cpp @@ -0,0 +1,56 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/utilities/CReduce.h" + +namespace nbl::video +{ + +asset::ICPUShader* CReduce::getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +{ + if (!m_shaders[dataType][op]) + m_shaders[dataType][op] = createShader(dataType,op,scratchSz); + return m_shaders[dataType][op].get(); +} + +IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +{ + if (!m_specialized_shaders[dataType][op]) + { + auto cpuShader = core::smart_refctd_ptr(getDefaultShader(dataType,op,scratchSz)); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/reduce.hlsl"); + cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); + + auto gpushader = m_device->createShader(cpuShader.get()); + + m_specialized_shaders[dataType][op] = gpushader; + } + return m_specialized_shaders[dataType][op].get(); +} + +IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +{ + // ondemand + if (!m_pipelines[dataType][op]) { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pipeline_layout.get(); + // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") + params.shader.entryPoint = "main"; + params.shader.shader = getDefaultSpecializedShader(dataType,op,scratchSz); + + m_device->createComputePipelines( + nullptr, { ¶ms,1 }, + & m_pipelines[dataType][op] + ); + } + return m_pipelines[dataType][op].get(); +} + +core::smart_refctd_ptr CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) const +{ + core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); + return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", "false", "false"); +} + +} \ No newline at end of file diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index 06dfd076e3..3c6c4088a3 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -1,85 +1,52 @@ #include "nbl/video/utilities/CScanner.h" -using namespace nbl; -using namespace video; +namespace nbl::video +{ + +asset::ICPUShader* CScanner::getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) +{ + if (!m_shaders[scanType][dataType][op]) + m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz); + return m_shaders[scanType][dataType][op].get(); +} 
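As a quick reference for what these factory methods feed the compiler: a reduction shader built through CReduce::createShader gets the common defines from CArithmeticOps::createBaseShader plus the reduce-specific ones, all layered onto nbl/builtin/hlsl/scan/direct.hlsl via createOverridenCopy. A sketch of the resulting macro set, assuming an illustrative uint-add reduction with a 512-wide workgroup and a scratch size of 1376 uints (all three values are example numbers, not taken from the patch):

    #define WORKGROUP_SIZE 512
    typedef uint32_t Storage_t;
    #define BINOP nbl::hlsl::plus
    #define SCRATCH_SZ 1376
    #define IS_EXCLUSIVE false
    #define IS_SCAN false

A scan built through CScanner::createShader below differs only in IS_SCAN being true and IS_EXCLUSIVE following the requested E_SCAN_TYPE.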
-core::smart_refctd_ptr CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const +IGPUShader* CScanner::getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { - auto system = m_device->getPhysicalDevice()->getSystem(); - core::smart_refctd_ptr hlsl; - { - auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr - { - nbl::system::ISystem::future_t> future; - system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE); - if (future.wait()) - return future.copy(); - return nullptr; - }; + if (!m_specialized_shaders[scanType][dataType][op]) + { + auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op,scratchSz)); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); + cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); - if(indirect) - hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect.hlsl"); - else - hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/direct.hlsl"); - } - auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); - memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); - auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + auto gpushader = m_device->createShader(cpuShader.get()); - // (REVIEW): All of the below, that rely on enumerations, should probably be changed to take advantage of HLSL-CPP compatibility. - // Issue is that CScanner caches all shaders for all permutations of scanType-dataType-operator and it's not clear how to - // do this without enums - const char* storageType = nullptr; - switch (dataType) - { - case EDT_UINT: - storageType = "uint32_t"; - break; - case EDT_INT: - storageType = "int32_t"; - break; - case EDT_FLOAT: - storageType = "float32_t"; - break; - default: - assert(false); - break; - } + m_specialized_shaders[scanType][dataType][op] = gpushader; + } + return m_specialized_shaders[scanType][dataType][op].get(); +} - const char* isExclusive = scanType == EST_EXCLUSIVE ? "true" : "false"; +IGPUComputePipeline* CScanner::getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) +{ + // ondemand + if (!m_pipelines[scanType][dataType][op]) { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pipeline_layout.get(); + // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") + params.shader.entryPoint = "main"; + params.shader.shader = getDefaultSpecializedShader(scanType, dataType, op, scratchSz); + + m_device->createComputePipelines( + nullptr, { ¶ms,1 }, + & m_pipelines[scanType][dataType][op] + ); + } + return m_pipelines[scanType][dataType][op].get(); +} + +core::smart_refctd_ptr CScanner::createShader(const char* shaderFile, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const +{ + core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); + return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", scanType == EST_EXCLUSIVE ? 
"true" : "false", "true"); +} - const char* binop = nullptr; - switch (op) - { - case EO_AND: - binop = "bit_and"; - break; - case EO_OR: - binop = "bit_or"; - break; - case EO_XOR: - binop = "bit_xor"; - break; - case EO_ADD: - binop = "plus"; - break; - case EO_MUL: - binop = "multiplies"; - break; - case EO_MAX: - binop = "maximum"; - break; - case EO_MIN: - binop = "minimum"; - break; - default: - assert(false); - break; - } - - return asset::CHLSLCompiler::createOverridenCopy( - cpushader.get(), - "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", - m_workgroupSize, storageType, binop, scratchSz - ); } \ No newline at end of file From 40953e3cf07bfa89e9a2be7f4670c7f805b78c78 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Mon, 17 Jun 2024 11:11:47 +0300 Subject: [PATCH 14/25] Fix issues with global reduce algorithm It should now work with all sizes of inputs. --- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 3 +- include/nbl/builtin/hlsl/scan/scheduler.hlsl | 40 ++++++++++++------- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 5 ++- include/nbl/video/utilities/CArithmeticOps.h | 17 ++++---- 4 files changed, 39 insertions(+), 26 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index c46b2d38b3..f59b5cb53f 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -20,7 +20,7 @@ namespace scan template struct Scratch { - uint32_t workgroupsStarted; + uint32_t workgroupsStarted[NBL_BUILTIN_MAX_LEVELS]; uint32_t data[scratchElementCount]; }; @@ -95,6 +95,7 @@ void setData( } else if (inRange) { + // REVIEW: Check if we have to add levelInvocationIndex in offset or levelWorkgroupIndex as above if (!isScan && treeLevel == params.topLevel) { scanBuffer[levelInvocationIndex] = data; diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl index 28f3f31d09..d7a94e1e78 100644 --- a/include/nbl/builtin/hlsl/scan/scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -30,54 +30,64 @@ struct Scheduler } template - bool getWork(NBL_REF_ARG(Accessor) accessor) + bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() { -glsl::memoryBarrierBuffer(); -if(scanScratchBuf[0u].workgroupsStarted >= schedParams.cumulativeWorkgroupCount[level]) - return false; - + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if(scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + if (workgroup::SubgroupContiguousIndex()==0u) { - uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted,1u); + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level],1u); } uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); // nothing really happens, we just check if there's a workgroup in the level to grab - return uberWorkgroupIndex <= schedParams.cumulativeWorkgroupCount[level] - 1u; + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); } template - void markDone(NBL_REF_ARG(Accessor) accessor) + bool markDone(NBL_REF_ARG(Accessor) accessor) { if (level==(params.topLevel)) // check if we reached the lastLevel { + // not even sure this is needed uberWorkgroupIndex = ~0u; - return; + return true; } + uint32_t prevLevel = 
level; if (workgroup::SubgroupContiguousIndex()==0u) { // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; - const uint32_t lastWorkgroupSetIndexForLevel = (level == 0u ? schedParams.cumulativeWorkgroupCount[level] : (schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u])) / WorkgroupSize; - const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u) + 1u; + const uint32_t lastWorkgroupSetIndexForLevel = totalWorkgroupsPerLevel(level) / WorkgroupSize; + const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u); //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) - if ((uberWorkgroupIndex < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + if (((uberWorkgroupIndex/WorkgroupSize) < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) { level++; } } level = workgroup::BroadcastFirst(level, accessor); + return level == 0 ? false : level == prevLevel; // on level 0 never exit early but on higher levels each workgroup is allowed one operation and exits except if promoted to next level } - uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) + uint32_t levelWorkgroupIndex(const in uint32_t level) { - return level == 0u ? uberWorkgroupIndex : (uberWorkgroupIndex - schedParams.cumulativeWorkgroupCount[level-1u]); + return uberWorkgroupIndex; } + uint32_t totalWorkgroupsPerLevel(const in uint32_t level) + { + return level == 0 ? 
schedParams.cumulativeWorkgroupCount[level] : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u]; + } + Parameters_t params; DefaultSchedulerParameters_t schedParams; - uint32_t uberWorkgroupIndex; + uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex uint16_t level; }; diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 72434d2286..9319ca1306 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -76,7 +76,10 @@ namespace scan virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); accessor.workgroupExecutionAndMemoryBarrier(); - scheduler.markDone(accessor); + if(scheduler.markDone(accessor)) + { + return; + } } } } diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index c57f18839d..b066cbff15 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -58,16 +58,16 @@ class CArithmeticOps : public core::IReferenceCounted lastElement[0u] = _elementCount-1u; for (topLevel=0u; lastElement[topLevel]>=workgroupSize;) - temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = lastElement[topLevel]/workgroupSize; + temporaryStorageOffset[topLevel-1u] = lastElement[++topLevel] = std::ceil(lastElement[topLevel] / double(workgroupSize)); std::exclusive_scan(temporaryStorageOffset,temporaryStorageOffset+sizeof(temporaryStorageOffset)/sizeof(uint32_t),temporaryStorageOffset,0u); } - // given already computed tables of lastElement indices per level, number of levels, and storage offsets, tell us total auxillary buffer size needed + // given already computed tables of lastElement indice s per level, number of levels, and storage offsets, tell us total auxillary buffer size needed inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) { - uint32_t uint_count = 1u; // workgroup enumerator + uint32_t uint_count = MaxLevels; // workgroup enumerator uint_count += temporaryStorageOffset[MaxLevels/2u-1u]; // last scratch offset - uint_count += lastElement[topLevel]+1u; // and its size + uint_count += lastElement[topLevel]+1u; // starting from the last storage offset, we also need slots equal to the top level's elementCount (add 1u because 0u based) return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); } }; @@ -91,13 +91,11 @@ class CArithmeticOps : public core::IReferenceCounted const auto topLevel = outScanParams.topLevel; std::copy_n(outScanParams.lastElement+1u,topLevel,cumulativeWorkgroupCount); - for (auto i=0u; i<=topLevel; i++) - cumulativeWorkgroupCount[i] += 1u; std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); - + cumulativeWorkgroupCount[topLevel] = 1u; // the top level will always end up with 1 workgroup to do the final reduction for (auto i = 0u; i <= topLevel; i++) { - workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; - lastWorkgroupSetCountForLevel[i] = cumulativeWorkgroupCount[i] & (workgroupSize - 1u); + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; // RECHECK: findMSB(511) == 8u !! Here we assume it's 9u !! 
+ lastWorkgroupSetCountForLevel[i] = (cumulativeWorkgroupCount[i] - 1u) & (workgroupSize - 1u); } const auto wgFinishedFlagsSize = std::accumulate(workgroupFinishFlagsOffset, workgroupFinishFlagsOffset + Parameters::MaxLevels, 0u); @@ -127,6 +125,7 @@ class CArithmeticOps : public core::IReferenceCounted { constexpr auto workgroupSpinningProtection = 4u; // to prevent first workgroup starving/idling on level 1 after finishing level 0 early wg_count = limits.computeOptimalPersistentWorkgroupDispatchSize(elementCount,workgroupSize,workgroupSpinningProtection); + assert(wg_count >= elementCount / workgroupSize^2 + 1 && "Too few workgroups! The workgroup count must be at least the elementCount/(wgSize^2)"); } uint32_t wg_count; From 3d252d07612d14b625e4fa32557ca27ba5161ff0 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 7 Jul 2024 17:20:19 +0300 Subject: [PATCH 15/25] refactor global reduce to properly work with required global scan changes --- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 17 +-- .../nbl/builtin/hlsl/scan/scan_scheduler.hlsl | 69 ++++++++++ include/nbl/builtin/hlsl/scan/scheduler.hlsl | 123 ++++++++++-------- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 98 ++++++++++---- include/nbl/video/utilities/CArithmeticOps.h | 36 ++--- include/nbl/video/utilities/CReduce.h | 8 +- include/nbl/video/utilities/CScanner.h | 4 +- src/nbl/builtin/CMakeLists.txt | 8 ++ src/nbl/video/utilities/CReduce.cpp | 18 +-- src/nbl/video/utilities/CScanner.cpp | 4 +- 10 files changed, 259 insertions(+), 126 deletions(-) create mode 100644 include/nbl/builtin/hlsl/scan/scan_scheduler.hlsl diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index f59b5cb53f..93d383a7dd 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -17,11 +17,12 @@ namespace hlsl namespace scan { -template +template struct Scratch { + uint32_t reduceResult; uint32_t workgroupsStarted[NBL_BUILTIN_MAX_LEVELS]; - uint32_t data[scratchElementCount]; + uint32_t data[dataElementCount]; }; [[vk::binding(0 ,0)]] RWStructuredBuffer scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?) 
@@ -43,7 +44,7 @@ void getData( if (notFirstOrLastLevel) offset += params.temporaryStorageOffset[pseudoLevel-1u]; - if (pseudoLevel!=treeLevel) // downsweep + if (pseudoLevel!=treeLevel) // downsweep/scan { const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; if (bool(levelWorkgroupIndex) && firstInvocationInGroup) @@ -87,19 +88,19 @@ void setData( ) { const Parameters_t params = getParameters(); - if (treeLevel struct ScanScheduler +{ + + static ScanScheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + ScanScheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } + + template bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() + { + + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if (scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + + if (workgroup::SubgroupContiguousIndex() == 0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level], 1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); + } + + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) { return uberWorkgroupIndex; } + + uint32_t totalWorkgroupsPerLevel(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0u + ? schedParams.cumulativeWorkgroupCount[level] + : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level - 1u]; + } + + Parameters_t params; + DefaultSchedulerParameters_t schedParams; + uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex + uint32_t level; // 32 bit to stop warnings from level = workgroup::BroadcastFirst +}; +} +} +} +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/scan/scheduler.hlsl b/include/nbl/builtin/hlsl/scan/scheduler.hlsl index d7a94e1e78..99effbd88a 100644 --- a/include/nbl/builtin/hlsl/scan/scheduler.hlsl +++ b/include/nbl/builtin/hlsl/scan/scheduler.hlsl @@ -2,8 +2,8 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ -#define _NBL_HLSL_SCAN_SCHEDULER_INCLUDED_ +#ifndef _NBL_HLSL_SCHEDULER_INCLUDED_ +#define _NBL_HLSL_SCHEDULER_INCLUDED_ #include "nbl/builtin/hlsl/scan/declarations.hlsl" #include "nbl/builtin/hlsl/scan/descriptors.hlsl" @@ -20,75 +20,88 @@ namespace scan template struct Scheduler { - - static Scheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) - { - Scheduler scheduler; - scheduler.params = params; - scheduler.schedParams = schedParams; - return scheduler; - } + static Scheduler create(NBL_CONST_REF_ARG(Parameters_t) params, NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) schedParams) + { + Scheduler scheduler; + scheduler.params = params; + scheduler.schedParams = schedParams; + return scheduler; + } - template - bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() - { - - // This part ensures that each workgroup that was not chosen to run the next level - // will exit once all the work for the current level is finished - glsl::memoryBarrierBuffer(); - if(scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) - return false; - - if (workgroup::SubgroupContiguousIndex()==0u) + template + bool getWork(NBL_REF_ARG(Accessor) accessor) // rename to hasWork() { - uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level],1u); + + // This part ensures that each workgroup that was not chosen to run the next level + // will exit once all the work for the current level is finished + glsl::memoryBarrierBuffer(); + if(scanScratchBuf[0u].workgroupsStarted[level] >= totalWorkgroupsPerLevel(level)) + return false; + + if (workgroup::SubgroupContiguousIndex()==0u) + { + uberWorkgroupIndex = glsl::atomicAdd(scanScratchBuf[0u].workgroupsStarted[level],1u); + } + uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); + // nothing really happens, we just check if there's a workgroup in the level to grab + return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); } - uberWorkgroupIndex = workgroup::BroadcastFirst(uberWorkgroupIndex, accessor); - // nothing really happens, we just check if there's a workgroup in the level to grab - return uberWorkgroupIndex < totalWorkgroupsPerLevel(level); - } - template - bool markDone(NBL_REF_ARG(Accessor) accessor) - { - if (level==(params.topLevel)) // check if we reached the lastLevel + template + bool markDone(NBL_REF_ARG(Accessor) accessor) { - // not even sure this is needed - uberWorkgroupIndex = ~0u; - return true; + if (level==(params.topLevel)) // check if we reached the lastLevel + { + // not even sure this is needed + uberWorkgroupIndex = ~0u; + return true; + } + + uint32_t prevLevel = level; + if (workgroup::SubgroupContiguousIndex()==0u) + { + // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset + const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; + const uint32_t lastWorkgroupSetIndexForLevel = totalWorkgroupsPerLevel(level) / WorkgroupSize; + const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u); + //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? 
(WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + if (((uberWorkgroupIndex/WorkgroupSize) < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) + { + level++; + } + } + level = workgroup::BroadcastFirst(level, accessor); + return level == 0 ? false : level == prevLevel; // on level 0 never exit early but on higher levels each workgroup is allowed one operation and exits except if promoted to next level } - uint32_t prevLevel = level; - if (workgroup::SubgroupContiguousIndex()==0u) + uint32_t levelWorkgroupIndex(NBL_CONST_REF_ARG(uint32_t) level) { - // The uberWorkgroupIndex is always increasing, even after we switch levels, but for each new level the workgroupSetFinishedIndex must reset - const uint32_t workgroupSetFinishedIndex = levelWorkgroupIndex(level) / WorkgroupSize; - const uint32_t lastWorkgroupSetIndexForLevel = totalWorkgroupsPerLevel(level) / WorkgroupSize; - const uint32_t doneCount = glsl::atomicAdd(scanScratchBuf[0u].data[schedParams.workgroupFinishFlagsOffset[level]+workgroupSetFinishedIndex], 1u); - //if ((uberWorkgroupIndex != schedParams.cumulativeWorkgroupCount[level] - 1u ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) - if (((uberWorkgroupIndex/WorkgroupSize) < lastWorkgroupSetIndexForLevel ? (WorkgroupSize-1u) : schedParams.lastWorkgroupSetCountForLevel[level])==doneCount) - { - level++; - } + return uberWorkgroupIndex; } - level = workgroup::BroadcastFirst(level, accessor); - return level == 0 ? false : level == prevLevel; // on level 0 never exit early but on higher levels each workgroup is allowed one operation and exits except if promoted to next level - } - uint32_t levelWorkgroupIndex(const in uint32_t level) - { - return uberWorkgroupIndex; - } - - uint32_t totalWorkgroupsPerLevel(const in uint32_t level) + uint32_t totalWorkgroupsPerLevel(NBL_CONST_REF_ARG(uint32_t) level) + { + return level == 0 + ? schedParams.cumulativeWorkgroupCount[level] + : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u]; + } + + void resetWorkgroupsStartedBuffer() { - return level == 0 ? 
schedParams.cumulativeWorkgroupCount[level] : schedParams.cumulativeWorkgroupCount[level] - schedParams.cumulativeWorkgroupCount[level-1u]; + // could do scanScratchBuf[0u].workgroupsStarted[SubgroupContiguousIndex()] = 0u but don't know how many invocations are live during this call + if(workgroup::SubgroupContiguousIndex() == 0u) + { + for(uint32_t i = 0; i < params.topLevel; i++) + { + scanScratchBuf[0u].workgroupsStarted[i] = 0u; + } + } } Parameters_t params; DefaultSchedulerParameters_t schedParams; uint32_t uberWorkgroupIndex; // rename to virtualWorkgroupIndex - uint16_t level; + uint32_t level; // 32 bit to stop warnings from level = workgroup::BroadcastFirst }; } diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 9319ca1306..3bebbd7305 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl @@ -6,23 +6,37 @@ #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" // This is where all the nbl_glsl_workgroupOPs are defined #include "nbl/builtin/hlsl/scan/declarations.hlsl" #include "nbl/builtin/hlsl/scan/scheduler.hlsl" +#include "nbl/builtin/hlsl/scan/scan_scheduler.hlsl" namespace nbl { namespace hlsl { namespace scan -{ +{ + /** + * Finds the index on a higher level in the reduce tree which the + * levelInvocationIndex needs to add to level 0 + */ + template + uint32_t2 findHigherLevelIndex(uint32_t level, uint32_t levelInvocationIndex) + { + const uint32_t WgSzLog2 = firstbithigh(WorkgroupSize); + uint32_t rightIndexNonInclusive = (levelInvocationIndex >> (WgSzLog2 * level)); // formula for finding the index. If it underflows it means that there's nothing to do for the level + uint32_t leftIndexInclusive = (rightIndexNonInclusive >> WgSzLog2) << WgSzLog2; + uint32_t add = (levelInvocationIndex & (WgSzLog2 * level - 1)) > 0 ? 1u : 0u; // if the index mod WorkgroupSize*level > 0 then we must add 1u + return uint32_t2(leftIndexInclusive, rightIndexNonInclusive); // return both the index and the potential underflow information to know we must do nothing + } + template void virtualWorkgroup(NBL_CONST_REF_ARG(uint32_t) treeLevel, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_REF_ARG(Accessor) accessor) { const Parameters_t params = getParameters(); const uint32_t levelInvocationIndex = levelWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); - const uint32_t lastLevel = params.topLevel << 1u; // pseudoLevel is the level index going up until toplevel then back down - const uint32_t pseudoLevel = treeLevel>params.topLevel ? (lastLevel-treeLevel):treeLevel; - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; + const uint32_t pseudoLevel = isScan ? (params.topLevel-treeLevel) : treeLevel; + const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; // the lastElement array contains the lastElement's index hence the '<=' // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? @@ -38,24 +52,38 @@ namespace scan getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel); } - bool doReduce = isScan ? 
treeLevel < params.topLevel : treeLevel <= params.topLevel; - + bool doReduce = !isScan; if(doReduce) { data = workgroup::reduction::template __call(data,accessor); } - else if (!isExclusive && params.topLevel == 0u) - { - data = workgroup::inclusive_scan::template __call(data,accessor); - } - else if (treeLevel != params.topLevel) + else { - data = workgroup::inclusive_scan::template __call(data,accessor); - } - else - { - data = workgroup::exclusive_scan::template __call(data,accessor); + if (isExclusive) + { + data = workgroup::exclusive_scan::template __call(data,accessor); + } + else + { + data = workgroup::inclusive_scan::template __call(data,accessor); + } + + uint32_t reverseLevel = params.topLevel - treeLevel; + while(reverseLevel != ~0u) + { + Storage_t tempData = Binop::identity; + uint32_t2 idxRange = findHigherLevelIndex(reverseLevel--, levelInvocationIndex); + if(workgroup::SubgroupContiguousIndex() < idxRange.y - idxRange.x) + { + tempData = scanScratchBuf[0].data[params.temporaryStorageOffset[reverseLevel] + idxRange.x + workgroup::SubgroupContiguousIndex()]; + } + + // we could potentially do an inclusive scan of this part and cache it for other WGs + //tempData = workgroup::inclusive_scan::template __call(tempData,accessor); + data += workgroup::reduction::template __call(tempData,accessor); + } } + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); } @@ -65,21 +93,35 @@ namespace scan { const Parameters_t params = getParameters(); const DefaultSchedulerParameters_t schedulerParams = getSchedulerParameters(); - Scheduler scheduler = Scheduler::create(params, schedulerParams); - // persistent workgroups - while (true) + + if(!isScan) { - if (!scheduler.getWork(accessor)) + Scheduler scheduler = Scheduler::create(params, schedulerParams); + // persistent workgroups + + while (true) { - return; - } + if (!scheduler.getWork(accessor)) + { + return; + } - virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); - accessor.workgroupExecutionAndMemoryBarrier(); - if(scheduler.markDone(accessor)) - { - return; - } + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + if(scheduler.markDone(accessor)) + { + return; + } + } + } + else + { + ScanScheduler scheduler = ScanScheduler::create(params, schedulerParams); + while(scheduler.getWork(accessor)) + { + virtualWorkgroup(scheduler.level, scheduler.levelWorkgroupIndex(scheduler.level), accessor); + accessor.workgroupExecutionAndMemoryBarrier(); + } } } } diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index b066cbff15..492e8fe641 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -65,11 +65,17 @@ class CArithmeticOps : public core::IReferenceCounted // given already computed tables of lastElement indice s per level, number of levels, and storage offsets, tell us total auxillary buffer size needed inline uint32_t getScratchSize(uint32_t ssboAlignment=256u) { - uint32_t uint_count = MaxLevels; // workgroup enumerator + uint32_t uint_count = 1u; // reduceResult field + uint_count += MaxLevels; // workgroup enumerator uint_count += temporaryStorageOffset[MaxLevels/2u-1u]; // last scratch offset - uint_count += lastElement[topLevel]+1u; // starting from the last storage offset, we also need slots equal to the top level's elementCount (add 1u because 0u 
based) + uint_count += lastElement[topLevel]+1u; // starting from the last temporary storage offset, we also need slots equal to the top level's elementCount (add 1u because 0u based) return core::roundUp(uint_count*sizeof(uint32_t),ssboAlignment); } + + inline uint32_t getWorkgroupEnumeratorSize() + { + return MaxLevels * sizeof(uint32_t); + } }; // the default scheduler we provide works as described above in the big documentation block @@ -178,26 +184,22 @@ class CArithmeticOps : public core::IReferenceCounted } static inline void dispatchHelper( - IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* pipeline_layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, - const uint32_t srcBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* srcBufferBarriers, - const uint32_t dstBufferBarrierCount, const IGPUCommandBuffer::SBufferMemoryBarrier* dstBufferBarriers + IGPUCommandBuffer* cmdbuf, const video::IGPUPipelineLayout* layout, const DefaultPushConstants& pushConstants, const DispatchInfo& dispatchInfo, + std::span bufferBarriers ) { - cmdbuf->pushConstants(pipeline_layout,asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants),&pushConstants); - if (srcBufferBarrierCount) + cmdbuf->pushConstants(layout, asset::IShader::ESS_COMPUTE, 0u, sizeof(DefaultPushConstants), &pushConstants); + if (bufferBarriers.size() > 0) { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { srcBufferBarriers, 1} }; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->dispatch(dispatchInfo.wg_count,1u,1u); - if (srcBufferBarrierCount) - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .bufBarriers = { dstBufferBarriers, 1} }; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + for (uint32_t i = 0; i < bufferBarriers.size(); i++) + { + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, bufferBarriers[i]); + } } + cmdbuf->dispatch(dispatchInfo.wg_count, 1u, 1u); } - inline core::smart_refctd_ptr createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const + inline core::smart_refctd_ptr createBaseShader(const char* shaderFile, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchElCount) const { auto system = m_device->getPhysicalDevice()->getSystem(); core::smart_refctd_ptr hlsl; @@ -266,7 +268,7 @@ class CArithmeticOps : public core::IReferenceCounted break; } - return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_SZ %d\n", m_workgroupSize, storageType, binop, scratchSz); + return asset::CHLSLCompiler::createOverridenCopy(cpushader.get(), "#define WORKGROUP_SIZE %d\ntypedef %s Storage_t;\n#define BINOP nbl::hlsl::%s\n#define SCRATCH_EL_CNT %d\n", m_workgroupSize, storageType, binop, scratchElCount); } inline ILogicalDevice* getDevice() const {return m_device.get();} diff --git a/include/nbl/video/utilities/CReduce.h b/include/nbl/video/utilities/CReduce.h index 2b4e1a6374..0a3d896f2c 100644 --- a/include/nbl/video/utilities/CReduce.h +++ b/include/nbl/video/utilities/CReduce.h @@ -16,10 +16,10 @@ namespace nbl::video public: CReduce(core::smart_refctd_ptr&& device) : CReduce(std::move(device), core::roundDownToPoT(device->getPhysicalDevice()->getLimits().maxOptimallyResidentWorkgroupInvocations)) {} CReduce(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : 
CArithmeticOps(core::smart_refctd_ptr(device), workgroupSize) {} - asset::ICPUShader* getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); - IGPUShader* getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); - IGPUComputePipeline* getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz); - core::smart_refctd_ptr createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) const; + asset::ICPUShader* getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + IGPUShader* getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + IGPUComputePipeline* getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount); + core::smart_refctd_ptr createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) const; protected: ~CReduce() diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index bef690f50c..a6b2a6ca11 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -166,15 +166,13 @@ class CScanner final : public CArithmeticOps asset::ICPUShader* getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); IGPUShader* getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); IGPUComputePipeline* getDefaultPipeline(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz); - core::smart_refctd_ptr createShader(const char* shaderFile, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; + core::smart_refctd_ptr createShader(/*const bool indirect, */const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; protected: ~CScanner() { // all drop themselves automatically } - core::smart_refctd_ptr createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const; - core::smart_refctd_ptr m_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr < IGPUShader > m_specialized_shaders[EST_COUNT][EDT_COUNT][EO_COUNT]; core::smart_refctd_ptr m_pipelines[EST_COUNT][EDT_COUNT][EO_COUNT]; diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index e3772b724b..5183f21242 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -314,6 +314,14 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/ballot.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") +#global scan +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/declarations.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/default_scheduler.hlsl") 
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/descriptors.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/direct.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scan_scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/virtual_workgroup.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") diff --git a/src/nbl/video/utilities/CReduce.cpp b/src/nbl/video/utilities/CReduce.cpp index bc9f745207..7a56839636 100644 --- a/src/nbl/video/utilities/CReduce.cpp +++ b/src/nbl/video/utilities/CReduce.cpp @@ -7,19 +7,19 @@ namespace nbl::video { -asset::ICPUShader* CReduce::getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +asset::ICPUShader* CReduce::getDefaultShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) { if (!m_shaders[dataType][op]) - m_shaders[dataType][op] = createShader(dataType,op,scratchSz); + m_shaders[dataType][op] = createShader(dataType,op,scratchElCount); return m_shaders[dataType][op].get(); } -IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) { if (!m_specialized_shaders[dataType][op]) { - auto cpuShader = core::smart_refctd_ptr(getDefaultShader(dataType,op,scratchSz)); - cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/reduce.hlsl"); + auto cpuShader = core::smart_refctd_ptr(getDefaultShader(dataType,op,scratchElCount)); + cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); auto gpushader = m_device->createShader(cpuShader.get()); @@ -29,7 +29,7 @@ IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TY return m_specialized_shaders[dataType][op].get(); } -IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchSz) +IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) { // ondemand if (!m_pipelines[dataType][op]) { @@ -37,7 +37,7 @@ IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TY params.layout = m_pipeline_layout.get(); // Theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one (and glslang used to require it be called "main") params.shader.entryPoint = "main"; - params.shader.shader = getDefaultSpecializedShader(dataType,op,scratchSz); + params.shader.shader = getDefaultSpecializedShader(dataType,op,scratchElCount); m_device->createComputePipelines( nullptr, { ¶ms,1 }, @@ -47,9 +47,9 @@ IGPUComputePipeline* CReduce::getDefaultPipeline(const CArithmeticOps::E_DATA_TY return m_pipelines[dataType][op].get(); } -core::smart_refctd_ptr CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const 
uint32_t scratchSz) const +core::smart_refctd_ptr CReduce::createShader(const CArithmeticOps::E_DATA_TYPE dataType, const CArithmeticOps::E_OPERATOR op, const uint32_t scratchElCount) const { - core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); + core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchElCount); return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", "false", "false"); } diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index 3c6c4088a3..5d0e9dc039 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -6,7 +6,7 @@ namespace nbl::video asset::ICPUShader* CScanner::getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) { if (!m_shaders[scanType][dataType][op]) - m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz); + m_shaders[scanType][dataType][op] = createShader(scanType,dataType,op,scratchSz); return m_shaders[scanType][dataType][op].get(); } @@ -43,7 +43,7 @@ IGPUComputePipeline* CScanner::getDefaultPipeline(const E_SCAN_TYPE scanType, co return m_pipelines[scanType][dataType][op].get(); } -core::smart_refctd_ptr CScanner::createShader(const char* shaderFile, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const +core::smart_refctd_ptr CScanner::createShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz) const { core::smart_refctd_ptr base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz); return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", scanType == EST_EXCLUSIVE ? 
"true" : "false", "true"); From dd2cc097d47d32bc94bc9279aa0477f683842674 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 7 Jul 2024 18:01:34 +0300 Subject: [PATCH 16/25] Remove unused pseudoLevel parameter --- .../nbl/builtin/hlsl/scan/descriptors.hlsl | 66 +++++++++---------- .../builtin/hlsl/scan/virtual_workgroup.hlsl | 8 +-- 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/descriptors.hlsl b/include/nbl/builtin/hlsl/scan/descriptors.hlsl index 93d383a7dd..2e2896cf37 100644 --- a/include/nbl/builtin/hlsl/scan/descriptors.hlsl +++ b/include/nbl/builtin/hlsl/scan/descriptors.hlsl @@ -33,48 +33,47 @@ void getData( NBL_REF_ARG(Storage_t) data, NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, - NBL_CONST_REF_ARG(uint32_t) treeLevel, - NBL_CONST_REF_ARG(uint32_t) pseudoLevel + NBL_CONST_REF_ARG(uint32_t) treeLevel ) { const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders uint32_t offset = levelInvocationIndex; - const bool notFirstOrLastLevel = bool(pseudoLevel); + const bool notFirstOrLastLevel = bool(treeLevel); if (notFirstOrLastLevel) - offset += params.temporaryStorageOffset[pseudoLevel-1u]; + offset += params.temporaryStorageOffset[treeLevel-1u]; - if (pseudoLevel!=treeLevel) // downsweep/scan - { - const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; - if (bool(levelWorkgroupIndex) && firstInvocationInGroup) - data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[pseudoLevel]]; - - if (notFirstOrLastLevel) - { - if (!firstInvocationInGroup) - data = scanScratchBuf[0].data[offset-1u]; - } - else - { - if(isExclusive) - { - if (!firstInvocationInGroup) - data += scanBuffer[offset-1u]; - } - else - { - data += scanBuffer[offset]; - } - } - } - else - { + //if (pseudoLevel!=treeLevel) // downsweep/scan + //{ + // const bool firstInvocationInGroup = workgroup::SubgroupContiguousIndex()==0u; + // if (bool(levelWorkgroupIndex) && firstInvocationInGroup) + // data = scanScratchBuf[0].data[levelWorkgroupIndex+params.temporaryStorageOffset[treeLevel]]; + // + // if (notFirstOrLastLevel) + // { + // if (!firstInvocationInGroup) + // data = scanScratchBuf[0].data[offset-1u]; + // } + // else + // { + // if(isExclusive) + // { + // if (!firstInvocationInGroup) + // data += scanBuffer[offset-1u]; + // } + // else + // { + // data += scanBuffer[offset]; + // } + // } + //} + //else + //{ if (notFirstOrLastLevel) data = scanScratchBuf[0].data[offset]; else data = scanBuffer[offset]; - } + //} } template @@ -83,7 +82,6 @@ void setData( NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex, NBL_CONST_REF_ARG(uint32_t) levelWorkgroupIndex, NBL_CONST_REF_ARG(uint32_t) treeLevel, - NBL_CONST_REF_ARG(uint32_t) pseudoLevel, NBL_CONST_REF_ARG(bool) inRange ) { @@ -101,9 +99,9 @@ void setData( scanScratchBuf[0u].reduceResult = data; } // The following only for isScan == true - else if (bool(pseudoLevel)) + else if (bool(treeLevel)) { - const uint32_t offset = params.temporaryStorageOffset[pseudoLevel-1u]; + const uint32_t offset = params.temporaryStorageOffset[treeLevel-1u]; scanScratchBuf[0].data[levelInvocationIndex+offset] = data; } else diff --git a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl index 3bebbd7305..f9bd38a9e3 100644 --- a/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl +++ b/include/nbl/builtin/hlsl/scan/virtual_workgroup.hlsl 
@@ -34,9 +34,7 @@ namespace scan const Parameters_t params = getParameters(); const uint32_t levelInvocationIndex = levelWorkgroupIndex * glsl::gl_WorkGroupSize().x + workgroup::SubgroupContiguousIndex(); - // pseudoLevel is the level index going up until toplevel then back down - const uint32_t pseudoLevel = isScan ? (params.topLevel-treeLevel) : treeLevel; - const bool inRange = levelInvocationIndex <= params.lastElement[pseudoLevel]; // the lastElement array contains the lastElement's index hence the '<=' + const bool inRange = levelInvocationIndex <= params.lastElement[treeLevel]; // the lastElement array contains the lastElement's index hence the '<=' // REVIEW: Right now in order to support REDUCE operation we need to set the max treeLevel == topLevel // so that it exits after reaching the top? @@ -49,7 +47,7 @@ namespace scan Storage_t data = Binop::identity; // REVIEW: replace Storage_t with Binop::type_t? if(inRange) { - getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel); + getData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel); } bool doReduce = !isScan; @@ -84,7 +82,7 @@ namespace scan } } - setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, pseudoLevel, inRange); + setData(data, levelInvocationIndex, levelWorkgroupIndex, treeLevel, inRange); } DefaultSchedulerParameters_t getSchedulerParameters(); // this is defined in the final shader that assembles all the SCAN operation components From ce78fc09bf3569c2e794c5a68f7dcf113f27ce6c Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 7 Jul 2024 19:06:14 +0300 Subject: [PATCH 17/25] Fix workgroupFinishFlagsOffset[0] being twice the needed size --- include/nbl/video/utilities/CArithmeticOps.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index 492e8fe641..749faa204d 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -100,7 +100,7 @@ class CArithmeticOps : public core::IReferenceCounted std::reverse_copy(cumulativeWorkgroupCount,cumulativeWorkgroupCount+topLevel,cumulativeWorkgroupCount+topLevel+1u); cumulativeWorkgroupCount[topLevel] = 1u; // the top level will always end up with 1 workgroup to do the final reduction for (auto i = 0u; i <= topLevel; i++) { - workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize - 1u)) + 1; // RECHECK: findMSB(511) == 8u !! Here we assume it's 9u !! + workgroupFinishFlagsOffset[i] = ((cumulativeWorkgroupCount[i] - 1u) >> hlsl::findMSB(workgroupSize)) + 1; // RECHECK: findMSB(511) == 8u !! Here we assume it's 9u !! 
lastWorkgroupSetCountForLevel[i] = (cumulativeWorkgroupCount[i] - 1u) & (workgroupSize - 1u); } From 0d71686e221bec826741f663c0bbbaf7b10ecd84 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 21 Jul 2024 14:47:39 +0300 Subject: [PATCH 18/25] Revert .gitmodules pointing to examples fork --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index f4bc04a799..ea533a6736 100644 --- a/.gitmodules +++ b/.gitmodules @@ -69,7 +69,7 @@ branch = main [submodule "examples_tests"] path = examples_tests - url = git@github.com:kpentaris/Nabla-Examples-and-Tests.git + url = git@github.com:Devsh-Graphics-Programming/Nabla-Examples-and-Tests.git branch = master [submodule "3rdparty/dxc/dxc"] path = 3rdparty/dxc/dxc From 7f4fcd527423ac34ec5e18e9c09a514ef0ed18ba Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Tue, 25 Feb 2025 20:59:51 +0200 Subject: [PATCH 19/25] merge upstream Nable to local --- 3rdparty/Vulkan-Headers | 2 +- 3rdparty/dxc/dxc | 2 +- 3rdparty/glslang | 2 +- 3rdparty/imgui | 2 +- 3rdparty/imguizmo | 2 +- 3rdparty/openexr | 2 +- 3rdparty/shaderc | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers index 234c4b7370..2c823b7f27 160000 --- a/3rdparty/Vulkan-Headers +++ b/3rdparty/Vulkan-Headers @@ -1 +1 @@ -Subproject commit 234c4b7370a8ea3239a214c9e871e4b17c89f4ab +Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 8a281b7e34..a08b6cbeb1 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 8a281b7e34513b3c142e895b96c522810d585ad9 +Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb diff --git a/3rdparty/glslang b/3rdparty/glslang index ff26c2e995..f397c9b6e9 160000 --- a/3rdparty/glslang +++ b/3rdparty/glslang @@ -1 +1 @@ -Subproject commit ff26c2e995ae521cb9fbc902fb4d686a47e4eb53 +Subproject commit f397c9b6e90bc53aa2e9feaef1a9cdf20ca43298 diff --git a/3rdparty/imgui b/3rdparty/imgui index a29e9dba30..e489e40a85 160000 --- a/3rdparty/imgui +++ b/3rdparty/imgui @@ -1 +1 @@ -Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175 +Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e diff --git a/3rdparty/imguizmo b/3rdparty/imguizmo index b10e91756d..6f4b2197ef 160000 --- a/3rdparty/imguizmo +++ b/3rdparty/imguizmo @@ -1 +1 @@ -Subproject commit b10e91756d32395f5c1fefd417899b657ed7cb88 +Subproject commit 6f4b2197efd715d16b19775b00f36c6c6f5aacb6 diff --git a/3rdparty/openexr b/3rdparty/openexr index c8a74d9ac9..fca936a964 160000 --- a/3rdparty/openexr +++ b/3rdparty/openexr @@ -1 +1 @@ -Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd +Subproject commit fca936a964da5983daecdbed7cd249934701b41a diff --git a/3rdparty/shaderc b/3rdparty/shaderc index d2564ba598..e72186b66b 160000 --- a/3rdparty/shaderc +++ b/3rdparty/shaderc @@ -1 +1 @@ -Subproject commit d2564ba5989c9de1a76714b3e59ec60595e9be50 +Subproject commit e72186b66bb90ed06aaf15cbdc9a053581a0616b From 201636d50412e4dc685bec18f0f5fe34ab0b9ec2 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sat, 15 Mar 2025 20:48:22 +0200 Subject: [PATCH 20/25] update 3rdparty to point to master submodules --- 3rdparty/Vulkan-Headers | 2 +- 3rdparty/dxc/dxc | 2 +- 3rdparty/glslang | 2 +- 3rdparty/imgui | 2 +- 3rdparty/imguizmo | 2 +- 3rdparty/openexr | 2 +- 3rdparty/shaderc | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers 
index 2c823b7f27..234c4b7370 160000 --- a/3rdparty/Vulkan-Headers +++ b/3rdparty/Vulkan-Headers @@ -1 +1 @@ -Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb +Subproject commit 234c4b7370a8ea3239a214c9e871e4b17c89f4ab diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index a08b6cbeb1..b2e75826b7 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb +Subproject commit b2e75826b70d85d03686dd8a755ef477b4fa3807 diff --git a/3rdparty/glslang b/3rdparty/glslang index f397c9b6e9..ff26c2e995 160000 --- a/3rdparty/glslang +++ b/3rdparty/glslang @@ -1 +1 @@ -Subproject commit f397c9b6e90bc53aa2e9feaef1a9cdf20ca43298 +Subproject commit ff26c2e995ae521cb9fbc902fb4d686a47e4eb53 diff --git a/3rdparty/imgui b/3rdparty/imgui index e489e40a85..a29e9dba30 160000 --- a/3rdparty/imgui +++ b/3rdparty/imgui @@ -1 +1 @@ -Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e +Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175 diff --git a/3rdparty/imguizmo b/3rdparty/imguizmo index 6f4b2197ef..b10e91756d 160000 --- a/3rdparty/imguizmo +++ b/3rdparty/imguizmo @@ -1 +1 @@ -Subproject commit 6f4b2197efd715d16b19775b00f36c6c6f5aacb6 +Subproject commit b10e91756d32395f5c1fefd417899b657ed7cb88 diff --git a/3rdparty/openexr b/3rdparty/openexr index fca936a964..c8a74d9ac9 160000 --- a/3rdparty/openexr +++ b/3rdparty/openexr @@ -1 +1 @@ -Subproject commit fca936a964da5983daecdbed7cd249934701b41a +Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd diff --git a/3rdparty/shaderc b/3rdparty/shaderc index e72186b66b..d2564ba598 160000 --- a/3rdparty/shaderc +++ b/3rdparty/shaderc @@ -1 +1 @@ -Subproject commit e72186b66bb90ed06aaf15cbdc9a053581a0616b +Subproject commit d2564ba5989c9de1a76714b3e59ec60595e9be50 From 21bfa0feb3dbed4b4ca2745f3d067363d10f5151 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 16 Mar 2025 13:02:18 +0200 Subject: [PATCH 21/25] update the hlsl/scan module paths to CMakeLists --- src/nbl/builtin/CMakeLists.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index d3dccd1ffb..1bafe6a332 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -330,21 +330,21 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/surface_transform.h") #workgroup LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/arithmetic.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/ballot.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/broadcast.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") -#global scan -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/declarations.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/default_scheduler.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/descriptors.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/direct.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scan_scheduler.hlsl") 
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scheduler.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/virtual_workgroup.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") +#global scan +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/declarations.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/default_scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/descriptors.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/direct.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/scan_scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/scheduler.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/scan/virtual_workgroup.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") From 0a3728ccc3525e9552b7072b42dc2ad03e4ebc67 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sun, 16 Mar 2025 13:50:44 +0200 Subject: [PATCH 22/25] Update carithmeticops with proper enum paths --- include/nbl/video/utilities/CArithmeticOps.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index 749faa204d..c5d54234af 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -142,10 +142,12 @@ class CArithmeticOps : public core::IReferenceCounted CArithmeticOps(core::smart_refctd_ptr&& device, const uint32_t workgroupSize) : m_device(std::move(device)), m_workgroupSize(workgroupSize) { assert(core::isPoT(m_workgroupSize)); - const asset::SPushConstantRange pc_range[] = {asset::IShader::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; + const asset::SPushConstantRange pc_range[] = {asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,0u,sizeof(DefaultPushConstants)}; const IGPUDescriptorSetLayout::SBinding bindings[2] = { - { 0u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr }, // main buffer - { 1u, asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, video::IGPUShader::ESS_COMPUTE, 1u, nullptr } // scratch + {.binding = 0u, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = video::IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1u, .immutableSamplers = nullptr }, + { .binding = 1u, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = video::IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, .count = 1u, .immutableSamplers = nullptr } // scratch }; m_ds_layout = m_device->createDescriptorSetLayout(bindings); @@ -188,7 +190,7 @@ class CArithmeticOps : public core::IReferenceCounted std::span bufferBarriers ) { - cmdbuf->pushConstants(layout, asset::IShader::ESS_COMPUTE, 0u, sizeof(DefaultPushConstants), &pushConstants); + cmdbuf->pushConstants(layout, asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(DefaultPushConstants), &pushConstants); if (bufferBarriers.size() > 0) { for (uint32_t i = 0; i < bufferBarriers.size(); i++) @@ -219,7 +221,7 @@ class CArithmeticOps : public 
core::IReferenceCounted auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); - auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); + auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); // REVIEW: Probably combine the commons parts of CReduce and CScanner const char* storageType = nullptr; From 1c32528240d96a3d04d53a9f061d2f6416333f8f Mon Sep 17 00:00:00 2001 From: pentakon Date: Mon, 17 Mar 2025 15:06:30 +0200 Subject: [PATCH 23/25] merge submodules changes to branch --- 3rdparty/dxc/dxc | 2 +- examples_tests | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index b2e75826b7..8a281b7e34 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit b2e75826b70d85d03686dd8a755ef477b4fa3807 +Subproject commit 8a281b7e34513b3c142e895b96c522810d585ad9 diff --git a/examples_tests b/examples_tests index fb30e097e4..73b4a8ead9 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit fb30e097e4ad32217f8588df2a81a266acf4a045 +Subproject commit 73b4a8ead98daed4a731cb3ae79ec84396d0e53c From 82bd9a1a7f74a88b57c5ecf369ea325b697dfdfc Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Sat, 19 Apr 2025 17:49:15 +0300 Subject: [PATCH 24/25] Fix issues with ICPUBuffer changes --- include/nbl/video/utilities/CArithmeticOps.h | 3 +-- src/nbl/video/utilities/CReduce.cpp | 2 +- src/nbl/video/utilities/CScanner.cpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/utilities/CArithmeticOps.h b/include/nbl/video/utilities/CArithmeticOps.h index c5d54234af..ebeae08181 100644 --- a/include/nbl/video/utilities/CArithmeticOps.h +++ b/include/nbl/video/utilities/CArithmeticOps.h @@ -218,8 +218,7 @@ class CArithmeticOps : public core::IReferenceCounted // hlsl = loadBuiltinData("nbl/builtin/hlsl/scan/indirect_reduce.hlsl"); ?? 
hlsl = loadBuiltinData(shaderFile); } - - auto buffer = core::make_smart_refctd_ptr(hlsl->getSize()); + auto buffer = asset::ICPUBuffer::create({ {.size = hlsl->getSize()} }); memcpy(buffer->getPointer(), hlsl->getMappedPointer(), hlsl->getSize()); auto cpushader = core::make_smart_refctd_ptr(std::move(buffer), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, asset::IShader::E_CONTENT_TYPE::ECT_HLSL, "????"); diff --git a/src/nbl/video/utilities/CReduce.cpp b/src/nbl/video/utilities/CReduce.cpp index 7a56839636..16b5cc5e29 100644 --- a/src/nbl/video/utilities/CReduce.cpp +++ b/src/nbl/video/utilities/CReduce.cpp @@ -20,7 +20,7 @@ IGPUShader* CReduce::getDefaultSpecializedShader(const CArithmeticOps::E_DATA_TY { auto cpuShader = core::smart_refctd_ptr(getDefaultShader(dataType,op,scratchElCount)); cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); - cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); + cpuShader->setShaderStage(asset::IShader::E_SHADER_STAGE::ESS_COMPUTE); auto gpushader = m_device->createShader(cpuShader.get()); diff --git a/src/nbl/video/utilities/CScanner.cpp b/src/nbl/video/utilities/CScanner.cpp index 5d0e9dc039..3cdcc82ee7 100644 --- a/src/nbl/video/utilities/CScanner.cpp +++ b/src/nbl/video/utilities/CScanner.cpp @@ -16,7 +16,7 @@ IGPUShader* CScanner::getDefaultSpecializedShader(const E_SCAN_TYPE scanType, co { auto cpuShader = core::smart_refctd_ptr(getDefaultShader(scanType,dataType,op,scratchSz)); cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl"); - cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE); + cpuShader->setShaderStage(asset::IShader::E_SHADER_STAGE::ESS_COMPUTE); auto gpushader = m_device->createShader(cpuShader.get()); From 6e740b3d5e3fefd262ceb042b8cad9c49a4fce71 Mon Sep 17 00:00:00 2001 From: Pentaris Konstantinos Date: Tue, 22 Apr 2025 13:52:05 +0300 Subject: [PATCH 25/25] Fix bad shared-mem accessor pattern in direct.hlsl and properly export symbols for CReduce and CScanner --- include/nbl/builtin/hlsl/scan/direct.hlsl | 4 ++-- include/nbl/video/utilities/CReduce.h | 2 +- include/nbl/video/utilities/CScanner.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/scan/direct.hlsl b/include/nbl/builtin/hlsl/scan/direct.hlsl index e7bbcf7233..3075816521 100644 --- a/include/nbl/builtin/hlsl/scan/direct.hlsl +++ b/include/nbl/builtin/hlsl/scan/direct.hlsl @@ -20,9 +20,9 @@ groupshared uint32_t wgScratch[SharedScratchSz]; template struct WGScratchProxy { - uint32_t get(const uint32_t ix) + void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) { - return wgScratch[ix+offset]; + value = wgScratch[ix+offset]; } void set(const uint32_t ix, const uint32_t value) { diff --git a/include/nbl/video/utilities/CReduce.h b/include/nbl/video/utilities/CReduce.h index 0a3d896f2c..4a311aa738 100644 --- a/include/nbl/video/utilities/CReduce.h +++ b/include/nbl/video/utilities/CReduce.h @@ -10,7 +10,7 @@ namespace nbl::video { - class CReduce final : public CArithmeticOps + class NBL_API2 CReduce final : public CArithmeticOps { public: diff --git a/include/nbl/video/utilities/CScanner.h b/include/nbl/video/utilities/CScanner.h index a6b2a6ca11..d705113f1a 100644 --- a/include/nbl/video/utilities/CScanner.h +++ b/include/nbl/video/utilities/CScanner.h @@ -142,7 +142,7 @@ prefix sum with subgroup sized workgroups at peak Bandwidth efficiency in about Console devs get to bring a gun to a knife fight... 
**/ -class CScanner final : public CArithmeticOps +class NBL_API2 CScanner final : public CArithmeticOps { public: enum E_SCAN_TYPE : uint8_t