From 89b3af8ba2a0285cb95afc0590c8b8bd5f36657b Mon Sep 17 00:00:00 2001 From: devsh Date: Thu, 14 Nov 2024 17:49:49 +0100 Subject: [PATCH 1/6] redo the snake curves --- .../nbl/builtin/hlsl/ndarray_addressing.hlsl | 64 +++++++++++++++---- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/include/nbl/builtin/hlsl/ndarray_addressing.hlsl b/include/nbl/builtin/hlsl/ndarray_addressing.hlsl index 79f10e3d85..c9390c16c9 100644 --- a/include/nbl/builtin/hlsl/ndarray_addressing.hlsl +++ b/include/nbl/builtin/hlsl/ndarray_addressing.hlsl @@ -13,27 +13,67 @@ namespace hlsl namespace ndarray_addressing { -uint32_t snakeCurve(NBL_CONST_REF_ARG(uint32_t3) coordinate, NBL_CONST_REF_ARG(uint32_t3) extents) +template::type> +T snakeCurve(const vector coordinate, const vector extents) { - return (coordinate.z * extents.y + coordinate.y) * extents.x + coordinate.x; + T retval = _static_cast(coordinate[Dims-1]); + for (int32_t i=Dims-2; i>=0; i--) + { + retval *= _static_cast(extents[i]); + retval += _static_cast(coordinate[i]); + } + return retval; } -uint32_t3 snakeCurveInverse(uint32_t linearIndex, NBL_CONST_REF_ARG(uint32_t2) gridDimPrefixProduct) +// highly specialized function, requires you know the prefices already and that dimension is higher than 1 +// TODO: make an even better one that takes precomputed reciprocals and stuff for fast integer division and modulo +template,uint16_t>::type> // TODO: NBL_REQUIRE Dims>=2 +vector snakeCurveInverse(const U linearIndex, const vector gridDimPrefixProduct) { - uint32_t3 index3D; + vector coord; + coord[Dims-1] = linearIndex/gridDimPrefixProduct[Dims-2]; + { + U prevRemainder = linearIndex; + for (int32_t i=Dims-2; i>0; i--) + { + prevRemainder -= gridDimPrefixProduct[i]*coord[i+1]; + coord[i] = prevRemainder/gridDimPrefixProduct[i-1]; + } + coord[0] = prevRemainder-gridDimPrefixProduct[0]*coord[1]; + } + coord[Dims-2] = linearIndex-coord[Dims-1]*gridDimPrefixProduct[Dims-2]; - index3D.z = linearIndex / gridDimPrefixProduct.y; - - const uint32_t tmp = linearIndex - (index3D.z * gridDimPrefixProduct.y); - index3D.y = tmp / gridDimPrefixProduct.x; - index3D.x = tmp - (index3D.y * gridDimPrefixProduct.x); + return coord; +} - return index3D; +namespace impl +{ +template +struct snakeCurveInverse +{ + static vector __call(const U linearIndex, const vector gridDim) + { + vector gridDimPrefixProduct; + gridDimPrefixProduct[0] = gridDim[0]; + for (int32_t i=1; i(linearIndex,gridDimPrefixProduct); + } +}; +template +struct snakeCurveInverse<1,U,T> +{ + static vector __call(const U linearIndex, const vector gridDim) + { + return vector(linearIndex); + } +}; } -uint32_t3 snakeCurveInverse(uint32_t linearIndex, NBL_CONST_REF_ARG(uint32_t3) gridDim) +template,uint16_t>::type> +vector snakeCurveInverse(const U linearIndex, const vector gridDim) { - return snakeCurveInverse(linearIndex, uint32_t2(gridDim.x, gridDim.x*gridDim.y)); + return impl::snakeCurveInverse::__call(linearIndex,gridDim); } } From 8e84fb97eb47a15c76034767608225d9f6b25cff Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 15 Nov 2024 17:55:08 +0100 Subject: [PATCH 2/6] save progress --- include/nbl/builtin/hlsl/blit/common.hlsl | 22 ++- .../nbl/builtin/hlsl/blit/compute_blit.hlsl | 153 +++++++++++++----- .../builtin/hlsl/blit/default_blit.comp.hlsl | 79 ++++++--- .../hlsl/blit/default_normalize.comp.hlsl | 4 +- 4 files changed, 187 insertions(+), 71 deletions(-) diff --git a/include/nbl/builtin/hlsl/blit/common.hlsl b/include/nbl/builtin/hlsl/blit/common.hlsl index 739c17b652..47b22a151e 100644 
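// Editor's note on the generalized snakeCurve/snakeCurveInverse in the patch above
// (illustration only, it restates the removed 3D special case): for a coordinate
// (x,y,z) inside extents (W,H,D) the linear index is
//   idx = (z*H + y)*W + x
// e.g. coordinate (1,2,3) in a 4x4x4 grid gives (3*4 + 2)*4 + 1 = 57, and the inverse
// recovers it from the prefix products {W, W*H} = {4, 16}:
//   z = 57/16 = 3,  rem = 57 - 3*16 = 9,  y = 9/4 = 2,  x = 9 - 2*4 = 1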
--- a/include/nbl/builtin/hlsl/blit/common.hlsl +++ b/include/nbl/builtin/hlsl/blit/common.hlsl @@ -57,18 +57,30 @@ struct HistogramAccessor InterlockedAdd(statsBuff[wgID * (ConstevalParameters::AlphaBinCount + 1) + bucket], v); } }; +*/ + struct SharedAccessor { - float32_t get(float32_t idx) + template) + T get(uint16_t idx) { - return sMem[idx]; + return bit_cast(sMem[idx]); } - void set(float32_t idx, float32_t val) + + template) + void atomicIncr(uint16_t idx) { - sMem[idx] = val; + glsl::atomicAdd(sMem[idx],1u); + } + + // TODO: figure out how to provide 16bit access, subgroup op compact? + template) + void set(uint16_t idx, T val) + { + sMem[idx] = bit_cast(val); } }; -*/ +static SharedAccessor sharedAccessor; struct OutImgAccessor { diff --git a/include/nbl/builtin/hlsl/blit/compute_blit.hlsl b/include/nbl/builtin/hlsl/blit/compute_blit.hlsl index 84e62913e5..1987ab3930 100644 --- a/include/nbl/builtin/hlsl/blit/compute_blit.hlsl +++ b/include/nbl/builtin/hlsl/blit/compute_blit.hlsl @@ -6,8 +6,8 @@ #include +#include #include -#include namespace nbl @@ -17,6 +17,116 @@ namespace hlsl namespace blit { +template< + bool DoCoverage, + uint16_t WorkGroupSize, + int32_t Dims, + typename InCombinedSamplerAccessor, + typename OutImageAccessor, +// typename KernelWeightsAccessor, +// typename HistogramAccessor, + typename SharedAccessor +> +void execute( + NBL_CONST_REF_ARG(InCombinedSamplerAccessor) inCombinedSamplerAccessor, + NBL_REF_ARG(OutImageAccessor) outImageAccessor, +// NBL_CONST_REF_ARG(KernelWeightsAccessor) kernelWeightsAccessor, +// NBL_REF_ARG(HistogramAccessor) histogramAccessor, + NBL_REF_ARG(SharedAccessor) sharedAccessor, + NBL_CONST_REF_ARG(SPerWorkgroup) params, + const uint16_t layer, + const vector virtWorkGroupID +) +{ + using uint16_tN = vector; + // the dimensional truncation is desired + const uint16_tN outputTexelsPerWG = uint16_tN(params.getOutputBaseCoord(uint16_t3(1,1,1))); + // its the min XYZ corner of the area the workgroup will sample from to produce its output + const uint16_tN minOutputPixel = virtWorkGroupID*outputTexelsPerWG; + + using float32_tN = vector; + const float32_tN scale = _static_cast(params.scale); + const float32_tN lastInputTexel = _static_cast(params.getInputLastTexel()); + const uint16_t inLevel = _static_cast(params.inLevel); + const float32_tN inImageSizeRcp = inCombinedSamplerAccessor.template extentRcp(inLevel); + + using int32_tN = vector; + // intermediate result only needed to compute `regionStartCoord`, basically the sampling coordinate of the minOutputPixel in the input texture + const float32_tN noGoodNameForThisThing = (float32_tN(minOutputPixel)+promote(0.5f))*scale-promote(0.5f); + // can be negative, its the min XYZ corner of the area the workgroup will sample from to produce its output + // TODO: is there a HLSL/SPIR-V round() that can simplify ceil(x-0.5)+0.5 ? 
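// Editor's note on the TODO above (an illustration, not a behavioural change):
// ceil(x-0.5)+0.5 snaps x up to the nearest texel centre, i.e. the smallest value of
// the form n+0.5 that is >= x:
//   x = 1.30 -> ceil(0.80)+0.5 = 1.5
//   x = 1.50 -> ceil(1.00)+0.5 = 1.5
//   x = 1.51 -> ceil(1.01)+0.5 = 2.5
// A plain round() yields a whole number rather than a texel centre and treats the
// exact-boundary cases differently, so the +/-0.5 shifts are needed either way.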
+ const float32_tN regionStartCoord = ceil(noGoodNameForThisThing)+promote(0.5f); + const float32_tN regionNextStartCoord = ceil(noGoodNameForThisThing+float32_tN(outputTexelsPerWG)*scale)+promote(0.5f); + + const uint16_tN preloadRegion = _static_cast(params.getPreloadExtentExceptLast()); + const uint16_t localInvocationIndex = _static_cast(glsl::gl_LocalInvocationIndex()); // workgroup::SubgroupContiguousIndex() + // need to clear our atomic coverage counter to 0 + const uint16_t coverageDWORD = _static_cast(params.coverageDWORD); + if (DoCoverage) + { + if (localInvocationIndex==0) + sharedAccessor.set(coverageDWORD,0u); + glsl::barrier(); + } + const uint16_t preloadLast = _static_cast(params.preloadLast); + for (uint16_t virtualInvocation=localInvocationIndex; virtualInvocation<=preloadLast; virtualInvocation+=WorkGroupSize) + { + // if we make all args in snakeCurveInverse 16bit maybe compiler will optimize the divisions into using float32_t + const uint16_tN virtualInvocationID = ndarray_addressing::snakeCurveInverse(virtualInvocation,preloadRegion); + const float32_tN inputTexCoordUnnorm = regionStartCoord + float32_tN(virtualInvocationID); + const float32_tN inputTexCoord = inputTexCoordUnnorm * inImageSizeRcp; + + const float32_t4 loadedData = inCombinedSamplerAccessor.template get(inputTexCoord,layer,inLevel); + + if (DoCoverage) + if (loadedData[params.coverageChannel]>=params.alphaRefValue && + all(inputTexCoordUnnorm=promote(0.5f)) && // within the image from below + all(inputTexCoordUnnorm<=lastInputTexel) // within the image from above + ) + { + sharedAccessor.template atomicIncr(coverageDWORD); + } + + [unroll(4)] + for (uint16_t ch=0; ch<4 && ch<=params.lastChannel; ch++) + sharedAccessor.set(preloadLast*ch+ch+virtualInvocation,loadedData[ch]); + } + glsl::barrier(); + + uint16_t readScratchOffset = uint16_t(0); + uint16_t writeScratchOffset = _static_cast(params.secondScratchOffset); + uint16_tN currentOutRegion = preloadRegion; + currentOutRegion[0] = outputTexelsPerWG[0]; + [unroll(3)] + for (int32_t axis=0; axis(virtualInvocation,currentOutRegion); + + // + } +/* + for (uint16_t virtualInvocation=localInvocationIndex; virtualInvocation struct compute_blit_t { @@ -26,7 +136,6 @@ struct compute_blit_t uint32_t kernelWeightsOffsetZ; uint32_t inPixelCount; uint32_t outPixelCount; - uint16_t3 outputTexelsPerWG; uint16_t3 inDims; uint16_t3 outDims; uint16_t3 windowDims; @@ -37,30 +146,6 @@ struct compute_blit_t uint16_t3 iterationRegionZPrefixProducts; uint16_t secondScratchOffset; - static compute_blit_t create(NBL_CONST_REF_ARG(parameters_t) params) - { - compute_blit_t compute_blit; - - compute_blit.scale = params.fScale; - compute_blit.negativeSupport = params.negativeSupport; - compute_blit.kernelWeightsOffsetY = params.kernelWeightsOffsetY; - compute_blit.kernelWeightsOffsetZ = params.kernelWeightsOffsetZ; - compute_blit.inPixelCount = params.inPixelCount; - compute_blit.outPixelCount = params.outPixelCount; - compute_blit.outputTexelsPerWG = params.getOutputTexelsPerWG(); - compute_blit.inDims = params.inputDims; - compute_blit.outDims = params.outputDims; - compute_blit.windowDims = params.windowDims; - compute_blit.phaseCount = params.phaseCount; - compute_blit.preloadRegion = params.preloadRegion; - compute_blit.iterationRegionXPrefixProducts = params.iterationRegionXPrefixProducts; - compute_blit.iterationRegionYPrefixProducts = params.iterationRegionYPrefixProducts; - compute_blit.iterationRegionZPrefixProducts = params.iterationRegionZPrefixProducts; - 
compute_blit.secondScratchOffset = params.secondScratchOffset; - - return compute_blit; - } - template < typename InCombinedSamplerAccessor, typename OutImageAccessor, @@ -76,24 +161,9 @@ struct compute_blit_t uint16_t3 workGroupID, uint16_t localInvocationIndex) { - const float3 halfScale = scale * float3(0.5f, 0.5f, 0.5f); // bottom of the input tile const uint32_t3 minOutputPixel = workGroupID * outputTexelsPerWG; - const float3 minOutputPixelCenterOfWG = float3(minOutputPixel)*scale + halfScale; - // this can be negative, in which case HW sampler takes care of wrapping for us - const int32_t3 regionStartCoord = int32_t3(ceil(minOutputPixelCenterOfWG - float3(0.5f, 0.5f, 0.5f) + negativeSupport)); - const uint32_t virtualInvocations = preloadRegion.x * preloadRegion.y * preloadRegion.z; - for (uint32_t virtualInvocation = localInvocationIndex; virtualInvocation < virtualInvocations; virtualInvocation += ConstevalParameters::WorkGroupSize) - { - const int32_t3 inputPixelCoord = regionStartCoord + int32_t3(ndarray_addressing::snakeCurveInverse(virtualInvocation, preloadRegion)); - float32_t3 inputTexCoord = (inputPixelCoord + float32_t3(0.5f, 0.5f, 0.5f)) / inDims; - const float4 loadedData = inCombinedSamplerAccessor.get(inputTexCoord, workGroupID.z); - - for (uint32_t ch = 0; ch < ConstevalParameters::BlitOutChannelCount; ++ch) - sharedAccessor.set(ch * ConstevalParameters::SMemFloatsPerChannel + virtualInvocation, loadedData[ch]); - } - GroupMemoryBarrierWithGroupSync(); const uint32_t3 iterationRegionPrefixProducts[3] = {iterationRegionXPrefixProducts, iterationRegionYPrefixProducts, iterationRegionZPrefixProducts}; @@ -188,6 +258,7 @@ struct compute_blit_t } } }; +#endif } } diff --git a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl index 1407d7fc77..19bf557012 100644 --- a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl +++ b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl @@ -2,9 +2,9 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/builtin/hlsl/blit/parameters.hlsl" +#include "nbl/builtin/hlsl/blit/compute_blit.hlsl" #include "nbl/builtin/hlsl/blit/common.hlsl" -//#include "nbl/builtin/hlsl/blit/compute_blit.hlsl" /* struct HistogramAccessor @@ -21,22 +21,51 @@ struct KernelWeightsAccessor return kernelWeights[idx]; } }; +*/ -inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties::getIndexCoord(c, l), 0); +namespace nbl +{ +namespace hlsl +{ +template +struct SImageDimensions +{ +/* + static SImageDimensions query(NBL_CONST_REF_ARG(Texture1DArray) tex) + { + SImageDimensions image; + return + } */ + + T width,height,depth; + uint16_t layers; + uint16_t levels : 5; + uint16_t samples : 6; +}; +} +} + struct InImgAccessor { + template + vector extentRcp(const uint16_t level) + { + return pc.inputImageExtentRcp; + } + template) - vector get(const vector uv, uint16_t layer, uint16_t level) + vector get(const vector uv, uint16_t layer, uint16_t level) { return __get_impl(uv,_static_cast(layer),_static_cast(level)); } - + +// private template float32_t4 __get_impl(const vector uv, float layer, float level); - uint32_t descIx : 20; - uint32_t samplerIx : 12; + uint32_t descIx; + uint32_t samplerIx; }; template<> float32_t4 InImgAccessor::__get_impl<1>(const float32_t1 uv, float layer, float level) @@ -56,38 +85,40 @@ float32_t4 InImgAccessor::__get_impl<3>(const float32_t3 uv, float layer, float using namespace nbl::hlsl::blit; -// TODO: push constants - -[numthreads(ConstevalParameters::WorkGroupSize,1,1)] +// https://github.com/microsoft/DirectXShaderCompiler/issues/7001 +//[numthreads(ConstevalParameters::WorkGroupSize,1,1)] +[numthreads(NBL_WORKGROUP_SIZE,1,1)] void main() { InImgAccessor inImgA; + inImgA.descIx = pc.inputDescIx; + inImgA.samplerIx = pc.samplerDescIx; OutImgAccessor outImgA; outImgA.descIx = pc.outputDescIx; - const uint16_t3 wgID = _static_cast(glsl::gl_WorkGroupID()); - const uint16_t3 baseCoord = pc.perWG.getOutputBaseCoord(wgID); - // TODO: If and when someone can be bothered, change the blit api to compile a pipeline per image dimension, maybe it will be faster + const uint16_t3 virtWorkGroupID = _static_cast(glsl::gl_WorkGroupID()); + const uint16_t layer = virtWorkGroupID.z; + // TODO: If and when someone can be bothered, change the blit api to compile a pipeline per image dimension, maybe it will be faster. Library target could be useful for that! 
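// Editor's sketch of the per-dimension-pipeline idea from the TODO above (hypothetical,
// not part of this patch; NBL_BLIT_DIM is an assumed macro supplied by the host the same
// way NBL_WORKGROUP_SIZE already is): the runtime switch below would then collapse into
// one statically selected call, e.g.
//   #if NBL_BLIT_DIM==2
//   if (pc.perWG.doCoverage())
//       blit::execute<true,NBL_WORKGROUP_SIZE,2>(inImgA,outImgA,sharedAccessor,pc.perWG,layer,virtWorkGroupID.xy);
//   else
//       blit::execute<false,NBL_WORKGROUP_SIZE,2>(inImgA,outImgA,sharedAccessor,pc.perWG,layer,virtWorkGroupID.xy);
//   #endif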
switch (pc.perWG.imageDim) { case 1: - outImgA.set(uint16_t1(baseCoord.x),wgID.z,float32_t4(1,0,1,1)); + if (pc.perWG.doCoverage()) + blit::execute(inImgA,outImgA,/*kernelW,histoA,*/sharedAccessor,pc.perWG,layer,uint16_t1(virtWorkGroupID.x)); + else + blit::execute(inImgA,outImgA,/*kernelW,histoA,*/sharedAccessor,pc.perWG,layer,uint16_t1(virtWorkGroupID.x)); break; case 2: - outImgA.set(baseCoord.xy,wgID.z,float32_t4(1,0,1,1)); + if (pc.perWG.doCoverage()) + blit::execute(inImgA,outImgA,/*kernelW,histoA,*/sharedAccessor,pc.perWG,layer,virtWorkGroupID.xy); + else + blit::execute(inImgA,outImgA,/*kernelW,histoA,*/sharedAccessor,pc.perWG,layer,virtWorkGroupID.xy); break; case 3: - outImgA.set(baseCoord,0xdeadu,float32_t4(1,0,1,1)); + if (pc.perWG.doCoverage()) + blit::execute(inImgA,outImgA,/*kernelW,histoA,*/sharedAccessor,pc.perWG,layer,virtWorkGroupID); + else + blit::execute(inImgA,outImgA,/*kernelW,histoA,*/sharedAccessor,pc.perWG,layer,virtWorkGroupID); break; } -/* - blit::compute_blit_t blit = blit::compute_blit_t::create(params); - InCSAccessor inCSA; - OutImgAccessor outImgA; - KernelWeightsAccessor kwA; - HistogramAccessor hA; - SharedAccessor sA; - blit.execute(inCSA, outImgA, kwA, hA, sA, workGroupID, localInvocationIndex); -*/ } \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl index 8e2f4beb23..5e683395d1 100644 --- a/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl +++ b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl @@ -10,7 +10,9 @@ using namespace nbl::hlsl::blit; // TODO: push constants -[numthreads(ConstevalParameters::WorkGroupSize,1,1)] +// https://github.com/microsoft/DirectXShaderCompiler/issues/7001 +//[numthreads(ConstevalParameters::WorkGroupSize,1,1)] +[numthreads(NBL_WORKGROUP_SIZE,1,1)] void main() { } \ No newline at end of file From 30022f5156a88280514363986ecbd2e62489fe4f Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 15 Nov 2024 17:56:01 +0100 Subject: [PATCH 3/6] try to work around DXC assert/bug --- .../nbl/builtin/hlsl/ndarray_addressing.hlsl | 53 ++++--------------- 1 file changed, 11 insertions(+), 42 deletions(-) diff --git a/include/nbl/builtin/hlsl/ndarray_addressing.hlsl b/include/nbl/builtin/hlsl/ndarray_addressing.hlsl index c9390c16c9..c31e705748 100644 --- a/include/nbl/builtin/hlsl/ndarray_addressing.hlsl +++ b/include/nbl/builtin/hlsl/ndarray_addressing.hlsl @@ -13,7 +13,7 @@ namespace hlsl namespace ndarray_addressing { -template::type> +template::type> T snakeCurve(const vector coordinate, const vector extents) { T retval = _static_cast(coordinate[Dims-1]); @@ -25,57 +25,26 @@ T snakeCurve(const vector coordinate, const vector extents) return retval; } -// highly specialized function, requires you know the prefices already and that dimension is higher than 1 // TODO: make an even better one that takes precomputed reciprocals and stuff for fast integer division and modulo -template,uint16_t>::type> // TODO: NBL_REQUIRE Dims>=2 -vector snakeCurveInverse(const U linearIndex, const vector gridDimPrefixProduct) +// https://github.com/milakov/int_fastdiv +template,uint16_t>::type> +vector snakeCurveInverse(const U linearIndex, const vector gridDim) { vector coord; - coord[Dims-1] = linearIndex/gridDimPrefixProduct[Dims-2]; { - U prevRemainder = linearIndex; - for (int32_t i=Dims-2; i>0; i--) + U prev = linearIndex; + U next; + for (int32_t i=0; i -struct snakeCurveInverse -{ - static vector __call(const U linearIndex, 
const vector gridDim) - { - vector gridDimPrefixProduct; - gridDimPrefixProduct[0] = gridDim[0]; - for (int32_t i=1; i(linearIndex,gridDimPrefixProduct); - } -}; -template -struct snakeCurveInverse<1,U,T> -{ - static vector __call(const U linearIndex, const vector gridDim) - { - return vector(linearIndex); - } -}; -} - -template,uint16_t>::type> -vector snakeCurveInverse(const U linearIndex, const vector gridDim) -{ - return impl::snakeCurveInverse::__call(linearIndex,gridDim); -} - } } } From fc00985fd71f9162626becdbb08841e216e12bb8 Mon Sep 17 00:00:00 2001 From: devsh Date: Fri, 15 Nov 2024 17:56:54 +0100 Subject: [PATCH 4/6] add some more type traits --- include/nbl/builtin/hlsl/type_traits.hlsl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl index 58cedd81dd..8124d0e89b 100644 --- a/include/nbl/builtin/hlsl/type_traits.hlsl +++ b/include/nbl/builtin/hlsl/type_traits.hlsl @@ -610,6 +610,8 @@ NBL_CONSTEXPR bool is_floating_point_v = is_floating_point::value; template NBL_CONSTEXPR bool is_signed_v = is_signed::value; template +NBL_CONSTEXPR bool is_fundamental_v = is_fundamental::value; +template NBL_CONSTEXPR bool is_scalar_v = is_scalar::value; template NBL_CONSTEXPR uint32_t alignment_of_v = alignment_of::value; From 6c849c6fa4c9c6d4c950ff98d62f9a03b89ed04a Mon Sep 17 00:00:00 2001 From: devsh Date: Sat, 16 Nov 2024 00:15:07 +0100 Subject: [PATCH 5/6] save some ideas --- .../nbl/builtin/hlsl/blit/compute_blit.hlsl | 226 +++++++----------- .../builtin/hlsl/blit/default_blit.comp.hlsl | 2 +- include/nbl/builtin/hlsl/blit/parameters.hlsl | 136 ++++++++--- 3 files changed, 191 insertions(+), 173 deletions(-) diff --git a/include/nbl/builtin/hlsl/blit/compute_blit.hlsl b/include/nbl/builtin/hlsl/blit/compute_blit.hlsl index 1987ab3930..37c809935b 100644 --- a/include/nbl/builtin/hlsl/blit/compute_blit.hlsl +++ b/include/nbl/builtin/hlsl/blit/compute_blit.hlsl @@ -38,27 +38,30 @@ void execute( const vector virtWorkGroupID ) { + const uint16_t lastChannel = params.lastChannel; + const uint16_t coverageChannel = params.coverageChannel; + using uint16_tN = vector; // the dimensional truncation is desired - const uint16_tN outputTexelsPerWG = uint16_tN(params.getOutputBaseCoord(uint16_t3(1,1,1))); + const uint16_tN outputTexelsPerWG = truncate(params.getOutputBaseCoord(uint16_t3(1,1,1))); // its the min XYZ corner of the area the workgroup will sample from to produce its output - const uint16_tN minOutputPixel = virtWorkGroupID*outputTexelsPerWG; + const uint16_tN minOutputTexel = virtWorkGroupID*outputTexelsPerWG; using float32_tN = vector; - const float32_tN scale = _static_cast(params.scale); - const float32_tN lastInputTexel = _static_cast(params.getInputLastTexel()); + const float32_tN scale = truncate(params.scale); + const float32_tN inputEndCoord = truncate(params.getInputEndCoord()); const uint16_t inLevel = _static_cast(params.inLevel); const float32_tN inImageSizeRcp = inCombinedSamplerAccessor.template extentRcp(inLevel); using int32_tN = vector; - // intermediate result only needed to compute `regionStartCoord`, basically the sampling coordinate of the minOutputPixel in the input texture - const float32_tN noGoodNameForThisThing = (float32_tN(minOutputPixel)+promote(0.5f))*scale-promote(0.5f); + // intermediate result only needed to compute `regionStartCoord`, basically the sampling coordinate of the minOutputTexel in the input texture + const float32_tN noGoodNameForThisThing = 
(float32_tN(minOutputTexel)+promote(0.5f))*scale-promote(0.5f); // can be negative, its the min XYZ corner of the area the workgroup will sample from to produce its output // TODO: is there a HLSL/SPIR-V round() that can simplify ceil(x-0.5)+0.5 ? const float32_tN regionStartCoord = ceil(noGoodNameForThisThing)+promote(0.5f); const float32_tN regionNextStartCoord = ceil(noGoodNameForThisThing+float32_tN(outputTexelsPerWG)*scale)+promote(0.5f); - const uint16_tN preloadRegion = _static_cast(params.getPreloadExtentExceptLast()); + const uint16_tN preloadRegion = truncate(params.getPreloadExtentExceptLast()); const uint16_t localInvocationIndex = _static_cast(glsl::gl_LocalInvocationIndex()); // workgroup::SubgroupContiguousIndex() // need to clear our atomic coverage counter to 0 const uint16_t coverageDWORD = _static_cast(params.coverageDWORD); @@ -68,8 +71,8 @@ void execute( sharedAccessor.set(coverageDWORD,0u); glsl::barrier(); } - const uint16_t preloadLast = _static_cast(params.preloadLast); - for (uint16_t virtualInvocation=localInvocationIndex; virtualInvocation<=preloadLast; virtualInvocation+=WorkGroupSize) + const uint16_t preloadCount = _static_cast(params.preloadCount); + for (uint16_t virtualInvocation=localInvocationIndex; virtualInvocation(virtualInvocation,preloadRegion); @@ -79,37 +82,104 @@ void execute( const float32_t4 loadedData = inCombinedSamplerAccessor.template get(inputTexCoord,layer,inLevel); if (DoCoverage) - if (loadedData[params.coverageChannel]>=params.alphaRefValue && + if (loadedData[coverageChannel]>=params.alphaRefValue && all(inputTexCoordUnnorm=promote(0.5f)) && // within the image from below - all(inputTexCoordUnnorm<=lastInputTexel) // within the image from above + all(inputTexCoordUnnorm(coverageDWORD); + // TODO: atomicIncr or a workgroup reduction of ballots? 
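// Editor's sketch of the "reduction of ballots" alternative from the TODO above
// (hypothetical: `passes` stands in for the alpha/range predicate evaluated before any
// branching on it, and atomicAdd is an assumed add-by-N variant of the accessor's
// existing atomicIncr; SM6 wave intrinsics assumed available):
//   const uint32_t lanesPassing = WaveActiveCountBits(passes);
//   if (WaveIsFirstLane() && lanesPassing!=0u)
//       sharedAccessor.atomicAdd(coverageDWORD,lanesPassing);
// i.e. one shared-memory atomic per wave instead of one per passing invocation.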
+// sharedAccessor.template atomicIncr(coverageDWORD); } [unroll(4)] - for (uint16_t ch=0; ch<4 && ch<=params.lastChannel; ch++) - sharedAccessor.set(preloadLast*ch+ch+virtualInvocation,loadedData[ch]); + for (uint16_t ch=0; ch<4 && ch<=lastChannel; ch++) + sharedAccessor.template set(preloadCount*ch+virtualInvocation,loadedData[ch]); } glsl::barrier(); uint16_t readScratchOffset = uint16_t(0); uint16_t writeScratchOffset = _static_cast(params.secondScratchOffset); + uint16_t prevPassInvocationCount = preloadCount; + uint32_t kernelWeightOffset = 0; + // uint16_tN currentOutRegion = preloadRegion; - currentOutRegion[0] = outputTexelsPerWG[0]; [unroll(3)] for (int32_t axis=0; axis(virtualInvocation,currentOutRegion); - + const uint16_t phaseCount = params.getPhaseCount(axis); + const uint32_t windowExtent = 0x45; + // We sweep along X, then Y, then Z, at every step we need the loads from smem to be performed on consecutive values so that we don't have bank conflicts + currentOutRegion[axis] = outputTexelsPerWG[axis]; // + const uint16_t invocationCount = params.getPassInvocationCount(axis); + for (uint16_t virtualInvocation=localInvocationIndex; virtualInvocation(virtualInvocation,currentOutRegion); + + // we sweep along a line at a time + uint16_t localOutputCoord = virtualInvocation[0]; // TODO + // we can actually compute the output position of this line + const uint16_t globalOutputCoord = localOutputCoord+minOutputTexel[axis]; + // hopefull the compiler will see that float32_t may be possible here due to sizeof(float32_t mantissa)>sizeof(uint16_t) + const uint32_t windowPhase = globalOutputCoord % phaseCount; + + //const int32_t windowStart = ceil(localOutputCoord+0.5f; + + // let us sweep + float32_t4 accum = promote(0.f); + { + uint32_t kernelWeightIndex = windowPhase*windowExtent+kernelWeightOffset; + uint16_t inputIndex = readScratchOffset+0x45; // (minKernelWindow - regionStartCoord[axis]) + combinedStride*preloadRegion[axis]; + for (uint16_t i=0; i(ch*prevPassInvocationCount+inputIndex)*kernelWeight[ch]; + } + } + + // now write outputs + if (axis(outCoord,layer,accum); + if (DoCoverage) + { +// const uint32_t bucketIndex = uint32_t(round(accum[coverageChannel] * float(ConstevalParameters::AlphaBinCount - 1))); +// histogramAccessor.atomicAdd(workGroupID.z,bucketIndex,uint32_t(1)); + } + } + else + { + uint32_t scratchOffset = writeScratchOffset; + if (axis == 0) + scratchOffset += ndarray_addressing::snakeCurve(virtualInvocationID.yxz, uint32_t3(preloadRegion.y, outputTexelsPerWG.x, preloadRegion.z)); + else + scratchOffset += writeScratchOffset + ndarray_addressing::snakeCurve(virtualInvocationID.zxy, uint32_t3(preloadRegion.z, outputTexelsPerWG.y, outputTexelsPerWG.x)); + + [unroll(4)] + for (uint16_t ch=0; ch<4 && ch<=lastChannel; ch++) + sharedAccessor.template set(ch*invocationCount+scratchOffset,accum[ch]); + } + } + glsl::barrier(); + kernelWeightOffset += phaseCount*windowExtent; + prevPassInvocationCount = invocationCount; + // TODO: use Przemog's `nbl::hlsl::swap` method when the float64 stuff gets merged + const uint32_t tmp = readScratchOffset; + readScratchOffset = writeScratchOffset; + writeScratchOffset = tmp; } /* - for (uint16_t virtualInvocation=localInvocationIndex; virtualInvocation -struct compute_blit_t -{ - float32_t3 scale; - float32_t3 negativeSupport; - uint32_t kernelWeightsOffsetY; - uint32_t kernelWeightsOffsetZ; - uint32_t inPixelCount; - uint32_t outPixelCount; - uint16_t3 inDims; - uint16_t3 outDims; - uint16_t3 windowDims; - uint16_t3 phaseCount; 
- uint16_t3 preloadRegion; - uint16_t3 iterationRegionXPrefixProducts; - uint16_t3 iterationRegionYPrefixProducts; - uint16_t3 iterationRegionZPrefixProducts; - uint16_t secondScratchOffset; - - template < - typename InCombinedSamplerAccessor, - typename OutImageAccessor, - typename KernelWeightsAccessor, - typename HistogramAccessor, - typename SharedAccessor> - void execute( - NBL_CONST_REF_ARG(InCombinedSamplerAccessor) inCombinedSamplerAccessor, - NBL_REF_ARG(OutImageAccessor) outImageAccessor, - NBL_CONST_REF_ARG(KernelWeightsAccessor) kernelWeightsAccessor, - NBL_REF_ARG(HistogramAccessor) histogramAccessor, - NBL_REF_ARG(SharedAccessor) sharedAccessor, - uint16_t3 workGroupID, - uint16_t localInvocationIndex) - { - // bottom of the input tile - const uint32_t3 minOutputPixel = workGroupID * outputTexelsPerWG; - - - const uint32_t3 iterationRegionPrefixProducts[3] = {iterationRegionXPrefixProducts, iterationRegionYPrefixProducts, iterationRegionZPrefixProducts}; - - uint32_t readScratchOffset = 0; - uint32_t writeScratchOffset = secondScratchOffset; - for (uint32_t axis = 0; axis < ConstevalParameters::BlitDimCount; ++axis) - { - for (uint32_t virtualInvocation = localInvocationIndex; virtualInvocation < iterationRegionPrefixProducts[axis].z; virtualInvocation += ConstevalParameters::WorkGroupSize) - { - const uint32_t3 virtualInvocationID = ndarray_addressing::snakeCurveInverse(virtualInvocation, iterationRegionPrefixProducts[axis].xy); - uint32_t outputPixel = virtualInvocationID.x; if (axis == 2) outputPixel = virtualInvocationID.z; outputPixel += minOutputPixel[axis]; - if (outputPixel >= outDims[axis]) - break; - const int32_t minKernelWindow = int32_t(ceil((outputPixel + 0.5f) * scale[axis] - 0.5f + negativeSupport[axis])); // Combined stride for the two non-blitting dimensions, tightly coupled and experimentally derived with/by `iterationRegionPrefixProducts` above and the general order of iteration we use to avoid @@ -197,67 +215,7 @@ struct compute_blit_t combinedStride = virtualInvocationID.y * outputTexelsPerWG.y + virtualInvocationID.x; } - uint32_t offset = readScratchOffset + (minKernelWindow - regionStartCoord[axis]) + combinedStride*preloadRegion[axis]; - const uint32_t windowPhase = outputPixel % phaseCount[axis]; - - uint32_t kernelWeightIndex; - if (axis == 0) - kernelWeightIndex = windowPhase * windowDims.x; - else if (axis == 1) - kernelWeightIndex = kernelWeightsOffsetY + windowPhase * windowDims.y; - else if (axis == 2) - kernelWeightIndex = kernelWeightsOffsetZ + windowPhase * windowDims.z; - - float4 kernelWeight = kernelWeightsAccessor.get(kernelWeightIndex); - - float4 accum = float4(0.f, 0.f, 0.f, 0.f); - for (uint32_t ch = 0; ch < ConstevalParameters::BlitOutChannelCount; ++ch) - accum[ch] = sharedAccessor.get(ch * ConstevalParameters::SMemFloatsPerChannel + offset) * kernelWeight[ch]; - - for (uint32_t i = 1; i < windowDims[axis]; ++i) - { - kernelWeightIndex++; - offset++; - - kernelWeight = kernelWeightsAccessor.get(kernelWeightIndex); - for (uint ch = 0; ch < ConstevalParameters::BlitOutChannelCount; ++ch) - accum[ch] += sharedAccessor.get(ch * ConstevalParameters::SMemFloatsPerChannel + offset) * kernelWeight[ch]; - } - - const bool lastPass = (axis == (ConstevalParameters::BlitDimCount - 1)); - if (lastPass) - { - // Tightly coupled with iteration order (`iterationRegionPrefixProducts`) - uint32_t3 outCoord = virtualInvocationID.yxz; - if (axis == 0) - outCoord = virtualInvocationID.xyz; - outCoord += minOutputPixel; - - const uint32_t bucketIndex = 
uint32_t(round(clamp(accum.a, 0, 1) * float(ConstevalParameters::AlphaBinCount-1))); - histogramAccessor.atomicAdd(workGroupID.z, bucketIndex, uint32_t(1)); - - outImageAccessor.set(outCoord, workGroupID.z, accum); - } - else - { - uint32_t scratchOffset = writeScratchOffset; - if (axis == 0) - scratchOffset += ndarray_addressing::snakeCurve(virtualInvocationID.yxz, uint32_t3(preloadRegion.y, outputTexelsPerWG.x, preloadRegion.z)); - else - scratchOffset += writeScratchOffset + ndarray_addressing::snakeCurve(virtualInvocationID.zxy, uint32_t3(preloadRegion.z, outputTexelsPerWG.y, outputTexelsPerWG.x)); - - for (uint32_t ch = 0; ch < ConstevalParameters::BlitOutChannelCount; ++ch) - sharedAccessor.set(ch * ConstevalParameters::SMemFloatsPerChannel + scratchOffset, accum[ch]); - } } - - const uint32_t tmp = readScratchOffset; - readScratchOffset = writeScratchOffset; - writeScratchOffset = tmp; - GroupMemoryBarrierWithGroupSync(); - } - } -}; #endif } diff --git a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl index 19bf557012..28a89c2284 100644 --- a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl +++ b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl @@ -51,7 +51,7 @@ struct InImgAccessor template vector extentRcp(const uint16_t level) { - return pc.inputImageExtentRcp; + return truncate(pc.inputImageExtentRcp); } template) diff --git a/include/nbl/builtin/hlsl/blit/parameters.hlsl b/include/nbl/builtin/hlsl/blit/parameters.hlsl index 163a24fac3..5b67b5e84d 100644 --- a/include/nbl/builtin/hlsl/blit/parameters.hlsl +++ b/include/nbl/builtin/hlsl/blit/parameters.hlsl @@ -5,6 +5,10 @@ #define _NBL_BUILTIN_HLSL_BLIT_PARAMETERS_INCLUDED_ +#include "nbl/builtin/hlsl/bit.hlsl" +#include "nbl/builtin/hlsl/limits.hlsl" + + namespace nbl { namespace hlsl @@ -16,13 +20,9 @@ struct parameters_t { float32_t3 fScale; // float32_t3 negativeSupport; - float32_t referenceAlpha; - uint32_t kernelWeightsOffsetY; - uint32_t kernelWeightsOffsetZ; - uint32_t inPixelCount; - uint32_t outPixelCount; + float32_t referenceAlpha; // - uint16_t3 inputDims; + uint16_t3 inputDims; // uint16_t3 outputDims; uint16_t3 windowDims; // uint16_t3 phaseCount; @@ -30,33 +30,22 @@ struct parameters_t uint16_t3 iterationRegionXPrefixProducts; uint16_t3 iterationRegionYPrefixProducts; uint16_t3 iterationRegionZPrefixProducts; - - uint16_t secondScratchOffset; // - uint16_t outputTexelsPerWGZ; // - - uint32_t3 getOutputTexelsPerWG() - { - //! `outputTexelsPerWG.xy` just happens to be in the first components of `iterationRegionsXPrefixProducts` and `iterationRegionYPrefixProducts` --this is - //! the result of how we choose to iterate, i.e. if, in the future, we decide to iterate differently, this needs to change. 
- return uint32_t3(iterationRegionXPrefixProducts.x, iterationRegionYPrefixProducts.x, outputTexelsPerWGZ); - } }; // We do some dumb things with bitfields here like not using `vector`, because AMD doesn't support them in push constants struct SPerWorkgroup { - static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t _imageDim, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset) + static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t _imageDim, const uint16_t3 output, const uint16_t3 preload, const uint16_t _secondScratchOffDWORD) { SPerWorkgroup retval; retval.scale = _scale; retval.imageDim = _imageDim; retval.preloadWidth = preload[0]; retval.preloadHeight = preload[1]; - retval.preloadDepth = preload[2]; retval.outputWidth = output[0]; retval.outputHeight = output[1]; retval.outputDepth = output[2]; - retval.otherPreloadOffset = _otherPreloadOffset; + retval.secondScratchOffDWORD = _secondScratchOffDWORD; return retval; } @@ -75,28 +64,102 @@ struct SPerWorkgroup return retval; } + inline uint16_t2 getPreloadExtentExceptLast() NBL_CONST_MEMBER_FUNC + { + return uint16_t2(preloadWidth,preloadHeight); + } + + inline uint16_t getPhaseCount(const int32_t axis) NBL_CONST_MEMBER_FUNC + { + switch (axis) + { + case 2: + return phaseCountZ; + break; + case 1: + return phaseCountY; + break; + default: + break; + } + return phaseCountX; + } + + inline uint16_t getPassInvocationCount(const int32_t axis) NBL_CONST_MEMBER_FUNC + { + switch (axis) + { + case 2: + return zPassInvocations; + break; + case 1: + return yPassInvocations; + break; + default: + break; + } + return xPassInvocations; + } + + inline bool doCoverage() NBL_CONST_MEMBER_FUNC + { + return bool(coverageDWORD); + } + + inline uint32_t3 getInputEndCoord() NBL_CONST_MEMBER_FUNC + { + return uint32_t3(inputEndX,inputEndX,inputEndZ); + } + #ifndef __HLSL_VERSION explicit inline operator bool() const { - return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth; + return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadLast; } #endif // ratio of input pixels to output float32_t3 scale; - // whether its an image1D, image2D or image3D - uint32_t imageDim : 2; - uint32_t unused0 : 14; // channel, iterationRegionPrefixSums ? - // 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels + // TODO: rename + float32_t3 negativeSupport; + // 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels per dimension + // TODO: rename `output` to `perWGOutput` uint32_t outputWidth : 16; uint32_t outputHeight : 16; uint32_t outputDepth : 16; - uint32_t preloadWidth : 16; - uint32_t preloadHeight : 16; - uint32_t preloadDepth : 16; + // 16bit because we can theoretically have a very thin preload region + uint32_t preloadWidth : 16; + uint32_t preloadHeight : 16; + // 64kb of smem is absolute max you'll see in the wild + uint32_t preloadCount : 16; + // worst case is a phase of 2^16-1 + // while the last pass invocations need to be less than 64k because of the smem constraint + uint32_t phaseCountX : 16; + uint32_t xPassInvocations : 16; + uint32_t phaseCountY : 16; + uint32_t yPassInvocations : 16; + uint32_t phaseCountZ : 16; + uint32_t zPassInvocations : 16; //! Offset into the shared memory array which tells us from where the second buffer of shared memory begins //! 
Given by max(memory_for_preload_region, memory_for_result_of_y_pass) - uint32_t otherPreloadOffset : 16; + uint32_t secondScratchOffDWORD : 14; + //! coverage settings + uint32_t inputEndX : 17; + uint32_t unused0 : 1; + uint32_t inputEndY : 17; + uint32_t unused1 : 15; + uint32_t inputEndZ : 17; + // whether its an image1D, image2D or image3D + uint32_t imageDim : 2; + // saving a bit + uint32_t lastChannel : 2; + uint32_t inLevel : 5; + uint32_t unused2 : 6; + + //! coverage settings + uint32_t coverageChannel : 2; + uint32_t coverageDWORD : 14; + float32_t alphaRefValue; }; struct Parameters @@ -110,18 +173,15 @@ struct Parameters SPerWorkgroup perWG; // rename to perBlitWG? //! general settings - uint32_t inputDescIx : 19; - uint32_t samplerDescIx : 11; - uint32_t unused0 : 2; - // - uint32_t outputDescIx : 19; - uint32_t channelCount : 3; - uint32_t unused1 : 10; + float32_t3 inputImageExtentRcp; + uint32_t inputDescIx : 20; + uint32_t samplerDescIx : 12; // - uint32_t unused2 : 12; + uint32_t outputDescIx : 20; + uint32_t unused0 : 12; //! coverage settings - uint32_t intermAlphaDescIx : 19; - uint32_t coverage : 1; + uint32_t unused1 : 12; + uint32_t intermAlphaDescIx : 20; // required to compare the atomic count of passing pixels against, so we can get original coverage uint32_t inPixelCount; }; From 9b25533363a669ca9b8b823173fb1e5db9d6bc05 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 19 Nov 2024 01:22:39 +0100 Subject: [PATCH 6/6] Its over --- .../nbl/builtin/hlsl/blit/compute_blit.hlsl | 135 +++----- include/nbl/builtin/hlsl/blit/parameters.hlsl | 299 +++++++++++++----- .../nbl/builtin/hlsl/cpp_compat/promote.hlsl | 14 + .../nbl/builtin/hlsl/ndarray_addressing.hlsl | 2 +- 4 files changed, 287 insertions(+), 163 deletions(-) diff --git a/include/nbl/builtin/hlsl/blit/compute_blit.hlsl b/include/nbl/builtin/hlsl/blit/compute_blit.hlsl index 37c809935b..2ff5148759 100644 --- a/include/nbl/builtin/hlsl/blit/compute_blit.hlsl +++ b/include/nbl/builtin/hlsl/blit/compute_blit.hlsl @@ -5,7 +5,6 @@ #define _NBL_BUILTIN_HLSL_BLIT_INCLUDED_ -#include #include #include @@ -43,26 +42,23 @@ void execute( using uint16_tN = vector; // the dimensional truncation is desired - const uint16_tN outputTexelsPerWG = truncate(params.getOutputBaseCoord(uint16_t3(1,1,1))); + const uint16_tN outputTexelsPerWG = params.template getPerWGOutputExtent(); // its the min XYZ corner of the area the workgroup will sample from to produce its output const uint16_tN minOutputTexel = virtWorkGroupID*outputTexelsPerWG; using float32_tN = vector; const float32_tN scale = truncate(params.scale); - const float32_tN inputEndCoord = truncate(params.getInputEndCoord()); + const float32_tN inputMaxCoord = params.template getInputMaxCoord(); const uint16_t inLevel = _static_cast(params.inLevel); const float32_tN inImageSizeRcp = inCombinedSamplerAccessor.template extentRcp(inLevel); using int32_tN = vector; - // intermediate result only needed to compute `regionStartCoord`, basically the sampling coordinate of the minOutputTexel in the input texture - const float32_tN noGoodNameForThisThing = (float32_tN(minOutputTexel)+promote(0.5f))*scale-promote(0.5f); // can be negative, its the min XYZ corner of the area the workgroup will sample from to produce its output - // TODO: is there a HLSL/SPIR-V round() that can simplify ceil(x-0.5)+0.5 ? 
- const float32_tN regionStartCoord = ceil(noGoodNameForThisThing)+promote(0.5f); - const float32_tN regionNextStartCoord = ceil(noGoodNameForThisThing+float32_tN(outputTexelsPerWG)*scale)+promote(0.5f); + const float32_tN regionStartCoord = params.inputUpperBound(minOutputTexel); + const float32_tN regionNextStartCoord = params.inputUpperBound(minOutputTexel+outputTexelsPerWG); - const uint16_tN preloadRegion = truncate(params.getPreloadExtentExceptLast()); const uint16_t localInvocationIndex = _static_cast(glsl::gl_LocalInvocationIndex()); // workgroup::SubgroupContiguousIndex() + // need to clear our atomic coverage counter to 0 const uint16_t coverageDWORD = _static_cast(params.coverageDWORD); if (DoCoverage) @@ -71,21 +67,23 @@ void execute( sharedAccessor.set(coverageDWORD,0u); glsl::barrier(); } - const uint16_t preloadCount = _static_cast(params.preloadCount); - for (uint16_t virtualInvocation=localInvocationIndex; virtualInvocation preloadLayout = params.getPreloadMeta(); + for (uint16_t virtualInvocation=localInvocationIndex; virtualInvocation(virtualInvocation,preloadRegion); + const uint16_tN virtualInvocationID = preloadLayout.getID(virtualInvocation); const float32_tN inputTexCoordUnnorm = regionStartCoord + float32_tN(virtualInvocationID); - const float32_tN inputTexCoord = inputTexCoordUnnorm * inImageSizeRcp; + const float32_tN inputTexCoord = (inputTexCoordUnnorm + promote(0.5f)) * inImageSizeRcp; const float32_t4 loadedData = inCombinedSamplerAccessor.template get(inputTexCoord,layer,inLevel); if (DoCoverage) if (loadedData[coverageChannel]>=params.alphaRefValue && all(inputTexCoordUnnorm=promote(0.5f)) && // within the image from below - all(inputTexCoordUnnorm=promote(0.f)) && // within the image from below + all(inputTexCoordUnnorm<=inputMaxCoord) // within the image from above ) { // TODO: atomicIncr or a workgroup reduction of ballots? 
@@ -99,30 +97,28 @@ void execute( glsl::barrier(); uint16_t readScratchOffset = uint16_t(0); - uint16_t writeScratchOffset = _static_cast(params.secondScratchOffset); - uint16_t prevPassInvocationCount = preloadCount; + uint16_t writeScratchOffset = _static_cast(params.secondScratchOffDWORD); + const uint16_tN windowExtent = params.template getWindowExtent(); + uint16_t prevLayout = preloadLayout; uint32_t kernelWeightOffset = 0; - // - uint16_tN currentOutRegion = preloadRegion; [unroll(3)] for (int32_t axis=0; axis outputLayout = params.getPassMeta(axis); + const uint16_t invocationCount = outputLayout.getLinearEnd(); const uint16_t phaseCount = params.getPhaseCount(axis); - const uint32_t windowExtent = 0x45; - // We sweep along X, then Y, then Z, at every step we need the loads from smem to be performed on consecutive values so that we don't have bank conflicts - currentOutRegion[axis] = outputTexelsPerWG[axis]; - // - const uint16_t invocationCount = params.getPassInvocationCount(axis); + const uint16_t windowLength = windowExtent[axis]; + const uint16_t prevPassInvocationCount = prevLayout.getLinearEnd(); for (uint16_t virtualInvocation=localInvocationIndex; virtualInvocation(virtualInvocation,currentOutRegion); + const uint16_tN virtualInvocationID = outputLayout.getID(virtualInvocation); - // we sweep along a line at a time - uint16_t localOutputCoord = virtualInvocation[0]; // TODO + // we sweep along a line at a time, `[0]` is not a typo, look at the definition of `params.getPassMeta` + uint16_t localOutputCoord = virtualInvocationID[0]; // we can actually compute the output position of this line const uint16_t globalOutputCoord = localOutputCoord+minOutputTexel[axis]; - // hopefull the compiler will see that float32_t may be possible here due to sizeof(float32_t mantissa)>sizeof(uint16_t) + // hopefull the compiler will see that float32_t may be possible here due to `sizeof(float32_t mantissa)>sizeof(uint16_t)` const uint32_t windowPhase = globalOutputCoord % phaseCount; //const int32_t windowStart = ceil(localOutputCoord+0.5f; @@ -130,9 +126,15 @@ void execute( // let us sweep float32_t4 accum = promote(0.f); { - uint32_t kernelWeightIndex = windowPhase*windowExtent+kernelWeightOffset; - uint16_t inputIndex = readScratchOffset+0x45; // (minKernelWindow - regionStartCoord[axis]) + combinedStride*preloadRegion[axis]; - for (uint16_t i=0; i(outCoord,layer,accum); + const uint32_t scratchOffset = writeScratchOffset+params.template getStorageIndex(axis,virtualInvocationID); + [unroll(4)] + for (uint16_t ch=0; ch<4 && ch<=lastChannel; ch++) + sharedAccessor.template set(ch*invocationCount+scratchOffset,accum[ch]); + } + else + { + const uint16_tN coord = SPerWorkgroup::unswizzle(virtualInvocationID)+minOutputTexel; + outImageAccessor.template set(coord,layer,accum); if (DoCoverage) { // const uint32_t bucketIndex = uint32_t(round(accum[coverageChannel] * float(ConstevalParameters::AlphaBinCount - 1))); // histogramAccessor.atomicAdd(workGroupID.z,bucketIndex,uint32_t(1)); +// intermediateAlphaImageAccessor.template set(coord,layer,accum); } } - else - { - uint32_t scratchOffset = writeScratchOffset; - if (axis == 0) - scratchOffset += ndarray_addressing::snakeCurve(virtualInvocationID.yxz, uint32_t3(preloadRegion.y, outputTexelsPerWG.x, preloadRegion.z)); - else - scratchOffset += writeScratchOffset + ndarray_addressing::snakeCurve(virtualInvocationID.zxy, uint32_t3(preloadRegion.z, outputTexelsPerWG.y, outputTexelsPerWG.x)); - - [unroll(4)] - for (uint16_t ch=0; ch<4 && 
ch<=lastChannel; ch++) - sharedAccessor.template set(ch*invocationCount+scratchOffset,accum[ch]); - } } glsl::barrier(); kernelWeightOffset += phaseCount*windowExtent; - prevPassInvocationCount = invocationCount; + prevLayout = outputLayout; // TODO: use Przemog's `nbl::hlsl::swap` method when the float64 stuff gets merged const uint32_t tmp = readScratchOffset; readScratchOffset = writeScratchOffset; writeScratchOffset = tmp; } -/* - float32_t4 fullValue; - [unroll(4)] - for (uint16_t ch=0; ch<4 && ch<=params.lastChannel; ch++) - { - float32_t value; // TODO - if (DoCoverage && ch==params.coverageChannel) - { - // TODO: global histogram increment - } - fullValue[ch] = value; - } - outImageAccessor.set(minOutputTexel,layer,fullValue); -*/ } -#if 0 - uint32_t outputPixel = virtualInvocationID.x; - if (axis == 2) - outputPixel = virtualInvocationID.z; - outputPixel += minOutputPixel[axis]; - - const int32_t minKernelWindow = int32_t(ceil((outputPixel + 0.5f) * scale[axis] - 0.5f + negativeSupport[axis])); - - // Combined stride for the two non-blitting dimensions, tightly coupled and experimentally derived with/by `iterationRegionPrefixProducts` above and the general order of iteration we use to avoid - // read bank conflicts. - uint32_t combinedStride; - { - if (axis == 0) - combinedStride = virtualInvocationID.z * preloadRegion.y + virtualInvocationID.y; - else if (axis == 1) - combinedStride = virtualInvocationID.z * outputTexelsPerWG.x + virtualInvocationID.y; - else if (axis == 2) - combinedStride = virtualInvocationID.y * outputTexelsPerWG.y + virtualInvocationID.x; - } - - } -#endif - } } } diff --git a/include/nbl/builtin/hlsl/blit/parameters.hlsl b/include/nbl/builtin/hlsl/blit/parameters.hlsl index 5b67b5e84d..d6ef60d19d 100644 --- a/include/nbl/builtin/hlsl/blit/parameters.hlsl +++ b/include/nbl/builtin/hlsl/blit/parameters.hlsl @@ -7,6 +7,7 @@ #include "nbl/builtin/hlsl/bit.hlsl" #include "nbl/builtin/hlsl/limits.hlsl" +#include "nbl/builtin/hlsl/ndarray_addressing.hlsl" namespace nbl @@ -16,25 +17,31 @@ namespace hlsl namespace blit { -struct parameters_t -{ - float32_t3 fScale; // - float32_t3 negativeSupport; - float32_t referenceAlpha; // - - uint16_t3 inputDims; // - uint16_t3 outputDims; - uint16_t3 windowDims; // - uint16_t3 phaseCount; - uint16_t3 preloadRegion; // - uint16_t3 iterationRegionXPrefixProducts; - uint16_t3 iterationRegionYPrefixProducts; - uint16_t3 iterationRegionZPrefixProducts; -}; - // We do some dumb things with bitfields here like not using `vector`, because AMD doesn't support them in push constants struct SPerWorkgroup { + // + template + struct PatchLayout + { + inline uint16_t getLinearEnd() NBL_CONST_MEMBER_FUNC + { + return value[2]; + } + + inline uint16_t getIndex(const vector id) NBL_CONST_MEMBER_FUNC + { + return ndarray_addressing::snakeCurve(id,value); + } + + inline vector getID(const uint16_t linearIndex) NBL_CONST_MEMBER_FUNC + { + return ndarray_addressing::snakeCurveInverse(linearIndex,value); + } + + vector value; + }; + static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t _imageDim, const uint16_t3 output, const uint16_t3 preload, const uint16_t _secondScratchOffDWORD) { SPerWorkgroup retval; @@ -49,56 +56,70 @@ struct SPerWorkgroup return retval; } - inline uint16_t3 getOutputBaseCoord(const uint16_t3 workgroup) NBL_CONST_MEMBER_FUNC - { - return workgroup*uint16_t3(outputWidth,outputHeight,outputDepth); - } + template + vector getPerWGOutputExtent() NBL_CONST_MEMBER_FUNC; inline uint16_t3 
getWorkgroupCount(const uint16_t3 outExtent, const uint16_t layersToBlit=0) NBL_CONST_MEMBER_FUNC { const uint16_t3 unit = uint16_t3(1,1,1); uint16_t3 retval = unit; - retval += (outExtent-unit)/getOutputBaseCoord(unit); + retval += (outExtent-unit)/getPerWGOutputExtent(); if (layersToBlit) retval[2] = layersToBlit; return retval; } - inline uint16_t2 getPreloadExtentExceptLast() NBL_CONST_MEMBER_FUNC + // tells you the first pixel in the other image that's either the same or with a greater coordinate when snapped to the other grid + inline float32_t inputUpperBound(const uint16_t coord, const int32_t axis) { - return uint16_t2(preloadWidth,preloadHeight); + return ceil(float32_t(coord)*scale[axis]+halfScalePlusMinSupportMinusHalf[axis]); + } + template + inline vector inputUpperBound(const vector coord) + { + return ceil(vector(coord)*scale+truncate(halfScalePlusMinSupportMinusHalf)); } - inline uint16_t getPhaseCount(const int32_t axis) NBL_CONST_MEMBER_FUNC + // + template + inline PatchLayout getPreloadMeta() NBL_CONST_MEMBER_FUNC { - switch (axis) + PatchLayout retval; + retval.value[Dims-1] = preloadDepth; + if (Dims>1) { - case 2: - return phaseCountZ; - break; - case 1: - return phaseCountY; - break; - default: - break; + retval.value[0] = preloadWidth; + retval.value[Dims-1] *= retval.value[0]; + // if any problems with indexing OOB, then explicitly specialize + if (Dims>2) + { + retval.value[1] = preloadHeight; + retval.value[Dims-1] *= retval.value[1]; + } } - return phaseCountX; + return retval; } + + template + vector getWindowExtent() NBL_CONST_MEMBER_FUNC; - inline uint16_t getPassInvocationCount(const int32_t axis) NBL_CONST_MEMBER_FUNC + // We sweep along X, then Y, then Z, at every step we need the loads from smem to be performed on consecutive values so that we don't have bank conflicts + // 1st pass input: we output oW x pH x pD, X must be minor on input, for cheap `snakeCurveInverse` for 1D and 2D cases, do XYZ order + // 2nd pass input: we output oW x oH x pD, Y must be minor on input, for cheap `snakeCurveInverse` for 2D case, do YXZ order + // 3rd pass input: we output oW x oH x oD, Z must be minor on input, order can be ZYX or ZXY, but settled on ZYX + template + PatchLayout getPassMeta(const int32_t axis) NBL_CONST_MEMBER_FUNC; + + // can only be called with `axis + uint16_t getStorageIndex(const int32_t axis, const vector coord) NBL_CONST_MEMBER_FUNC; + + template + static vector unswizzle(const vector coord); + + inline uint16_t getPhaseCount(const int32_t axis) NBL_CONST_MEMBER_FUNC { - switch (axis) - { - case 2: - return zPassInvocations; - break; - case 1: - return yPassInvocations; - break; - default: - break; - } - return xPassInvocations; + return axis!=0 ? (axis!=1 ? 
phaseCountZ:phaseCountY):phaseCountX; } inline bool doCoverage() NBL_CONST_MEMBER_FUNC @@ -106,62 +127,196 @@ struct SPerWorkgroup return bool(coverageDWORD); } - inline uint32_t3 getInputEndCoord() NBL_CONST_MEMBER_FUNC - { - return uint32_t3(inputEndX,inputEndX,inputEndZ); - } + template + vector getInputMaxCoord() NBL_CONST_MEMBER_FUNC; #ifndef __HLSL_VERSION explicit inline operator bool() const { - return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadLast; + return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth; } #endif // ratio of input pixels to output float32_t3 scale; - // TODO: rename - float32_t3 negativeSupport; + // `0.5*scale+minSupport-0.5` + float32_t3 halfScalePlusMinSupportMinusHalf; // 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels per dimension // TODO: rename `output` to `perWGOutput` uint32_t outputWidth : 16; uint32_t outputHeight : 16; uint32_t outputDepth : 16; - // 16bit because we can theoretically have a very thin preload region + // 16bit because we can theoretically have a very thin preload region, but 64kb of smem is absolute max you'll see in the wild uint32_t preloadWidth : 16; uint32_t preloadHeight : 16; - // 64kb of smem is absolute max you'll see in the wild - uint32_t preloadCount : 16; + uint32_t preloadDepth : 16; + // kernel gather area for a single output texel + uint32_t windowWidth : 16; + uint32_t windowHeight : 16; + uint32_t windowDepth : 16; // worst case is a phase of 2^16-1 - // while the last pass invocations need to be less than 64k because of the smem constraint uint32_t phaseCountX : 16; - uint32_t xPassInvocations : 16; uint32_t phaseCountY : 16; - uint32_t yPassInvocations : 16; uint32_t phaseCountZ : 16; - uint32_t zPassInvocations : 16; //! Offset into the shared memory array which tells us from where the second buffer of shared memory begins //! Given by max(memory_for_preload_region, memory_for_result_of_y_pass) uint32_t secondScratchOffDWORD : 14; - //! coverage settings - uint32_t inputEndX : 17; - uint32_t unused0 : 1; - uint32_t inputEndY : 17; - uint32_t unused1 : 15; - uint32_t inputEndZ : 17; // whether its an image1D, image2D or image3D - uint32_t imageDim : 2; + uint32_t imageDim : 2; // saving a bit - uint32_t lastChannel : 2; - uint32_t inLevel : 5; - uint32_t unused2 : 6; + uint32_t lastChannel : 2; + uint32_t inLevel : 5; + uint32_t unused2 : 9; //! 
coverage settings - uint32_t coverageChannel : 2; uint32_t coverageDWORD : 14; + uint32_t coverageChannel : 2; + uint32_t inputMaxX : 16; + uint32_t inputMaxY : 16; + uint32_t inputMaxZ : 16; float32_t alphaRefValue; }; +template<> +inline uint16_t1 SPerWorkgroup::getPerWGOutputExtent<1>() NBL_CONST_MEMBER_FUNC +{ + return uint16_t1(outputWidth); +} +template<> +inline uint16_t2 SPerWorkgroup::getPerWGOutputExtent<2>() NBL_CONST_MEMBER_FUNC +{ + return uint16_t2(outputWidth,outputHeight); +} +template<> +inline uint16_t3 SPerWorkgroup::getPerWGOutputExtent<3>() NBL_CONST_MEMBER_FUNC +{ + return uint16_t3(outputWidth,outputHeight,outputDepth); +} + +template<> +inline uint16_t1 SPerWorkgroup::getWindowExtent<1>() NBL_CONST_MEMBER_FUNC +{ + return uint16_t1(windowWidth); +} +template<> +inline uint16_t2 SPerWorkgroup::getWindowExtent<2>() NBL_CONST_MEMBER_FUNC +{ + return uint16_t2(windowWidth,windowHeight); +} +template<> +inline uint16_t3 SPerWorkgroup::getWindowExtent<3>() NBL_CONST_MEMBER_FUNC +{ + return uint16_t3(windowWidth,windowHeight,windowDepth); +} + +template<> +inline SPerWorkgroup::PatchLayout<1> SPerWorkgroup::getPassMeta<1>(const int32_t axis) NBL_CONST_MEMBER_FUNC +{ + PatchLayout<1> retval; + retval.value = uint16_t1(outputWidth); + return retval; +} +// TODO: eliminate the potential for bank conflicts during storage by making sure `outputHeight` used for snake curve addressing is odd +template<> +inline SPerWorkgroup::PatchLayout<2> SPerWorkgroup::getPassMeta<2>(const int32_t axis) NBL_CONST_MEMBER_FUNC +{ + PatchLayout<2> retval; + if (axis==0) // XY + { + retval.value[0] = outputWidth; + retval.value[1] = preloadHeight; + } + else // YX + { + retval.value[0] = outputHeight; + retval.value[1] = outputWidth; + } + retval.value[1] *= retval.value[0]; + return retval; +} +// TODO: eliminate the potential for bank conflicts during storage by making sure `outputHeight` and `outputDepth` used for snake curve addressing is odd +template<> +inline SPerWorkgroup::PatchLayout<3> SPerWorkgroup::getPassMeta<3>(const int32_t axis) NBL_CONST_MEMBER_FUNC +{ + PatchLayout<3> retval; + if (axis==0) // XYZ + { + retval.value[0] = outputWidth; + retval.value[1] = preloadHeight; + retval.value[2] = preloadDepth; + } + else + { + if (axis==1) // YXZ + { + retval.value[0] = outputHeight; + retval.value[1] = outputWidth; + retval.value[2] = preloadDepth; + } + else // ZYX or ZXY, ZYX may cause less bank conflicts if preaload and output extents are both PoT + { + retval.value[0] = outputDepth; + retval.value[1] = outputHeight; + retval.value[2] = outputWidth; + } + } + retval.value[2] *= retval.value[1]*retval.value[0]; + return retval; +} + +// have to specialize the Dims=1 case otherwise code won't compile +template<> +inline uint16_t SPerWorkgroup::getStorageIndex<1>(const int32_t axis, const uint16_t1 coord) NBL_CONST_MEMBER_FUNC +{ + return coord[0]; +} +template<> +inline uint16_t SPerWorkgroup::getStorageIndex<2>(const int32_t axis, const uint16_t2 coord) NBL_CONST_MEMBER_FUNC +{ + return coord[0]*preloadHeight+coord[1]; +} +template<> +inline uint16_t SPerWorkgroup::getStorageIndex<3>(const int32_t axis, const uint16_t3 coord) NBL_CONST_MEMBER_FUNC +{ + if (axis==0) // XYZ was the layout, prepping for YXZ + return (coord[2]*outputWidth+coord[0])*preloadHeight+coord[1]; + // YXZ was the layout, prepping for ZYX + return (coord[1]*outputHeight+coord[0])*preloadDepth+coord[2]; +} + +template<> +inline uint16_t1 SPerWorkgroup::unswizzle<1>(const uint16_t1 coord) +{ + return coord; +} 
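// Editor's note on the unswizzle family (restates the convention from getPassMeta and
// getStorageIndex above, no new behaviour): the final pass iterates its output in
// swizzled order (YX in 2D, ZYX in 3D), so the virtual invocation ID must be flipped
// back to XYZ before the image store; e.g. a 3D ID of (z=5,y=3,x=7) addresses output
// texel (7,3,5).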
+template<> +inline uint16_t2 SPerWorkgroup::unswizzle<2>(const uint16_t2 coord) +{ + return coord.yx; // YX -> XY +} +template<> +inline uint16_t3 SPerWorkgroup::unswizzle<3>(const uint16_t3 coord) +{ + return coord.zyx; // ZYX -> XYZ +} + +template<> +inline uint16_t1 SPerWorkgroup::getInputMaxCoord<1>() NBL_CONST_MEMBER_FUNC +{ + return uint16_t1(inputMaxX); +} +template<> +inline uint16_t2 SPerWorkgroup::getInputMaxCoord<2>() NBL_CONST_MEMBER_FUNC +{ + return uint16_t2(inputMaxX,inputMaxY); +} +template<> +inline uint16_t3 SPerWorkgroup::getInputMaxCoord<3>() NBL_CONST_MEMBER_FUNC +{ + return uint16_t3(inputMaxX,inputMaxY,inputMaxZ); +} + + struct Parameters { #ifndef __HLSL_VERSION diff --git a/include/nbl/builtin/hlsl/cpp_compat/promote.hlsl b/include/nbl/builtin/hlsl/cpp_compat/promote.hlsl index 51ca73f6d3..f1584f3fc7 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/promote.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/promote.hlsl @@ -74,6 +74,20 @@ T promote(const U v) // TODO: use NBL_CONST_REF_ARG(U) instead of U v (circular return _promote(v); } + +//TODO: move to `truncate` header outside of cpp_compat later +template // TODO: NBL_REQUIRE that N<=M +vector truncate(const vector v) // TODO: use NBL_CONST_REF_ARG(U) instead of U v (circular ref) +{ + vector retval; +#ifdef __HLSL_VERSION + [unroll(4)] +#endif + for (int32_t i=0; i::type> -T snakeCurve(const vector coordinate, const vector extents) +T snakeCurve(const vector coordinate, const vector extents) { T retval = _static_cast(coordinate[Dims-1]); for (int32_t i=Dims-2; i>=0; i--)