diff --git a/src/layer/vulkan/cumulativesum_vulkan.cpp b/src/layer/vulkan/cumulativesum_vulkan.cpp
new file mode 100644
index 000000000000..24e8716acaf8
--- /dev/null
+++ b/src/layer/vulkan/cumulativesum_vulkan.cpp
@@ -0,0 +1,244 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "cumulativesum_vulkan.h"
+
+#include "layer_shader_type.h"
+
+namespace ncnn {
+
+// Workgroup width along x. The cumulativesum_*.comp shaders hard-code the same
+// value for their shared-memory scan and for block indexing.
+static const int CUMSUM_WG = 256;
+
+CumulativeSum_vulkan::CumulativeSum_vulkan()
+{
+    support_vulkan = true;
+    support_vulkan_packing = false;
+    support_vulkan_any_packing = false;
+
+    pipeline_cumulativesum_blockscan = 0;
+    pipeline_cumulativesum_blocksums_scan = 0;
+    pipeline_cumulativesum_addoffset = 0;
+}
+
+// Create the three compute pipelines of the scan with a 256x1x1 workgroup.
+int CumulativeSum_vulkan::create_pipeline(const Option& _opt)
+{
+    Option opt = _opt;
+
+    std::vector<vk_specialization_type> specializations;
+
+    // NOTE(review): the shaders assume exactly 256 invocations along x; confirm
+    // that set_optimal_local_size_xyz cannot shrink this on limited devices.
+    Mat local_size_xyz;
+    local_size_xyz.w = CUMSUM_WG;
+    local_size_xyz.h = 1;
+    local_size_xyz.c = 1;
+
+    pipeline_cumulativesum_blockscan = new Pipeline(vkdev);
+    pipeline_cumulativesum_blockscan->set_optimal_local_size_xyz(local_size_xyz);
+    pipeline_cumulativesum_blockscan->create(LayerShaderType::cumulativesum_blockscan, opt, specializations);
+
+    pipeline_cumulativesum_blocksums_scan = new Pipeline(vkdev);
+    pipeline_cumulativesum_blocksums_scan->set_optimal_local_size_xyz(local_size_xyz);
+    pipeline_cumulativesum_blocksums_scan->create(LayerShaderType::cumulativesum_blocksums_scan, opt, specializations);
+
+    pipeline_cumulativesum_addoffset = new Pipeline(vkdev);
+    pipeline_cumulativesum_addoffset->set_optimal_local_size_xyz(local_size_xyz);
+    pipeline_cumulativesum_addoffset->create(LayerShaderType::cumulativesum_addoffset, opt, specializations);
+
+    return 0;
+}
+
+// Destroy the pipelines and reset the pointers so a repeated call is harmless.
+int CumulativeSum_vulkan::destroy_pipeline(const Option& /*opt*/)
+{
+    delete pipeline_cumulativesum_blockscan;
+    pipeline_cumulativesum_blockscan = 0;
+
+    delete pipeline_cumulativesum_blocksums_scan;
+    pipeline_cumulativesum_blocksums_scan = 0;
+
+    delete pipeline_cumulativesum_addoffset;
+    pipeline_cumulativesum_addoffset = 0;
+
+    return 0;
+}
+
+// Map a negative axis, counted back from dims, to its non-negative index.
+static inline int cumsum_positive_axis(int axis, int dims)
+{
+    return axis < 0 ? dims + axis : axis;
+}
+
+// View the blob as linecount independent lines of linelen elements, where a
+// line runs along the scanned axis.
+static inline void get_line_shape(int dims, int axis, int w, int h, int c, int& linecount, int& linelen)
+{
+    if (dims == 1)
+    {
+        linecount = 1;
+        linelen = w;
+        return;
+    }
+
+    if (dims == 2)
+    {
+        if (axis == 0)
+        {
+            // sum along h, each x is a line
+            linecount = w;
+            linelen = h;
+        }
+        else
+        {
+            // sum along w, each y is a line
+            linecount = h;
+            linelen = w;
+        }
+        return;
+    }
+
+    // dims == 3
+    if (axis == 0)
+    {
+        // sum along c, each (x,y) is a line
+        linecount = w * h;
+        linelen = c;
+    }
+    else if (axis == 1)
+    {
+        // sum along h, each (q,x) is a line
+        linecount = c * w;
+        linelen = h;
+    }
+    else
+    {
+        // sum along w, each (q,y) is a line
+        linecount = c * h;
+        linelen = w;
+    }
+}
+
+// In-place inclusive prefix sum along axis for unpacked 1-D, 2-D and 3-D blobs.
+//
+// pass1 scans every 256-element block of a line and records the block total,
+// pass2 scans the block totals of each line, pass3 adds the preceding total to
+// every block except the first. Lines that fit in one block need pass1 only.
+// Returns 0 on success, -100 on unsupported input or failed allocation.
+int CumulativeSum_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
+{
+    if (bottom_top_blob.empty())
+        return 0;
+
+    if (bottom_top_blob.elempack != 1)
+        return -100;
+
+    const int dims = bottom_top_blob.dims;
+    if (dims != 1 && dims != 2 && dims != 3)
+        return -100;
+
+    const int w = bottom_top_blob.w;
+    const int h = bottom_top_blob.h;
+    const int c = bottom_top_blob.c;
+    const int cstep = bottom_top_blob.cstep;
+
+    int positive_axis = cumsum_positive_axis(axis, dims);
+
+    if (dims == 1)
+    {
+        // a 1-D blob has a single axis whatever the parameter says
+        positive_axis = 0;
+    }
+    else if (positive_axis < 0 || positive_axis >= dims)
+    {
+        return -100;
+    }
+
+    int linecount = 0;
+    int linelen = 0;
+    get_line_shape(dims, positive_axis, w, h, c, linecount, linelen);
+
+    const int blocks_per_line = (linelen + CUMSUM_WG - 1) / CUMSUM_WG;
+
+    // blockscan stores one total per (line, block) even when a line fits in a
+    // single block, so this buffer needs linecount entries in that case too.
+    VkMat blocksums;
+    blocksums.create(blocks_per_line, linecount, bottom_top_blob.elemsize, 1, opt.workspace_vkallocator);
+    if (blocksums.empty())
+        return -100;
+
+    // only needed when pass2 and pass3 run; allocated before anything is recorded
+    VkMat blockoffsets;
+    if (blocks_per_line > 1)
+    {
+        blockoffsets.create(blocks_per_line, linecount, bottom_top_blob.elemsize, 1, opt.workspace_vkallocator);
+        if (blockoffsets.empty())
+            return -100;
+    }
+
+    // push constants shared by blockscan (pass1) and addoffset (pass3)
+    std::vector<vk_constant_type> line_constants(8);
+    line_constants[0].i = dims;
+    line_constants[1].i = positive_axis;
+    line_constants[2].i = w;
+    line_constants[3].i = h;
+    line_constants[4].i = c;
+    line_constants[5].i = cstep;
+    line_constants[6].i = linelen;
+    line_constants[7].i = linecount;
+
+    // one invocation per element slot of every block, one row per line
+    VkMat line_dispatcher;
+    line_dispatcher.w = blocks_per_line * CUMSUM_WG;
+    line_dispatcher.h = linecount;
+    line_dispatcher.c = 1;
+
+    // pass1: blockscan
+    {
+        std::vector<VkMat> bindings(3);
+        bindings[0] = bottom_top_blob;
+        bindings[1] = bottom_top_blob;
+        bindings[2] = blocksums;
+
+        cmd.record_pipeline(pipeline_cumulativesum_blockscan, bindings, line_constants, line_dispatcher);
+    }
+
+    // a single block per line is already a complete scan
+    if (blocks_per_line <= 1)
+        return 0;
+
+    // pass2: scan blocksums
+    {
+        std::vector<VkMat> bindings(2);
+        bindings[0] = blocksums;
+        bindings[1] = blockoffsets;
+
+        std::vector<vk_constant_type> constants(2);
+        constants[0].i = blocks_per_line;
+        constants[1].i = linecount;
+
+        // one workgroup per line walks that line's block totals
+        VkMat dispatcher;
+        dispatcher.w = CUMSUM_WG;
+        dispatcher.h = linecount;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_cumulativesum_blocksums_scan, bindings, constants, dispatcher);
+    }
+
+    // pass3: add offsets
+    {
+        std::vector<VkMat> bindings(3);
+        bindings[0] = bottom_top_blob;
+        bindings[1] = bottom_top_blob;
+        bindings[2] = blockoffsets;
+
+        cmd.record_pipeline(pipeline_cumulativesum_addoffset, bindings, line_constants, line_dispatcher);
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/vulkan/cumulativesum_vulkan.h b/src/layer/vulkan/cumulativesum_vulkan.h
new file mode 100644
index 000000000000..aef833483158
--- /dev/null
+++ b/src/layer/vulkan/cumulativesum_vulkan.h
@@ -0,0 +1,32 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_CUMULATIVESUM_VULKAN_H
+#define LAYER_CUMULATIVESUM_VULKAN_H
+
+#include "cumulativesum.h"
+
+namespace ncnn {
+
+// Vulkan CumulativeSum: in-place prefix sum done as a three-pass block scan.
+class CumulativeSum_vulkan : public CumulativeSum
+{
+public:
+    CumulativeSum_vulkan();
+
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+
+    using CumulativeSum::forward_inplace;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // pass1, pass2 and pass3 pipelines; created in create_pipeline, freed in destroy_pipeline
+    Pipeline* pipeline_cumulativesum_blockscan;
+    Pipeline* pipeline_cumulativesum_blocksums_scan;
+    Pipeline* pipeline_cumulativesum_addoffset;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CUMULATIVESUM_VULKAN_H
diff --git a/src/layer/vulkan/shader/cumulativesum_addoffset.comp b/src/layer/vulkan/shader/cumulativesum_addoffset.comp
new file mode 100644
index 000000000000..65cba58cc5a8
--- /dev/null
+++ b/src/layer/vulkan/shader/cumulativesum_addoffset.comp
@@ -0,0 +1,92 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout (binding = 2) readonly buffer blockoffsets_blob { sfp blockoffsets_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int axis;
+    int w;
+    int h;
+    int c;
+    int cstep;
+    int linelen;
+    int linecount;
+} p;
+
+// Flat buffer index of element elem_id of line line_id for the current dims/axis.
+int index_from_line(int line_id, int elem_id)
+{
+    if (p.dims == 1)
+    {
+        return elem_id;
+    }
+
+    if (p.dims == 2)
+    {
+        if (p.axis == 0)
+        {
+            int x = line_id;
+            int y = elem_id;
+            return y * p.w + x;
+        }
+
+        int x = elem_id;
+        int y = line_id;
+        return y * p.w + x;
+    }
+
+    if (p.axis == 0)
+    {
+        int x = line_id % p.w;
+        int y = line_id / p.w;
+        int q = elem_id;
+        return q * p.cstep + y * p.w + x;
+    }
+
+    if (p.axis == 1)
+    {
+        int x = line_id % p.w;
+        int q = line_id / p.w;
+        int y = elem_id;
+        return q * p.cstep + y * p.w + x;
+    }
+
+    int y = line_id % p.h;
+    int q = line_id / p.h;
+    int x = elem_id;
+    return q * p.cstep + y * p.w + x;
+}
+
+// pass3: add the total of all preceding blocks to every element outside a line's first block.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int line_id = int(gl_GlobalInvocationID.y);
+
+    if (line_id >= p.linecount)
+        return;
+
+    int block_id = gx >> 8;
+    int inblock = gx & 255;
+    int elem_id = block_id * 256 + inblock;
+
+    if (elem_id >= p.linelen || block_id == 0)
+        return;
+
+    int blocks_per_line = (p.linelen + 255) / 256;
+    // blockoffsets is an inclusive scan of block totals, so block b takes entry b - 1
+    int oi = line_id * blocks_per_line + (block_id - 1);
+    afp offsetv = buffer_ld1(blockoffsets_data, oi);
+
+    int idx = index_from_line(line_id, elem_id);
+
+    afp v = buffer_ld1(bottom_blob_data, idx);
+    v = v + offsetv;
+    buffer_st1(top_blob_data, idx, v);
+}
diff --git a/src/layer/vulkan/shader/cumulativesum_blockscan.comp b/src/layer/vulkan/shader/cumulativesum_blockscan.comp
new file mode 100644
index 000000000000..e9f35d0b2bd6
--- /dev/null
+++ b/src/layer/vulkan/shader/cumulativesum_blockscan.comp
@@ -0,0 +1,139 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout (binding = 2) writeonly buffer blocksums_blob { sfp blocksums_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int axis;
+    int w;
+    int h;
+    int c;
+    int cstep;
+    int linelen;
+    int linecount;
+} p;
+
+shared lfp sdata[256];
+
+int index_from_line(int line_id, int elem_id)
+{
+    if (p.dims == 1)
+    {
+        return elem_id;
+    }
+
+    if (p.dims == 2)
+    {
+        // axis=0: line_id=x, elem_id=y
+        if (p.axis == 0)
+        {
+            int x = line_id;
+            int y = elem_id;
+            return y * p.w + x;
+        }
+
+        // axis=1: line_id=y, elem_id=x
+        int x = elem_id;
+        int y = line_id;
+        return y * p.w + x;
+    }
+
+    // dims == 3, index = q*cstep + y*w + x
+    if (p.axis == 0)
+    {
+        // line_id = y*w + x, elem_id = q
+        int x = line_id % p.w;
+        int y = line_id / p.w;
+        int q = elem_id;
+        return q * p.cstep + y * p.w + x;
+    }
+
+    if (p.axis == 1)
+    {
+        // line_id = q*w + x, elem_id = y
+        int x = line_id % p.w;
+        int q = line_id / p.w;
+        int y = elem_id;
+        return q * p.cstep + y * p.w + x;
+    }
+
+    // axis == 2
+    // line_id = q*h + y, elem_id = x
+    int y = line_id % p.h;
+    int q = line_id / p.h;
+    int x = elem_id;
+    return q * p.cstep + y * p.w + x;
+}
+
+// pass1: inclusive scan of one 256-element block of one line, plus that block's total.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int line_id = int(gl_GlobalInvocationID.y);
+
+    if (line_id >= p.linecount)
+        return;
+
+    int lid = int(gl_LocalInvocationID.x);
+    int block_id = gx >> 8;
+    int inblock = gx & 255;
+
+    int elem_id = block_id * 256 + inblock;
+
+    afp v = afp(0.0);
+    if (elem_id < p.linelen)
+    {
+        int idx = index_from_line(line_id, elem_id);
+        v = buffer_ld1(bottom_blob_data, idx);
+    }
+
+    // store to shared as lfp
+    sdata[lid] = sfp2lfp(v);
+    barrier();
+
+    // Kogge-Stone inclusive scan
+    for (int offset = 1; offset < 256; offset <<= 1)
+    {
+        afp t = afp(0.0);
+        if (lid >= offset)
+            t = lfp2afp(sdata[lid - offset]);
+
+        barrier();
+
+        afp selfv = lfp2afp(sdata[lid]);
+        selfv = selfv + t;
+        sdata[lid] = sfp2lfp(selfv);
+
+        barrier();
+    }
+
+    if (elem_id < p.linelen)
+    {
+        int idx = index_from_line(line_id, elem_id);
+        buffer_st1(top_blob_data, idx, lfp2afp(sdata[lid]));
+    }
+
+    // write block sum
+    if (lid == 255)
+    {
+        afp bsum = lfp2afp(sdata[255]);
+
+        if ((block_id + 1) * 256 > p.linelen)
+        {
+            // partial last block: the total sits at the last valid element
+            int last = p.linelen - block_id * 256 - 1;
+            if (last < 0) last = 0;
+            bsum = lfp2afp(sdata[last]);
+        }
+
+        int blocks_per_line = (p.linelen + 255) / 256;
+        int o = line_id * blocks_per_line + block_id;
+        buffer_st1(blocksums_data, o, bsum);
+    }
+}
diff --git a/src/layer/vulkan/shader/cumulativesum_blocksums_scan.comp b/src/layer/vulkan/shader/cumulativesum_blocksums_scan.comp
new file mode 100644
index 000000000000..dcc8b1586461
--- /dev/null
+++ b/src/layer/vulkan/shader/cumulativesum_blocksums_scan.comp
@@ -0,0 +1,77 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout (binding = 0) readonly buffer blocksums_blob { sfp blocksums_data[]; };
+layout (binding = 1) writeonly buffer blockoffsets_blob { sfp blockoffsets_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int blocks_per_line;
+    int linecount;
+} p;
+
+shared lfp sdata[256];
+
+// pass2: one workgroup produces the inclusive scan of one line's block totals,
+// walking the line in chunks of 256 entries.
+void main()
+{
+    int lid = int(gl_LocalInvocationID.x);
+    int line_id = int(gl_GlobalInvocationID.y);
+
+    if (line_id >= p.linecount)
+        return;
+
+    // total of all previous chunks, kept identical in every invocation
+    afp carry = afp(0.0);
+
+    for (int base = 0; base < p.blocks_per_line; base += 256)
+    {
+        int idx_in = base + lid;
+
+        afp v = afp(0.0);
+        if (idx_in < p.blocks_per_line)
+        {
+            int gi = line_id * p.blocks_per_line + idx_in;
+            v = buffer_ld1(blocksums_data, gi);
+        }
+
+        // seed the first entry only; the inclusive scan carries it to the rest
+        if (lid == 0)
+            v = v + carry;
+
+        sdata[lid] = sfp2lfp(v);
+        barrier();
+
+        for (int offset = 1; offset < 256; offset <<= 1)
+        {
+            afp t = afp(0.0);
+            if (lid >= offset)
+                t = lfp2afp(sdata[lid - offset]);
+
+            barrier();
+
+            afp selfv = lfp2afp(sdata[lid]);
+            selfv = selfv + t;
+            sdata[lid] = sfp2lfp(selfv);
+
+            barrier();
+        }
+
+        if (idx_in < p.blocks_per_line)
+        {
+            int go = line_id * p.blocks_per_line + idx_in;
+            buffer_st1(blockoffsets_data, go, lfp2afp(sdata[lid]));
+        }
+
+        // entries past the end of the line were loaded as zero, so the last
+        // slot holds the running total; every invocation reads it so that
+        // carry stays uniform across the workgroup
+        carry = lfp2afp(sdata[255]);
+
+        // keep the next chunk's stores from overtaking the read above
+        barrier();
+    }
+}