diff --git a/src/layer/vulkan/cumulativesum_vulkan.cpp b/src/layer/vulkan/cumulativesum_vulkan.cpp
new file mode 100644
index 000000000000..24e8716acaf8
--- /dev/null
+++ b/src/layer/vulkan/cumulativesum_vulkan.cpp
@@ -0,0 +1,267 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "cumulativesum_vulkan.h"
+
+#include "layer_shader_type.h"
+
+namespace ncnn {
+
+// Sets support_vulkan, clears both packing flags and nulls the pipeline pointers.
+CumulativeSum_vulkan::CumulativeSum_vulkan()
+{
+    pipeline_cumulativesum_addoffset = 0;
+    pipeline_cumulativesum_blocksums_scan = 0;
+    pipeline_cumulativesum_blockscan = 0;
+    support_vulkan_any_packing = false;
+    support_vulkan_packing = false;
+    support_vulkan = true;
+}
+
+// Creates the three compute pipelines, each requesting a 256x1x1 local size.
+// NOTE(review): the shaders hard-code 256 lanes (gx >> 8, sdata[256]); confirm
+// that set_optimal_local_size_xyz never picks a smaller size on any device.
+int CumulativeSum_vulkan::create_pipeline(const Option& _opt)
+{
+    Option opt = _opt;
+    std::vector<vk_specialization_type> specializations; // left empty
+
+    Mat local_size_xyz;
+    local_size_xyz.c = 1;
+    local_size_xyz.h = 1;
+    local_size_xyz.w = 256;
+
+    Pipeline** const slots[3] = {&pipeline_cumulativesum_blockscan, &pipeline_cumulativesum_blocksums_scan, &pipeline_cumulativesum_addoffset};
+    const int shaders[3] = {LayerShaderType::cumulativesum_blockscan, LayerShaderType::cumulativesum_blocksums_scan, LayerShaderType::cumulativesum_addoffset};
+    for (int i = 0; i < 3; i++)
+    {
+        Pipeline* pipeline = new Pipeline(vkdev);
+        pipeline->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline->create(shaders[i], opt, specializations);
+        *slots[i] = pipeline;
+    }
+
+    return 0;
+}
+
+// Frees the three pipelines and nulls the pointers, so a repeated call only
+// deletes null pointers.
+int CumulativeSum_vulkan::destroy_pipeline(const Option& /*opt*/)
+{
+    delete pipeline_cumulativesum_blockscan;
+    delete pipeline_cumulativesum_blocksums_scan;
+    delete pipeline_cumulativesum_addoffset;
+
+    pipeline_cumulativesum_blockscan = 0;
+    pipeline_cumulativesum_blocksums_scan = 0;
+    pipeline_cumulativesum_addoffset = 0;
+    return 0;
+}
+
+static inline int cumsum_positive_axis(int axis, int dims)
+{
+    return axis >= 0 ? axis : axis + dims; // negative axes count back from dims
+}
+
+// Describe a scan along `axis` as `linecount` independent lines of `linelen`
+// elements each.
+//
+// dims      rank of the blob; 1 and 2 are handled explicitly, any other value
+//           takes the 3-D path
+// axis      non-negative axis (ignored for 1-D); 0 selects h for 2-D and c for
+//           3-D blobs
+// w, h, c   blob extents
+// linecount receives the product of the extents that are not scanned
+// linelen   receives the extent of the scanned axis
+// Nothing is validated here; forward_inplace() checks dims and axis first.
+static inline void get_line_shape(int dims, int axis, int w, int h, int c, int& linecount, int& linelen)
+{
+    if (dims == 1)
+    {
+        // a vector is one single line
+        linelen = w;
+        linecount = 1;
+    }
+    else if (dims == 2)
+    {
+        // axis 0 walks down the rows (one line per column x),
+        // anything else walks along a row (one line per row y)
+        const bool along_h = (axis == 0);
+        linelen = along_h ? h : w;
+        linecount = along_h ? w : h;
+    }
+    else
+    {
+        switch (axis)
+        {
+        case 0: // along c, one line per (x,y)
+            linelen = c;
+            linecount = w * h;
+            break;
+        case 1: // along h, one line per (q,x)
+            linelen = h;
+            linecount = c * w;
+            break;
+        default: // along w, one line per (q,y)
+            linelen = w;
+            linecount = c * h;
+            break;
+        }
+    }
+}
+
+// Computes the inclusive cumulative sum of bottom_top_blob along `axis`, in
+// place, by recording up to three compute dispatches into cmd.
+//
+// bottom_top_blob  blob of rank 1..3 with elempack 1, overwritten with the result
+// cmd              receives the recorded dispatches
+// opt              provides workspace_vkallocator for the scratch buffers
+//
+// The blob is viewed as `linecount` independent lines of `linelen` elements
+// (see get_line_shape) and every line is cut into blocks of WG = 256 elements:
+//   pass1  blockscan       scans each block on its own and stores the total of
+//                          every block in `blocksums`
+//   pass2  blocksums_scan  turns the block totals of a line into running totals
+//                          (`blockoffsets`)
+//   pass3  addoffset       adds the running total of the preceding blocks to
+//                          every element outside the first block
+// Lines that fit into a single block are complete after pass1.
+//
+// Returns 0 on success (an empty blob is a no-op) and -100 when the blob is
+// packed, has a rank other than 1..3, the axis is out of range, or a scratch
+// buffer cannot be allocated. Nothing is recorded in the failure cases.
+//
+// NOTE(review): linecount is used directly as the dispatch height; very large
+// line counts may exceed the device's workgroup-count limit in y - TODO confirm.
+int CumulativeSum_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
+{
+    if (bottom_top_blob.empty())
+        return 0;
+
+    if (bottom_top_blob.elempack != 1)
+        return -100;
+
+    const int dims = bottom_top_blob.dims;
+    if (dims != 1 && dims != 2 && dims != 3)
+        return -100;
+
+    const int w = bottom_top_blob.w;
+    const int h = bottom_top_blob.h;
+    const int c = bottom_top_blob.c;
+    // the shaders step by cstep between consecutive q slices of a 3-D blob
+    const int cstep = bottom_top_blob.cstep;
+
+    // a 1-D blob has a single axis, so any requested axis maps to 0
+    const int positive_axis = dims == 1 ? 0 : cumsum_positive_axis(axis, dims);
+    if (positive_axis < 0 || positive_axis >= dims)
+        return -100;
+
+    int linecount = 0;
+    int linelen = 0;
+    get_line_shape(dims, positive_axis, w, h, c, linecount, linelen);
+
+    // must match the 256-wide blocks hard-coded in the shaders
+    const int WG = 256;
+    const int blocks_per_line = (linelen + WG - 1) / WG;
+
+    // pass1 stores one total per (line, block) even when every line is a single
+    // block, so this buffer always needs blocks_per_line * linecount elements;
+    // a one-element placeholder would be overrun as soon as linecount > 1
+    VkMat blocksums;
+    blocksums.create(blocks_per_line, linecount, bottom_top_blob.elemsize, 1, opt.workspace_vkallocator);
+    if (blocksums.empty())
+        return -100;
+
+    // running totals are only needed when a line spans several blocks
+    VkMat blockoffsets;
+    if (blocks_per_line > 1)
+    {
+        blockoffsets.create(blocks_per_line, linecount, bottom_top_blob.elemsize, 1, opt.workspace_vkallocator);
+        if (blockoffsets.empty())
+            return -100;
+    }
+
+    // pass1: blockscan
+    {
+        // input and output are the same buffer
+        std::vector<VkMat> bindings(3);
+        bindings[0] = bottom_top_blob;
+        bindings[1] = bottom_top_blob;
+        bindings[2] = blocksums;
+
+        // same order as the push-constant block of the shader
+        std::vector<vk_constant_type> constants(8);
+        constants[0].i = dims;
+        constants[1].i = positive_axis;
+        constants[2].i = w;
+        constants[3].i = h;
+        constants[4].i = c;
+        constants[5].i = cstep;
+        constants[6].i = linelen;
+        constants[7].i = linecount;
+
+        // one 256-lane workgroup per (block, line)
+        VkMat dispatcher;
+        dispatcher.w = blocks_per_line * WG;
+        dispatcher.h = linecount;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_cumulativesum_blockscan, bindings, constants, dispatcher);
+    }
+
+    // a single block per line needs no carry from earlier blocks
+    if (blocks_per_line <= 1)
+        return 0;
+
+    // pass2: scan blocksums
+    {
+        // reads blocksums, writes blockoffsets
+        std::vector<VkMat> bindings(2);
+        bindings[0] = blocksums;
+        bindings[1] = blockoffsets;
+
+        std::vector<vk_constant_type> constants(2);
+        constants[0].i = blocks_per_line;
+        constants[1].i = linecount;
+
+        // one workgroup per line walks all block totals of that line
+        VkMat dispatcher;
+        dispatcher.w = WG;
+        dispatcher.h = linecount;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_cumulativesum_blocksums_scan, bindings, constants, dispatcher);
+    }
+
+    // pass3: add offsets
+    {
+        // updates the blob in place once more; the shader skips block 0
+        std::vector<VkMat> bindings(3);
+        bindings[0] = bottom_top_blob;
+        bindings[1] = bottom_top_blob;
+        bindings[2] = blockoffsets;
+
+        // same order as the push-constant block of the shader
+        std::vector<vk_constant_type> constants(8);
+        constants[0].i = dims;
+        constants[1].i = positive_axis;
+        constants[2].i = w;
+        constants[3].i = h;
+        constants[4].i = c;
+        constants[5].i = cstep;
+        constants[6].i = linelen;
+        constants[7].i = linecount;
+
+        // same grid as pass1
+        VkMat dispatcher;
+        dispatcher.w = blocks_per_line * WG;
+        dispatcher.h = linecount;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_cumulativesum_addoffset, bindings, constants, dispatcher);
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/vulkan/cumulativesum_vulkan.h b/src/layer/vulkan/cumulativesum_vulkan.h
new file mode 100644
index 000000000000..aef833483158
--- /dev/null
+++ b/src/layer/vulkan/cumulativesum_vulkan.h
@@ -0,0 +1,30 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_CUMULATIVESUM_VULKAN_H
+#define LAYER_CUMULATIVESUM_VULKAN_H
+
+#include "cumulativesum.h"
+
+namespace ncnn {
+
+// Vulkan implementation of CumulativeSum built from three compute pipelines.
+class CumulativeSum_vulkan : public CumulativeSum
+{
+public:
+    CumulativeSum_vulkan();
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+
+    using CumulativeSum::forward_inplace; // keep the base-class overloads visible
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // created in create_pipeline(), released in destroy_pipeline()
+    Pipeline* pipeline_cumulativesum_blockscan;
+    Pipeline* pipeline_cumulativesum_blocksums_scan;
+    Pipeline* pipeline_cumulativesum_addoffset;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CUMULATIVESUM_VULKAN_H
diff --git a/src/layer/vulkan/shader/cumulativesum_addoffset.comp b/src/layer/vulkan/shader/cumulativesum_addoffset.comp
new file mode 100644
index 000000000000..65cba58cc5a8
--- /dev/null
+++ b/src/layer/vulkan/shader/cumulativesum_addoffset.comp
@@ -0,0 +1,89 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // elements to adjust
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; // adjusted elements, written at the index they were read from
+layout (binding = 2) readonly buffer blockoffsets_blob { sfp blockoffsets_data[]; }; // per-line offsets, blocks_per_line entries per line
+
+layout (push_constant) uniform parameter
+{
+    int dims; // blob rank: 1 and 2 are special-cased, anything else is indexed as 3-D
+    int axis; // scanned axis
+    int w; // row length
+    int h; // rows per q slice
+    int c; // not read by this shader
+    int cstep; // element offset between consecutive q slices of a 3-D blob
+    int linelen; // elements per line
+    int linecount; // number of lines
+} p;
+
+// Maps (line_id, elem_id) to the flat element offset inside the blob:
+// line_id selects the line, elem_id the position along the scanned axis.
+// Offsets are y * w + x for 2-D and q * cstep + y * w + x for 3-D blobs.
+int index_from_line(int line_id, int elem_id)
+{
+    if (p.dims == 1)
+        return elem_id;
+
+    int x, y, q; // coordinates along w, h and c
+
+    if (p.dims == 2)
+    {
+        // axis 0: one line per column x, otherwise one line per row y
+        x = p.axis == 0 ? line_id : elem_id;
+        y = p.axis == 0 ? elem_id : line_id;
+        return y * p.w + x;
+    }
+
+    if (p.axis == 0)
+    {
+        // line_id = y * w + x, scan over q
+        x = line_id % p.w;
+        y = line_id / p.w;
+        q = elem_id;
+    }
+    else if (p.axis == 1)
+    {
+        // line_id = q * w + x, scan over y
+        x = line_id % p.w;
+        q = line_id / p.w;
+        y = elem_id;
+    }
+    else
+    {
+        // line_id = q * h + y, scan over x
+        y = line_id % p.h;
+        q = line_id / p.h;
+        x = elem_id;
+    }
+
+    return q * p.cstep + y * p.w + x;
+}
+
+// Adds the running total of all preceding blocks to one element.
+// Invocation (x, y) handles element x of line y. Elements of the first block
+// of a line are left untouched because nothing precedes them.
+void main()
+{
+    int elem_id = int(gl_GlobalInvocationID.x);
+    int line_id = int(gl_GlobalInvocationID.y);
+
+    if (line_id >= p.linecount || elem_id >= p.linelen)
+        return;
+
+    int block_id = elem_id / 256; // blocks are 256 elements wide
+    if (block_id == 0)
+        return;
+
+    // block b reads entry b - 1 of its line's offsets
+    int blocks_per_line = (p.linelen + 255) / 256;
+    int oi = line_id * blocks_per_line + block_id - 1;
+    afp offsetv = buffer_ld1(blockoffsets_data, oi);
+
+    int idx = index_from_line(line_id, elem_id);
+    afp v = buffer_ld1(bottom_blob_data, idx);
+    v = v + offsetv;
+    buffer_st1(top_blob_data, idx, v);
+}
diff --git a/src/layer/vulkan/shader/cumulativesum_blockscan.comp b/src/layer/vulkan/shader/cumulativesum_blockscan.comp
new file mode 100644
index 000000000000..e9f35d0b2bd6
--- /dev/null
+++ b/src/layer/vulkan/shader/cumulativesum_blockscan.comp
@@ -0,0 +1,137 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // elements to scan
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; // scanned elements, written at the index they were read from
+layout (binding = 2) writeonly buffer blocksums_blob { sfp blocksums_data[]; }; // one total per (line, block), blocks_per_line entries per line
+
+layout (push_constant) uniform parameter
+{
+    int dims; // blob rank: 1 and 2 are special-cased, anything else is indexed as 3-D
+    int axis; // scanned axis
+    int w; // row length
+    int h; // rows per q slice
+    int c; // not read by this shader
+    int cstep; // element offset between consecutive q slices of a 3-D blob
+    int linelen; // elements per line
+    int linecount; // number of lines
+} p;
+
+shared lfp sdata[256]; // scan scratch, indexed by gl_LocalInvocationID.x
+
+// Maps (line_id, elem_id) to the flat element offset inside the blob.
+//
+// line_id  index of the line, numbered as in the comments below
+// elem_id  position along the scanned axis
+//
+// Offsets are y * w + x for 2-D and q * cstep + y * w + x for 3-D blobs;
+// a 1-D blob is addressed by elem_id alone.
+int index_from_line(int line_id, int elem_id)
+{
+    if (p.dims == 1)
+        return elem_id;
+
+    // coordinates along w, h and c
+    int x;
+    int y;
+    int q;
+
+    if (p.dims == 2)
+    {
+        // axis 0: line_id = x, elem_id = y; otherwise line_id = y, elem_id = x
+        x = p.axis == 0 ? line_id : elem_id;
+        y = p.axis == 0 ? elem_id : line_id;
+        return y * p.w + x;
+    }
+
+    if (p.axis == 0)
+    {
+        // line_id = y * w + x, elem_id = q
+        x = line_id % p.w;
+        y = line_id / p.w;
+        q = elem_id;
+    }
+    else if (p.axis == 1)
+    {
+        // line_id = q * w + x, elem_id = y
+        x = line_id % p.w;
+        q = line_id / p.w;
+        y = elem_id;
+    }
+    else
+    {
+        // line_id = q * h + y, elem_id = x
+        y = line_id % p.h;
+        q = line_id / p.h;
+        x = elem_id;
+    }
+
+    return q * p.cstep + y * p.w + x;
+}
+
+// Scans one 256-element block of one line in shared memory: workgroup
+// (block_id, line_id) loads its slice, runs an inclusive Kogge-Stone scan,
+// writes the scanned values back and stores the block total at
+// blocksums_data[line_id * blocks_per_line + block_id].
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int line_id = int(gl_GlobalInvocationID.y);
+
+    if (line_id >= p.linecount)
+        return;
+
+    int lid = int(gl_LocalInvocationID.x);
+    int block_id = gx >> 8;
+    int elem_id = block_id * 256 + (gx & 255);
+    bool in_range = elem_id < p.linelen;
+
+    // lanes past the end of the line take part in the scan with a zero
+    int idx = 0;
+    afp v = afp(0.0);
+    if (in_range)
+    {
+        idx = index_from_line(line_id, elem_id);
+        v = buffer_ld1(bottom_blob_data, idx);
+    }
+
+    sdata[lid] = sfp2lfp(v);
+    barrier();
+
+    // each round adds the value `stride` lanes to the left; barriers separate reads from writes
+    for (int stride = 1; stride < 256; stride *= 2)
+    {
+        afp left = afp(0.0);
+        if (lid >= stride)
+            left = lfp2afp(sdata[lid - stride]);
+
+        barrier();
+
+        afp acc = lfp2afp(sdata[lid]);
+        acc = acc + left;
+        sdata[lid] = sfp2lfp(acc);
+
+        barrier();
+    }
+
+    if (in_range)
+    {
+        buffer_st1(top_blob_data, idx, lfp2afp(sdata[lid]));
+    }
+
+    // only the last lane publishes the block total
+    if (lid != 255)
+        return;
+
+    // in a partial block the total sits at the last valid lane
+    int tail = 255;
+    if ((block_id + 1) * 256 > p.linelen)
+        tail = max(p.linelen - block_id * 256 - 1, 0);
+
+    int blocks_per_line = (p.linelen + 255) / 256;
+    int o = line_id * blocks_per_line + block_id;
+    afp bsum = lfp2afp(sdata[tail]);
+    buffer_st1(blocksums_data, o, bsum);
+}
diff --git a/src/layer/vulkan/shader/cumulativesum_blocksums_scan.comp b/src/layer/vulkan/shader/cumulativesum_blocksums_scan.comp
new file mode 100644
index 000000000000..dcc8b1586461
--- /dev/null
+++ b/src/layer/vulkan/shader/cumulativesum_blocksums_scan.comp
@@ -0,0 +1,79 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout (binding = 0) readonly buffer blocksums_blob { sfp blocksums_data[]; }; // block totals, blocks_per_line entries per line
+layout (binding = 1) writeonly buffer blockoffsets_blob { sfp blockoffsets_data[]; }; // scanned totals, same layout as the input
+
+layout (push_constant) uniform parameter
+{
+    int blocks_per_line; // entries per line in both buffers
+    int linecount; // number of lines
+} p;
+
+shared lfp sdata[256]; // scan scratch, indexed by gl_LocalInvocationID.x
+
+// Turns the per-block totals of one line into inclusive running totals.
+//
+// blocksums_data     linecount x blocks_per_line totals, one row per line
+// blockoffsets_data  same layout; entry i receives totals[0] + ... + totals[i]
+// One workgroup of 256 invocations handles one line (gl_GlobalInvocationID.y)
+// and walks its totals in chunks of 256. Each chunk is scanned in shared
+// memory, then the sum of all earlier chunks (carry) is added on top.
+// carry is derived by every invocation from the same sdata[255], so all
+// invocations of the workgroup hold the same value without extra sharing.
+void main()
+{
+    int lid = int(gl_LocalInvocationID.x);
+    int line_id = int(gl_GlobalInvocationID.y);
+
+    if (line_id >= p.linecount)
+        return;
+
+    afp carry = afp(0.0); // sum of the chunks already processed
+
+    for (int base = 0; base < p.blocks_per_line; base += 256)
+    {
+        int idx_in = base + lid;
+        int gi = line_id * p.blocks_per_line + idx_in;
+        bool in_range = idx_in < p.blocks_per_line;
+
+        // lanes past the end contribute zero, so sdata[255] ends up holding
+        // the total of the valid entries of this chunk
+        afp v = afp(0.0);
+        if (in_range)
+            v = buffer_ld1(blocksums_data, gi);
+
+        sdata[lid] = sfp2lfp(v);
+        barrier();
+
+        // Kogge-Stone inclusive scan of the chunk
+        for (int offset = 1; offset < 256; offset <<= 1)
+        {
+            afp t = afp(0.0);
+            if (lid >= offset)
+                t = lfp2afp(sdata[lid - offset]);
+
+            barrier();
+
+            afp selfv = lfp2afp(sdata[lid]);
+            selfv = selfv + t;
+            sdata[lid] = sfp2lfp(selfv);
+
+            barrier();
+        }
+
+        // add the carry after the scan; adding it to the inputs would count it once per lane
+        afp scanned = lfp2afp(sdata[lid]) + carry;
+        if (in_range)
+        {
+            buffer_st1(blockoffsets_data, gi, scanned);
+        }
+
+        carry = carry + lfp2afp(sdata[255]);
+
+        // the next chunk overwrites sdata, so wait until every lane has read it
+        barrier();
+    }
+}