diff --git a/src/layer/vulkan/pooling_vulkan.cpp b/src/layer/vulkan/pooling_vulkan.cpp
index 8e38c9a5b23b..69855240b68a 100644
--- a/src/layer/vulkan/pooling_vulkan.cpp
+++ b/src/layer/vulkan/pooling_vulkan.cpp
@@ -1,75 +1,61 @@
-// Copyright 2019 Tencent
+// Copyright 2026 Futz12 <pchar.cn>
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "pooling_vulkan.h"
 
 #include "layer_shader_type.h"
-#include "layer_type.h"
-
-#include <float.h>
 
 namespace ncnn {
 
-Pooling_vulkan::Pooling_vulkan()
+static inline void calc_same_pad(int w, int h, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_mode, int& pl, int& pr, int& pt, int& pb) // splits the total padding needed for a w x h input into left/right/top/bottom, written to pl/pr/pt/pb
 {
-    support_vulkan = true;
-    support_vulkan_packing = true;
-
-    padding = 0;
-    pipeline_pooling = 0;
-    pipeline_pooling_pack4 = 0;
+    int wpad = kernel_w + (w - 1) / stride_w * stride_w - w; // total horizontal padding; the integer division rounds (w - 1) down to a multiple of stride_w
+    int hpad = kernel_h + (h - 1) / stride_h * stride_h - h; // total vertical padding, same formula on the height
+    if (wpad < 0) wpad = 0; // a negative total means no padding is needed
+    if (hpad < 0) hpad = 0; // same clamp for the vertical direction
 
-    pipeline_pooling_adaptive = 0;
-    pipeline_pooling_adaptive_pack4 = 0;
-
-    pipeline_pooling_global_reduce_first = 0;
-    pipeline_pooling_global_reduce_first_pack4 = 0;
-    pipeline_pooling_global_reduce = 0;
-    pipeline_pooling_global_reduce_pack4 = 0;
-    pipeline_pooling_global_reduce_last = 0;
-    pipeline_pooling_global_reduce_last_pack4 = 0;
+    if (pad_mode == 2) // mode 2: when the total is odd, the extra element goes to the right/bottom
+    {
+        pl = wpad / 2; // left gets the rounded-down half
+        pr = wpad - pl; // right gets the remainder
+        pt = hpad / 2; // top gets the rounded-down half
+        pb = hpad - pt; // bottom gets the remainder
+    }
+    else // every other mode value: when the total is odd, the extra element goes to the left/top
+    {
+        pl = wpad - wpad / 2; // left gets the rounded-up half
+        pr = wpad / 2; // right gets the rounded-down half
+        pt = hpad - hpad / 2; // top gets the rounded-up half
+        pb = hpad / 2; // bottom gets the rounded-down half
+    }
 }
 
-int Pooling_vulkan::create_pipeline(const Option& _opt)
+static inline void calc_output_and_pad(int w, int h, // computes the effective padding (pl/pr/pt/pb) and the output size (outw/outh) for a w x h input
+                                       int kernel_w, int kernel_h,
+                                       int stride_w, int stride_h,
+                                       int pad_left, int pad_right, int pad_top, int pad_bottom,
+                                       int pad_mode, int& outw, int& outh,
+                                       int& pl, int& pr, int& pt, int& pb)
 {
-    Option opt = _opt;
-    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
-    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
+    pl = 0; // all four padding outputs start at zero
+    pr = 0;
+    pt = 0;
+    pb = 0;
 
-    // the shape after padding
-    Mat shape_bordered;
-    if (shape.dims != 0)
+    if (pad_mode == 0 || pad_mode == 1) // modes 0 and 1 start from the explicit pad_* arguments
     {
+        pl = pad_left; // copy the caller-supplied padding
+        pr = pad_right;
+        pt = pad_top;
+        pb = pad_bottom;
+
         if (pad_mode == 0)
         {
-            int wtail = (shape.w + pad_left + pad_right - kernel_w) % stride_w;
-            int htail = (shape.h + pad_top + pad_bottom - kernel_h) % stride_h;
+            int wtail = (w + pl + pr - kernel_w) % stride_w; // columns left over after the last whole stride step
+            int htail = (h + pt + pb - kernel_h) % stride_h; // rows left over after the last whole stride step
 
-            int wtailpad = 0;
-            int htailpad = 0;
-            if (wtail != 0)
-                wtailpad = stride_w - wtail;
-            if (htail != 0)
-                htailpad = stride_h - htail;
-
-            shape_bordered = Mat(shape.w + pad_left + pad_right + wtailpad, shape.h + pad_top + pad_bottom + htailpad, shape.c, (void*)0);
-        }
-        else if (pad_mode == 1)
-        {
-            shape_bordered = Mat(shape.w + pad_left + pad_right, shape.h + pad_top + pad_bottom, shape.c, (void*)0);
-        }
-        else if (pad_mode == 2 || pad_mode == 3)
-        {
-            int wpad = kernel_w + (shape.w - 1) / stride_w * stride_w - shape.w;
-            int hpad = kernel_h + (shape.h - 1) / stride_h * stride_h - shape.h;
-            if (wpad > 0 || hpad > 0)
-            {
-                shape_bordered = Mat(shape.w + wpad, shape.h + hpad, shape.c, (void*)0);
-            }
-        }
-        else
-        {
-            shape_bordered = shape;
+            if (wtail != 0) pr += stride_w - wtail; // grow the right padding so the leftover becomes one more whole step
+            if (htail != 0) pb += stride_h - htail; // grow the bottom padding the same way
         }
     }
 
@@ -85,395 +71,274 @@ int Pooling_vulkan::create_pipeline(const Option& _opt)
     }
     else
     {
-        elemsize = elempack * 4u;
-        out_elemsize = out_elempack * 4u;
+        calc_same_pad(w, h, kernel_w, kernel_h, stride_w, stride_h, pad_mode, pl, pr, pt, pb); // in this branch the padding comes from calc_same_pad
     }
 
-    Mat shape_bordered_packed;
-    if (shape_bordered.dims == 1) shape_bordered_packed = Mat(shape_bordered.w / elempack, (void*)0, elemsize, elempack);
-    if (shape_bordered.dims == 2) shape_bordered_packed = Mat(shape_bordered.w, shape_bordered.h / elempack, (void*)0, elemsize, elempack);
-    if (shape_bordered.dims == 3) shape_bordered_packed = Mat(shape_bordered.w, shape_bordered.h, shape_bordered.c / elempack, (void*)0, elemsize, elempack);
+    outw = (w + pl + pr - kernel_w) / stride_w + 1; // output width from the padded width, using integer division
+    outh = (h + pt + pb - kernel_h) / stride_h + 1; // output height from the padded height, using integer division
+}
 
-    Mat out_shape_packed;
-    if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack);
-    if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack);
-    if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);
+Pooling_vulkan::Pooling_vulkan() // sets the capability flags and clears every pipeline pointer
+{
+    support_vulkan = true; // this layer provides a Vulkan implementation
 
-    {
-        padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
-        padding->vkdev = vkdev;
-
-        padding->bottom_shapes.resize(1);
-        padding->bottom_shapes[0] = shape;
-        padding->top_shapes.resize(1);
-        padding->top_shapes[0] = shape_bordered;
-
-        ncnn::ParamDict pd;
-        pd.set(0, pad_top);
-        pd.set(1, pad_bottom);
-        pd.set(2, pad_left);
-        pd.set(3, pad_right);
-        pd.set(4, 0);
-
-        if (pooling_type == PoolMethod_MAX)
-        {
-            // FLT_MAX becomes NaN during fp16 conversion in shader with swiftshader
-            // use a proper fp16-representable max as workaround   --- nihui
-            if (opt.use_fp16_packed || opt.use_fp16_storage || opt.use_fp16_arithmetic)
-                pd.set(5, -65000.f);
-            else
-                pd.set(5, -FLT_MAX);
-        }
-        else if (pooling_type == PoolMethod_AVE)
-        {
-            pd.set(5, 0.f);
-        }
+    support_vulkan_packing = false; // packing flags are switched off; create_pipeline() fixes elempack to 1
+    support_vulkan_any_packing = false;
 
-        padding->load_param(pd);
+    pipeline_pooling = 0; // pipelines start null and are allocated in create_pipeline()
+    pipeline_pooling_tile = 0;
 
-        padding->create_pipeline(opt);
-    }
+    pipeline_pooling_global = 0; // the three global-pooling pipelines
+    pipeline_pooling_global_stage1 = 0;
+    pipeline_pooling_global_stage2 = 0;
+
+    pipeline_pooling_adaptive = 0; // adaptive-pooling pipeline
+}
+
+int Pooling_vulkan::create_pipeline(const Option& _opt) // creates the pipelines for the configured mode (global, adaptive, or windowed); returns 0 on every path
+{
+    Option opt = _opt; // local copy handed to each Pipeline::create() call below
+
+    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; // input shape hint, or an empty Mat when none was given
+    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; // output shape hint, or an empty Mat when none was given
+
+    int elempack = 1; // element packing is fixed to 1 for input and output
+    int out_elempack = 1;
+
+    size_t elemsize = (opt.use_fp16_storage || opt.use_fp16_packed) ? 2u : 4u; // 2 bytes per element when either fp16 option is set, otherwise 4
+    size_t out_elemsize = elemsize;
+
+    Mat shape_packed; // stay empty when the corresponding hint has dims outside 1..3
+    Mat out_shape_packed;
+
+    if (shape.dims == 1) shape_packed = Mat(shape.w, (void*)0, elemsize, elempack); // rebuild the input hint with the elemsize/elempack chosen above
+    if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h, (void*)0, elemsize, elempack);
+    if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c, (void*)0, elemsize, elempack);
+
+    if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w, (void*)0, out_elemsize, out_elempack); // same for the output hint
+    if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h, (void*)0, out_elemsize, out_elempack);
+    if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c, (void*)0, out_elemsize, out_elempack);
 
     if (global_pooling)
     {
-        // reduce first
         {
-            std::vector<vk_specialization_type> specializations(6);
-            specializations[0].i = shape_bordered_packed.w;
-            specializations[1].i = shape_bordered_packed.h;
-            specializations[2].i = shape_bordered_packed.c;
-            specializations[3].i = shape_bordered_packed.cstep;
-            specializations[4].i = 0;
-            specializations[5].i = 0;
-
-            Mat local_size_xyz(64, 1, 1, (void*)0);
-
-            // pack1
-            if (shape.dims == 0 || elempack == 1)
-            {
-                int layer_shader_type = pooling_type == 0 ? LayerShaderType::pooling_global_reduce_max_first : LayerShaderType::pooling_global_reduce_sum_first;
-
-                pipeline_pooling_global_reduce_first = new Pipeline(vkdev);
-                pipeline_pooling_global_reduce_first->set_optimal_local_size_xyz(local_size_xyz);
-                pipeline_pooling_global_reduce_first->create(layer_shader_type, opt, specializations);
-            }
-
-            // pack4
-            if (shape.dims == 0 || elempack == 4)
-            {
-                int layer_shader_type = pooling_type == 0 ? LayerShaderType::pooling_global_reduce_max_first_pack4 : LayerShaderType::pooling_global_reduce_sum_first_pack4;
-
-                pipeline_pooling_global_reduce_first_pack4 = new Pipeline(vkdev);
-                pipeline_pooling_global_reduce_first_pack4->set_optimal_local_size_xyz(local_size_xyz);
-                pipeline_pooling_global_reduce_first_pack4->create(layer_shader_type, opt, specializations);
-            }
+            std::vector<vk_specialization_type> specializations(1); // the pooling type is the only specialization constant
+            specializations[0].i = pooling_type;
+
+            pipeline_pooling_global = new Pipeline(vkdev); // recorded by forward() when it does not take the two-stage path
+            pipeline_pooling_global->set_local_size_xyz(256, 1, 1); // local size 256 x 1 x 1
+            pipeline_pooling_global->create(LayerShaderType::pooling_global, opt, specializations);
         }
 
-        // reduce more
         {
-            std::vector<vk_specialization_type> specializations(5);
-            specializations[0].i = 0;
-            specializations[1].i = shape_bordered_packed.c;
-            specializations[2].i = 0;
-            specializations[3].i = 0;
-            specializations[4].i = 0;
-
-            Mat local_size_xyz(64, 1, 1, (void*)0);
-
-            // pack1
-            if (shape.dims == 0 || elempack == 1)
-            {
-                int layer_shader_type = pooling_type == 0 ? LayerShaderType::pooling_global_reduce_max : LayerShaderType::pooling_global_reduce_sum;
-
-                pipeline_pooling_global_reduce = new Pipeline(vkdev);
-                pipeline_pooling_global_reduce->set_optimal_local_size_xyz(local_size_xyz);
-                pipeline_pooling_global_reduce->create(layer_shader_type, opt, specializations);
-            }
-
-            // pack4
-            if (shape.dims == 0 || elempack == 4)
-            {
-                int layer_shader_type = pooling_type == 0 ? LayerShaderType::pooling_global_reduce_max_pack4 : LayerShaderType::pooling_global_reduce_sum_pack4;
-
-                pipeline_pooling_global_reduce_pack4 = new Pipeline(vkdev);
-                pipeline_pooling_global_reduce_pack4->set_optimal_local_size_xyz(local_size_xyz);
-                pipeline_pooling_global_reduce_pack4->create(layer_shader_type, opt, specializations);
-            }
+            std::vector<vk_specialization_type> specializations(1); // again only the pooling type
+            specializations[0].i = pooling_type;
+
+            pipeline_pooling_global_stage1 = new Pipeline(vkdev); // first pass of the two-stage path; forward() binds a workspace blob as its output
+            pipeline_pooling_global_stage1->set_local_size_xyz(256, 1, 1);
+            pipeline_pooling_global_stage1->create(LayerShaderType::pooling_global_stage1, opt, specializations);
         }
 
-        // reduce last
         {
-            std::vector<vk_specialization_type> specializations(3);
-            specializations[0].i = 0;
-            specializations[1].i = shape_bordered_packed.c;
-            specializations[2].i = 0;
-
-            Mat local_size_xyz(1, 1, 64, (void*)0);
-
-            // pack1
-            if (shape.dims == 0 || elempack == 1)
-            {
-                int layer_shader_type = pooling_type == 0 ? LayerShaderType::pooling_global_reduce_max_last : LayerShaderType::pooling_global_reduce_sum_last;
-
-                pipeline_pooling_global_reduce_last = new Pipeline(vkdev);
-                pipeline_pooling_global_reduce_last->set_optimal_local_size_xyz(local_size_xyz);
-                pipeline_pooling_global_reduce_last->create(layer_shader_type, opt, specializations);
-            }
-
-            // pack4
-            if (shape.dims == 0 || elempack == 4)
-            {
-                int layer_shader_type = pooling_type == 0 ? LayerShaderType::pooling_global_reduce_max_last_pack4 : LayerShaderType::pooling_global_reduce_sum_last_pack4;
-
-                pipeline_pooling_global_reduce_last_pack4 = new Pipeline(vkdev);
-                pipeline_pooling_global_reduce_last_pack4->set_optimal_local_size_xyz(local_size_xyz);
-                pipeline_pooling_global_reduce_last_pack4->create(layer_shader_type, opt, specializations);
-            }
+            std::vector<vk_specialization_type> specializations(1); // again only the pooling type
+            specializations[0].i = pooling_type;
+
+            pipeline_pooling_global_stage2 = new Pipeline(vkdev); // second pass of the two-stage path; forward() binds that workspace blob as input and top_blob as output
+            pipeline_pooling_global_stage2->set_local_size_xyz(256, 1, 1);
+            pipeline_pooling_global_stage2->create(LayerShaderType::pooling_global_stage2, opt, specializations);
         }
+
+        return 0; // global mode creates no other pipelines
     }
-    else if (adaptive_pooling)
+
+    if (adaptive_pooling) // adaptive mode creates a single pipeline and returns
     {
-        std::vector<vk_specialization_type> specializations(1 + 10);
+        std::vector<vk_specialization_type> specializations(5); // pooling type, out_w, out_h and two slots set to 0
         specializations[0].i = pooling_type;
-        specializations[1 + 0].i = shape_bordered_packed.dims;
-        specializations[1 + 1].i = shape_bordered_packed.w;
-        specializations[1 + 2].i = shape_bordered_packed.h;
-        specializations[1 + 3].i = shape_bordered_packed.c;
-        specializations[1 + 4].i = shape_bordered_packed.cstep;
-        specializations[1 + 5].i = out_shape_packed.dims;
-        specializations[1 + 6].i = out_shape_packed.w;
-        specializations[1 + 7].i = out_shape_packed.h;
-        specializations[1 + 8].i = out_shape_packed.c;
-        specializations[1 + 9].i = out_shape_packed.cstep;
-
-        Mat local_size_xyz;
-        if (out_shape_packed.dims != 0)
-        {
-            local_size_xyz.w = std::min(4, out_shape_packed.w);
-            local_size_xyz.h = std::min(4, out_shape_packed.h);
-            local_size_xyz.c = std::min(4, out_shape_packed.c);
-        }
+        specializations[1].i = out_w; // passed as stored; forward() treats -233 in out_w/out_h as "use the input size"
+        specializations[2].i = out_h;
+        specializations[3].i = 0; // NOTE(review): meaning of slots 3 and 4 is defined by the pooling_adaptive shader, not visible here - confirm
+        specializations[4].i = 0;
+
+        pipeline_pooling_adaptive = new Pipeline(vkdev);
+        pipeline_pooling_adaptive->set_local_size_xyz(8, 8, 1); // local size 8 x 8 x 1
+        pipeline_pooling_adaptive->create(LayerShaderType::pooling_adaptive, opt, specializations);
+        return 0;
+    }
 
-        // pack1
-        if (shape.dims == 0 || elempack == 1)
-        {
-            pipeline_pooling_adaptive = new Pipeline(vkdev);
-            pipeline_pooling_adaptive->set_optimal_local_size_xyz(local_size_xyz);
-            pipeline_pooling_adaptive->create(LayerShaderType::pooling_adaptive, opt, specializations);
-        }
+    bool use_tile = true; // selects between the pooling_tile and the plain pooling pipeline below
+    {
+        const int tile_out_w = 8; // tile size in outputs; equals the 8 x 8 local size used below
+        const int tile_out_h = 8;
+        const int tile_in_w = (tile_out_w - 1) * stride_w + kernel_w; // input columns covered by one tile of outputs
+        const int tile_in_h = (tile_out_h - 1) * stride_h + kernel_h; // input rows covered by one tile of outputs
+
+        if (tile_in_w > 36 || tile_in_h > 36) use_tile = false; // NOTE(review): 36 presumably matches a buffer limit in the pooling_tile shader - confirm
+        if (kernel_w <= 0 || kernel_h <= 0) use_tile = false; // a non-positive kernel size disables the tile path
+        if (stride_w <= 0 || stride_h <= 0) use_tile = false; // a non-positive stride disables the tile path
+    }
 
-        // pack4
-        if (shape.dims == 0 || elempack == 4)
-        {
-            pipeline_pooling_adaptive_pack4 = new Pipeline(vkdev);
-            pipeline_pooling_adaptive_pack4->set_optimal_local_size_xyz(local_size_xyz);
-            pipeline_pooling_adaptive_pack4->create(LayerShaderType::pooling_adaptive_pack4, opt, specializations);
-        }
+    std::vector<vk_specialization_type> specializations(11 + 12); // 11 layer parameters followed by 12 shape values
+    specializations[0].i = pooling_type;
+    specializations[1].i = kernel_w;
+    specializations[2].i = kernel_h;
+    specializations[3].i = stride_w;
+    specializations[4].i = stride_h;
+    specializations[5].i = pad_left;
+    specializations[6].i = pad_right;
+    specializations[7].i = pad_top;
+    specializations[8].i = pad_bottom;
+    specializations[9].i = pad_mode;
+    specializations[10].i = avgpool_count_include_pad;
+
+    specializations[11 + 0].i = shape_packed.dims; // input shape hint: dims, w, h, d, c, cstep
+    specializations[11 + 1].i = shape_packed.w;
+    specializations[11 + 2].i = shape_packed.h;
+    specializations[11 + 3].i = shape_packed.d;
+    specializations[11 + 4].i = shape_packed.c;
+    specializations[11 + 5].i = shape_packed.cstep;
+
+    specializations[11 + 6].i = out_shape_packed.dims; // output shape hint in the same order
+    specializations[11 + 7].i = out_shape_packed.w;
+    specializations[11 + 8].i = out_shape_packed.h;
+    specializations[11 + 9].i = out_shape_packed.d;
+    specializations[11 + 10].i = out_shape_packed.c;
+    specializations[11 + 11].i = out_shape_packed.cstep;
+
+    if (use_tile) // exactly one of the two windowed pipelines is created
+    {
+        pipeline_pooling_tile = new Pipeline(vkdev);
+        pipeline_pooling_tile->set_local_size_xyz(8, 8, 1);
+        pipeline_pooling_tile->create(LayerShaderType::pooling_tile, opt, specializations);
     }
     else
     {
-        std::vector<vk_specialization_type> specializations(12 + 10);
-        specializations[0].i = pooling_type;
-        specializations[1].i = kernel_w;
-        specializations[2].i = kernel_h;
-        specializations[3].i = stride_w;
-        specializations[4].i = stride_h;
-        specializations[5].i = pad_left;
-        specializations[6].i = pad_right;
-        specializations[7].i = pad_top;
-        specializations[8].i = pad_bottom;
-        specializations[9].i = global_pooling;
-        specializations[10].i = pad_mode;
-        specializations[11].i = avgpool_count_include_pad;
-        specializations[12 + 0].i = shape_bordered_packed.dims;
-        specializations[12 + 1].i = shape_bordered_packed.w;
-        specializations[12 + 2].i = shape_bordered_packed.h;
-        specializations[12 + 3].i = shape_bordered_packed.c;
-        specializations[12 + 4].i = shape_bordered_packed.cstep;
-        specializations[12 + 5].i = out_shape_packed.dims;
-        specializations[12 + 6].i = out_shape_packed.w;
-        specializations[12 + 7].i = out_shape_packed.h;
-        specializations[12 + 8].i = out_shape_packed.c;
-        specializations[12 + 9].i = out_shape_packed.cstep;
-
-        Mat local_size_xyz;
-        if (out_shape_packed.dims != 0)
-        {
-            local_size_xyz.w = std::min(4, out_shape_packed.w);
-            local_size_xyz.h = std::min(4, out_shape_packed.h);
-            local_size_xyz.c = std::min(4, out_shape_packed.c);
-        }
-
-        // pack1
-        if (shape.dims == 0 || elempack == 1)
-        {
-            pipeline_pooling = new Pipeline(vkdev);
-            pipeline_pooling->set_optimal_local_size_xyz(local_size_xyz);
-            pipeline_pooling->create(LayerShaderType::pooling, opt, specializations);
-        }
-
-        // pack4
-        if (shape.dims == 0 || elempack == 4)
-        {
-            pipeline_pooling_pack4 = new Pipeline(vkdev);
-            pipeline_pooling_pack4->set_optimal_local_size_xyz(local_size_xyz);
-            pipeline_pooling_pack4->create(LayerShaderType::pooling_pack4, opt, specializations);
-        }
+        pipeline_pooling = new Pipeline(vkdev); // plain pooling pipeline for windows the tile check rejected
+        pipeline_pooling->set_local_size_xyz(8, 8, 1);
+        pipeline_pooling->create(LayerShaderType::pooling, opt, specializations);
     }
 
     return 0;
 }
 
-int Pooling_vulkan::destroy_pipeline(const Option& opt)
+int Pooling_vulkan::destroy_pipeline(const Option& /*opt*/) // deletes every pipeline and resets its pointer to 0; the option argument is unused; always returns 0
 {
-    if (padding)
-    {
-        padding->destroy_pipeline(opt);
-        delete padding;
-        padding = 0;
-    }
-
     delete pipeline_pooling;
     pipeline_pooling = 0;
 
-    delete pipeline_pooling_pack4;
-    pipeline_pooling_pack4 = 0;
-
-    delete pipeline_pooling_adaptive;
-    pipeline_pooling_adaptive = 0;
-
-    delete pipeline_pooling_adaptive_pack4;
-    pipeline_pooling_adaptive_pack4 = 0;
-
-    delete pipeline_pooling_global_reduce_first;
-    pipeline_pooling_global_reduce_first = 0;
+    delete pipeline_pooling_tile; // deleting a null pointer is a no-op, so pipelines that were never created need no guard
+    pipeline_pooling_tile = 0;
 
-    delete pipeline_pooling_global_reduce_first_pack4;
-    pipeline_pooling_global_reduce_first_pack4 = 0;
+    delete pipeline_pooling_global; // global pooling, single-pass pipeline
+    pipeline_pooling_global = 0;
 
-    delete pipeline_pooling_global_reduce;
-    pipeline_pooling_global_reduce = 0;
+    delete pipeline_pooling_global_stage1; // global pooling, first pass of the two-stage path
+    pipeline_pooling_global_stage1 = 0;
 
-    delete pipeline_pooling_global_reduce_pack4;
-    pipeline_pooling_global_reduce_pack4 = 0;
+    delete pipeline_pooling_global_stage2; // global pooling, second pass of the two-stage path
+    pipeline_pooling_global_stage2 = 0;
 
-    delete pipeline_pooling_global_reduce_last;
-    pipeline_pooling_global_reduce_last = 0;
-
-    delete pipeline_pooling_global_reduce_last_pack4;
-    pipeline_pooling_global_reduce_last_pack4 = 0;
+    delete pipeline_pooling_adaptive; // adaptive pooling pipeline
+    pipeline_pooling_adaptive = 0;
 
     return 0;
 }
 
-int Pooling_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
+int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
-    if (padding)
+    const int dims = bottom_blob.dims;
+
+    if (dims == 1)
     {
-        padding->upload_model(cmd, opt);
+        top_blob = bottom_blob;
+        return 0;
     }
 
-    return 0;
-}
+    if (dims != 2 && dims != 3)
+        return -100;
 
-int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
-{
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-    size_t elemsize = bottom_blob.elemsize;
-    int elempack = bottom_blob.elempack;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = (dims == 3) ? bottom_blob.c : 1;
+    const size_t elemsize = bottom_blob.elemsize;
 
     if (global_pooling)
     {
-        // reduce first
-        VkMat reduced_blob;
-        {
-            int reduced_size = (w * h + 7) / 8;
-            size_t reduced_elemsize = pooling_type == 0 ? elemsize : 4u * elempack;
-            reduced_blob.create(reduced_size, 1, channels, reduced_elemsize, elempack, opt.workspace_vkallocator);
-            if (reduced_blob.empty())
-                return -100;
+        top_blob.create(channels, elemsize, 1, opt.blob_vkallocator);
+        if (top_blob.empty())
+            return -100;
+
+        const int size = w * h;
 
+        const bool use_two_stage = (channels < 8 && size >= 4096);
+
+        if (!use_two_stage)
+        {
             std::vector<VkMat> bindings(2);
             bindings[0] = bottom_blob;
-            bindings[1] = reduced_blob;
+            bindings[1] = top_blob;
 
-            std::vector<vk_constant_type> constants(6);
-            constants[0].i = bottom_blob.w;
-            constants[1].i = bottom_blob.h;
-            constants[2].i = bottom_blob.c;
+            std::vector<vk_constant_type> constants(4);
+            constants[0].i = w;
+            constants[1].i = h;
+            constants[2].i = channels;
             constants[3].i = bottom_blob.cstep;
-            constants[4].i = reduced_blob.w;
-            constants[5].i = reduced_blob.cstep;
-
-            const Pipeline* pipeline = elempack == 4 ? pipeline_pooling_global_reduce_first_pack4 : pipeline_pooling_global_reduce_first;
 
             VkMat dispatcher;
-            dispatcher.w = reduced_blob.w;
+            dispatcher.w = channels * 256;
             dispatcher.h = 1;
-            dispatcher.c = bottom_blob.c;
+            dispatcher.c = 1;
 
-            cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+            cmd.record_pipeline(pipeline_pooling_global, bindings, constants, dispatcher);
+            return 0;
         }
 
-        // reduce more
-        while (reduced_blob.w > 32)
-        {
-            int reduced_size = (reduced_blob.w + 7) / 8;
-            size_t reduced_elemsize = pooling_type == 0 ? elemsize : 4u * elempack;
-            VkMat reduced_blob2;
-            reduced_blob2.create(reduced_size, 1, channels, reduced_elemsize, elempack, opt.workspace_vkallocator);
-            if (reduced_blob2.empty())
-                return -100;
+        const int wg = 256;
+        const int unroll = 4;
+        const int chunk = wg * unroll;
+        const int partial_w = (size + chunk - 1) / chunk;
 
+        VkMat partial;
+        partial.create(partial_w, channels, elemsize, 1, opt.workspace_vkallocator);
+        if (partial.empty())
+            return -100;
+
+        {
             std::vector<VkMat> bindings(2);
-            bindings[0] = reduced_blob;
-            bindings[1] = reduced_blob2;
+            bindings[0] = bottom_blob;
+            bindings[1] = partial;
 
             std::vector<vk_constant_type> constants(5);
-            constants[0].i = reduced_blob.w;
-            constants[1].i = reduced_blob.c;
-            constants[2].i = reduced_blob.cstep;
-            constants[3].i = reduced_blob2.w;
-            constants[4].i = reduced_blob2.cstep;
-
-            const Pipeline* pipeline = elempack == 4 ? pipeline_pooling_global_reduce_pack4 : pipeline_pooling_global_reduce;
+            constants[0].i = w;
+            constants[1].i = h;
+            constants[2].i = channels;
+            constants[3].i = bottom_blob.cstep;
+            constants[4].i = partial_w;
 
             VkMat dispatcher;
-            dispatcher.w = reduced_blob2.w;
-            dispatcher.h = 1;
-            dispatcher.c = reduced_blob2.c;
-
-            cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+            dispatcher.w = partial_w * 256;
+            dispatcher.h = channels;
+            dispatcher.c = 1;
 
-            reduced_blob = reduced_blob2;
+            cmd.record_pipeline(pipeline_pooling_global_stage1, bindings, constants, dispatcher);
         }
 
-        // reduce last
         {
-            top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator);
-            if (top_blob.empty())
-                return -100;
-
             std::vector<VkMat> bindings(2);
-            bindings[0] = reduced_blob;
+            bindings[0] = partial;
             bindings[1] = top_blob;
 
-            std::vector<vk_constant_type> constants(4);
-            constants[0].i = reduced_blob.w;
-            constants[1].i = reduced_blob.c;
-            constants[2].i = reduced_blob.cstep;
-            constants[3].i = w * h;
-
-            const Pipeline* pipeline = elempack == 4 ? pipeline_pooling_global_reduce_last_pack4 : pipeline_pooling_global_reduce_last;
+            std::vector<vk_constant_type> constants(3);
+            constants[0].i = partial_w;
+            constants[1].i = channels;
+            constants[2].i = size;
 
             VkMat dispatcher;
-            dispatcher.w = 1;
+            dispatcher.w = channels * 256;
             dispatcher.h = 1;
-            dispatcher.c = top_blob.w;
+            dispatcher.c = 1;
 
-            cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+            cmd.record_pipeline(pipeline_pooling_global_stage2, bindings, constants, dispatcher);
         }
 
         return 0;
@@ -481,16 +346,16 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
 
     if (adaptive_pooling)
     {
-        int _out_w = out_w == -233 ? w : out_w;
-        int _out_h = out_h == -233 ? h : out_h;
+        int outw = out_w == -233 ? w : out_w;
+        int outh = out_h == -233 ? h : out_h;
 
-        if (_out_w == w && _out_h == h)
+        if (outw == w && outh == h)
         {
             top_blob = bottom_blob;
             return 0;
         }
 
-        top_blob.create(_out_w, _out_h, channels, elemsize, elempack, opt.blob_vkallocator);
+        top_blob.create(outw, outh, channels, elemsize, 1, opt.blob_vkallocator);
         if (top_blob.empty())
             return -100;
 
@@ -498,155 +363,64 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
         bindings[0] = bottom_blob;
         bindings[1] = top_blob;
 
-        std::vector<vk_constant_type> constants(10);
+        std::vector<vk_constant_type> constants(12);
         constants[0].i = bottom_blob.dims;
         constants[1].i = bottom_blob.w;
         constants[2].i = bottom_blob.h;
-        constants[3].i = bottom_blob.c;
-        constants[4].i = bottom_blob.cstep;
-        constants[5].i = top_blob.dims;
-        constants[6].i = top_blob.w;
-        constants[7].i = top_blob.h;
-        constants[8].i = top_blob.c;
-        constants[9].i = top_blob.cstep;
-
-        const Pipeline* pipeline = elempack == 4 ? pipeline_pooling_adaptive_pack4 : pipeline_pooling_adaptive;
-
-        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
-
+        constants[3].i = bottom_blob.d;
+        constants[4].i = (dims == 3) ? bottom_blob.c : 1;
+        constants[5].i = bottom_blob.cstep;
+        constants[6].i = top_blob.dims;
+        constants[7].i = top_blob.w;
+        constants[8].i = top_blob.h;
+        constants[9].i = top_blob.d;
+        constants[10].i = (dims == 3) ? top_blob.c : 1;
+        constants[11].i = top_blob.cstep;
+
+        cmd.record_pipeline(pipeline_pooling_adaptive, bindings, constants, top_blob);
         return 0;
     }
 
-    VkMat bottom_blob_bordered = bottom_blob;
-
-    int wtailpad = 0;
-    int htailpad = 0;
-
-    if (pad_mode == 0) // full padding
+    if (kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0 && pad_mode == 1)
     {
-        int wtail = (w + pad_left + pad_right - kernel_w) % stride_w;
-        int htail = (h + pad_top + pad_bottom - kernel_h) % stride_h;
-
-        if (wtail != 0)
-            wtailpad = stride_w - wtail;
-        if (htail != 0)
-            htailpad = stride_h - htail;
-
-        Option opt_pad = opt;
-        opt_pad.blob_vkallocator = opt.workspace_vkallocator;
-
-        VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
-        int* padding_params = padding_param_blob.mapped();
-
-        padding_params[0] = pad_top;
-        padding_params[1] = pad_bottom + htailpad;
-        padding_params[2] = pad_left;
-        padding_params[3] = pad_right + wtailpad;
-        padding_params[4] = 0;
-        padding_params[5] = 0;
-
-        std::vector<VkMat> padding_inputs(2);
-        padding_inputs[0] = bottom_blob;
-        padding_inputs[1] = padding_param_blob;
-
-        std::vector<VkMat> padding_outputs(1);
-        padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
-        bottom_blob_bordered = padding_outputs[0];
+        top_blob = bottom_blob;
+        return 0;
     }
-    else if (pad_mode == 1) // valid padding
-    {
-        Option opt_pad = opt;
-        opt_pad.blob_vkallocator = opt.workspace_vkallocator;
 
-        padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad);
-    }
-    else if (pad_mode == 2) // tensorflow padding=SAME or onnx padding=SAME_UPPER
+    int outw, outh;
+    int pl, pr, pt, pb;
+    calc_output_and_pad(w, h, kernel_w, kernel_h, stride_w, stride_h, pad_left, pad_right, pad_top, pad_bottom, pad_mode, outw, outh, pl, pr, pt, pb);
+
+    if (dims == 2)
     {
-        int wpad = kernel_w + (w - 1) / stride_w * stride_w - w;
-        int hpad = kernel_h + (h - 1) / stride_h * stride_h - h;
-        if (wpad > 0 || hpad > 0)
-        {
-            Option opt_pad = opt;
-            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
-
-            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
-            int* padding_params = padding_param_blob.mapped();
-
-            padding_params[0] = hpad / 2;
-            padding_params[1] = hpad - hpad / 2;
-            padding_params[2] = wpad / 2;
-            padding_params[3] = wpad - wpad / 2;
-            padding_params[4] = 0;
-            padding_params[5] = 0;
-
-            std::vector<VkMat> padding_inputs(2);
-            padding_inputs[0] = bottom_blob;
-            padding_inputs[1] = padding_param_blob;
-
-            std::vector<VkMat> padding_outputs(1);
-            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
-            bottom_blob_bordered = padding_outputs[0];
-        }
+        top_blob.create(outw, outh, elemsize, 1, opt.blob_vkallocator);
     }
-    else if (pad_mode == 3) // onnx padding=SAME_LOWER
+    else
     {
-        int wpad = kernel_w + (w - 1) / stride_w * stride_w - w;
-        int hpad = kernel_h + (h - 1) / stride_h * stride_h - h;
-        if (wpad > 0 || hpad > 0)
-        {
-            Option opt_pad = opt;
-            opt_pad.blob_vkallocator = opt.workspace_vkallocator;
-
-            VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
-            int* padding_params = padding_param_blob.mapped();
-
-            padding_params[0] = hpad - hpad / 2;
-            padding_params[1] = hpad / 2;
-            padding_params[2] = wpad - wpad / 2;
-            padding_params[3] = wpad / 2;
-            padding_params[4] = 0;
-            padding_params[5] = 0;
-
-            std::vector<VkMat> padding_inputs(2);
-            padding_inputs[0] = bottom_blob;
-            padding_inputs[1] = padding_param_blob;
-
-            std::vector<VkMat> padding_outputs(1);
-            padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
-            bottom_blob_bordered = padding_outputs[0];
-        }
+        top_blob.create(outw, outh, channels, elemsize, 1, opt.blob_vkallocator);
     }
-
-    w = bottom_blob_bordered.w;
-    h = bottom_blob_bordered.h;
-
-    int outw = (w - kernel_w) / stride_w + 1;
-    int outh = (h - kernel_h) / stride_h + 1;
-
-    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
     if (top_blob.empty())
         return -100;
 
     std::vector<VkMat> bindings(2);
-    bindings[0] = bottom_blob_bordered;
+    bindings[0] = bottom_blob;
     bindings[1] = top_blob;
 
     std::vector<vk_constant_type> constants(12);
-    constants[0].i = bottom_blob_bordered.dims;
-    constants[1].i = bottom_blob_bordered.w;
-    constants[2].i = bottom_blob_bordered.h;
-    constants[3].i = bottom_blob_bordered.c;
-    constants[4].i = bottom_blob_bordered.cstep;
-    constants[5].i = top_blob.dims;
-    constants[6].i = top_blob.w;
-    constants[7].i = top_blob.h;
-    constants[8].i = top_blob.c;
-    constants[9].i = top_blob.cstep;
-    constants[10].i = wtailpad;
-    constants[11].i = htailpad;
-
-    const Pipeline* pipeline = elempack == 4 ? pipeline_pooling_pack4 : pipeline_pooling;
-
+    constants[0].i = bottom_blob.dims;
+    constants[1].i = bottom_blob.w;
+    constants[2].i = bottom_blob.h;
+    constants[3].i = bottom_blob.d;
+    constants[4].i = (dims == 3) ? bottom_blob.c : 1;
+    constants[5].i = bottom_blob.cstep;
+    constants[6].i = top_blob.dims;
+    constants[7].i = top_blob.w;
+    constants[8].i = top_blob.h;
+    constants[9].i = top_blob.d;
+    constants[10].i = (dims == 3) ? top_blob.c : 1;
+    constants[11].i = top_blob.cstep;
+
+    const Pipeline* pipeline = pipeline_pooling_tile ? pipeline_pooling_tile : pipeline_pooling;
     cmd.record_pipeline(pipeline, bindings, constants, top_blob);
 
     return 0;
diff --git a/src/layer/vulkan/pooling_vulkan.h b/src/layer/vulkan/pooling_vulkan.h
index 273d9aa7f2ff..bb11414c7f9a 100644
--- a/src/layer/vulkan/pooling_vulkan.h
+++ b/src/layer/vulkan/pooling_vulkan.h
@@ -1,4 +1,4 @@
-// Copyright 2019 Tencent
+// Copyright 2026 Futz12 <pchar.cn>
 // SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef LAYER_POOLING_VULKAN_H
@@ -16,26 +16,18 @@ class Pooling_vulkan : public Pooling
     virtual int create_pipeline(const Option& opt);
     virtual int destroy_pipeline(const Option& opt);
 
-    virtual int upload_model(VkTransfer& cmd, const Option& opt);
-
     using Pooling::forward;
     virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
 
 public:
-    ncnn::Layer* padding;
-
     Pipeline* pipeline_pooling;
-    Pipeline* pipeline_pooling_pack4;
+    Pipeline* pipeline_pooling_tile;
+
+    Pipeline* pipeline_pooling_global;
+    Pipeline* pipeline_pooling_global_stage1;
+    Pipeline* pipeline_pooling_global_stage2;
 
     Pipeline* pipeline_pooling_adaptive;
-    Pipeline* pipeline_pooling_adaptive_pack4;
-
-    Pipeline* pipeline_pooling_global_reduce_first;
-    Pipeline* pipeline_pooling_global_reduce_first_pack4;
-    Pipeline* pipeline_pooling_global_reduce;
-    Pipeline* pipeline_pooling_global_reduce_pack4;
-    Pipeline* pipeline_pooling_global_reduce_last;
-    Pipeline* pipeline_pooling_global_reduce_last_pack4;
 };
 
 } // namespace ncnn
diff --git a/src/layer/vulkan/shader/pooling.comp b/src/layer/vulkan/shader/pooling.comp
index b52ac3c8b7dd..fd2e9dc1157f 100644
--- a/src/layer/vulkan/shader/pooling.comp
+++ b/src/layer/vulkan/shader/pooling.comp
@@ -1,35 +1,34 @@
-// Copyright 2018 Tencent
+// Copyright 2026 Futz12 <pchar.cn>
 // SPDX-License-Identifier: BSD-3-Clause
 
 #version 450
 
-#define FLT_MAX 3.402823466e+38
-
 layout(constant_id = 0) const int pooling_type = 0;
-layout(constant_id = 1) const int kernel_w = 1;
-layout(constant_id = 2) const int kernel_h = 1;
+layout(constant_id = 1) const int kernel_w = 0;
+layout(constant_id = 2) const int kernel_h = 0;
 layout(constant_id = 3) const int stride_w = 1;
 layout(constant_id = 4) const int stride_h = 1;
 layout(constant_id = 5) const int pad_left = 0;
 layout(constant_id = 6) const int pad_right = 0;
 layout(constant_id = 7) const int pad_top = 0;
 layout(constant_id = 8) const int pad_bottom = 0;
-layout(constant_id = 9) const int global_pooling = 0;
-layout(constant_id = 10) const int pad_mode = 0;
-layout(constant_id = 11) const int avgpool_count_include_pad = 0;
+layout(constant_id = 9) const int pad_mode = 0;
+layout(constant_id = 10) const int avgpool_count_include_pad = 0;
 
-#define shape_constant_id_offset 12
+#define shape_constant_id_offset 11
 layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int d = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0;
 
-layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outd = 0;
+layout(constant_id = shape_constant_id_offset + 10) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
 
 layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
 layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
@@ -39,107 +38,150 @@ layout(push_constant) uniform parameter
     int dims;
     int w;
     int h;
+    int d;
     int c;
     int cstep;
 
     int outdims;
     int outw;
     int outh;
+    int outd;
     int outc;
     int outcstep;
-
-    int wtailpad;
-    int htailpad;
 } p;
 
 void main()
 {
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
+    int ox = int(gl_GlobalInvocationID.x); // output column handled by this invocation
+    int oy = int(gl_GlobalInvocationID.y); // output row
+    int oz = int(gl_GlobalInvocationID.z); // channel index (bounded by outc)
 
-    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
+    if (ox >= psc(outw) || oy >= psc(outh) || oz >= psc(outc))
         return;
 
-    afp res;
+    int pl;
+    int pr; // NOTE(review): pr and pb are assigned below but never read again in main()
+    int pt;
+    int pb;
 
-    if (pooling_type == 0)
+    if (pad_mode == 0 || pad_mode == 1) // these two modes take the pads from the spec constants
     {
-        res = afp(-FLT_MAX);
+        pl = pad_left;
+        pr = pad_right;
+        pt = pad_top;
+        pb = pad_bottom;
 
-        int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w;
-
-        for (int y = 0; y < kernel_h; y++)
+        if (pad_mode == 0) // grow right/bottom pad until the padded extent is a whole number of strides
         {
-            for (int x = 0; x < kernel_w; x++)
-            {
-                afp v = buffer_ld1(bottom_blob_data, v_offset + x);
-                res = max(res, v);
-            }
-
-            v_offset += psc(w);
+            int wtail = (psc(w) + pl + pr - kernel_w) % stride_w;
+            int htail = (psc(h) + pt + pb - kernel_h) % stride_h;
+            if (wtail != 0) pr += stride_w - wtail;
+            if (htail != 0) pb += stride_h - htail;
         }
     }
-    if (pooling_type == 1 && avgpool_count_include_pad == 0)
+    else // every other pad_mode: derive the total pad from kernel, stride and input size
     {
-        res = afp(0.f);
-        int area = 0;
+        int wpad = kernel_w + (psc(w) - 1) / stride_w * stride_w - psc(w);
+        int hpad = kernel_h + (psc(h) - 1) / stride_h * stride_h - psc(h);
+        if (wpad < 0) wpad = 0;
+        if (hpad < 0) hpad = 0;
 
-        int sx = gx * stride_w;
-        int sy = gy * stride_h;
+        if (pad_mode == 2) // the smaller half of an odd pad goes to left/top
+        {
+            pl = wpad / 2;
+            pr = wpad - pl;
+            pt = hpad / 2;
+            pb = hpad - pt;
+        }
+        else // the larger half of an odd pad goes to left/top
+        {
+            pl = wpad - wpad / 2;
+            pr = wpad / 2;
+            pt = hpad - hpad / 2;
+            pb = hpad / 2;
+        }
+    }
+
+    int inx0 = ox * stride_w - pl; // input x of the window's first tap; negative means inside the left pad
+    int iny0 = oy * stride_h - pt; // input y of the window's first tap; negative means inside the top pad
 
-        int v_offset = gz * psc(cstep) + sy * psc(w) + sx;
+    if (pooling_type == 0) // max pooling
+    {
+        afp mv = afp(-3.402823466e38); // start from -FLT_MAX so any loaded value replaces it
 
-        for (int y = 0; y < kernel_h; y++)
+        for (int ky = 0; ky < kernel_h; ky++)
         {
-            if (sy + y < pad_top)
+            int iy = iny0 + ky;
+            for (int kx = 0; kx < kernel_w; kx++)
             {
-                v_offset += psc(w);
-                continue;
-            }
-
-            if (sy + y >= psc(h) - pad_bottom - p.htailpad)
-                break;
+                int ix = inx0 + kx;
 
-            for (int x = 0; x < kernel_w; x++)
-            {
-                if (sx + x < pad_left)
-                {
+                if (ix < 0 || ix >= psc(w) || iy < 0 || iy >= psc(h)) // tap lies outside the input: skip it
                     continue;
-                }
-
-                if (sx + x >= psc(w) - pad_right - p.wtailpad)
-                    break;
 
-                res += buffer_ld1(bottom_blob_data, v_offset + x);
-                area += 1;
+                int si = oz * psc(cstep) + iy * psc(w) + ix;
+                afp v = buffer_ld1(bottom_blob_data, si);
+                mv = max(mv, v);
             }
-
-            v_offset += psc(w);
         }
 
-        res /= afp(area);
+        int gi = oz * psc(outcstep) + oy * psc(outw) + ox;
+        buffer_st1(top_blob_data, gi, mv);
     }
-    if (pooling_type == 1 && avgpool_count_include_pad == 1)
+    else // any non-zero pooling_type: average pooling
     {
-        res = afp(0.f);
+        afp sum = afp(0.f);
 
-        int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w;
-
-        for (int y = 0; y < kernel_h; y++)
+        if (avgpool_count_include_pad == 1) // divisor is the full kernel area; out-of-bounds taps add nothing
         {
-            for (int x = 0; x < kernel_w; x++)
+            for (int ky = 0; ky < kernel_h; ky++)
             {
-                res += buffer_ld1(bottom_blob_data, v_offset + x);
+                int iy = iny0 + ky;
+                for (int kx = 0; kx < kernel_w; kx++)
+                {
+                    int ix = inx0 + kx;
+
+                    if (ix < 0 || ix >= psc(w) || iy < 0 || iy >= psc(h))
+                        continue;
+
+                    int si = oz * psc(cstep) + iy * psc(w) + ix;
+                    sum += buffer_ld1(bottom_blob_data, si);
+                }
             }
 
-            v_offset += psc(w);
+            sum *= afp(1.f / float(kernel_w * kernel_h));
+            int gi = oz * psc(outcstep) + oy * psc(outw) + ox;
+            buffer_st1(top_blob_data, gi, sum);
         }
+        else // divisor is the number of in-bounds taps only
+        {
+            int vx0 = max(0, -inx0); // clamp the kernel range [vx0, vx1) x [vy0, vy1) to the input
+            int vy0 = max(0, -iny0);
+            int vx1 = min(kernel_w, psc(w) - inx0);
+            int vy1 = min(kernel_h, psc(h) - iny0);
 
-        res /= afp(kernel_w * kernel_h);
-    }
+            int area = (vx1 - vx0) * (vy1 - vy0);
+            if (area <= 0) // guards the division below; such a window writes 0
+            {
+                int gi = oz * psc(outcstep) + oy * psc(outw) + ox;
+                buffer_st1(top_blob_data, gi, afp(0.f));
+                return;
+            }
 
-    const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
+            for (int ky = vy0; ky < vy1; ky++)
+            {
+                int iy = iny0 + ky;
+                for (int kx = vx0; kx < vx1; kx++)
+                {
+                    int ix = inx0 + kx;
+                    int si = oz * psc(cstep) + iy * psc(w) + ix;
+                    sum += buffer_ld1(bottom_blob_data, si);
+                }
+            }
 
-    buffer_st1(top_blob_data, gi, res);
+            sum *= afp(1.f / float(area));
+            int gi = oz * psc(outcstep) + oy * psc(outw) + ox;
+            buffer_st1(top_blob_data, gi, sum);
+        }
+    }
 }
diff --git a/src/layer/vulkan/shader/pooling_adaptive.comp b/src/layer/vulkan/shader/pooling_adaptive.comp
index 003a0cb8c903..4cca86add74a 100644
--- a/src/layer/vulkan/shader/pooling_adaptive.comp
+++ b/src/layer/vulkan/shader/pooling_adaptive.comp
@@ -1,24 +1,13 @@
-// Copyright 2018 Tencent
+// Copyright 2026 Futz12 <pchar.cn>
 // SPDX-License-Identifier: BSD-3-Clause
 
 #version 450
 
-#define FLT_MAX 3.402823466e+38
-
 layout(constant_id = 0) const int pooling_type = 0;
-
-#define shape_constant_id_offset 1
-layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout(constant_id = 1) const int out_w_param = 0;
+layout(constant_id = 2) const int out_h_param = 0;
+layout(constant_id = 3) const int reserved0 = 0;
+layout(constant_id = 4) const int reserved1 = 0;
 
 layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
 layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
@@ -28,73 +17,69 @@ layout(push_constant) uniform parameter
     int dims;
     int w;
     int h;
+    int d;
     int c;
     int cstep;
 
     int outdims;
     int outw;
     int outh;
+    int outd;
     int outc;
     int outcstep;
 } p;
 
 void main()
 {
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
+    int ox = int(gl_GlobalInvocationID.x); // output column handled by this invocation
+    int oy = int(gl_GlobalInvocationID.y); // output row
+    int oz = int(gl_GlobalInvocationID.z); // channel index (bounded by p.outc)
 
-    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
+    if (ox >= p.outw || oy >= p.outh || oz >= p.outc)
         return;
 
-    afp res;
+    int ih0 = p.h * oy / p.outh; // first input row of this output cell (rounded down)
+    int ih1 = (p.h * (oy + 1) + p.outh - 1) / p.outh; // one past the last input row (rounded up)
 
-    // calculate adaptive kernel size
-    const int sx = psc(w) * gx / psc(outw);
-    const int ex = (psc(w) * (gx + 1) + psc(outw) - 1) / psc(outw);
-    const int kernel_w = ex - sx;
-    const int sy = psc(h) * gy / psc(outh);
-    const int ey = (psc(h) * (gy + 1) + psc(outh) - 1) / psc(outh);
-    const int kernel_h = ey - sy;
+    int iw0 = p.w * ox / p.outw; // first input column of this output cell (rounded down)
+    int iw1 = (p.w * (ox + 1) + p.outw - 1) / p.outw; // one past the last input column (rounded up)
 
     if (pooling_type == 0)
     {
-        res = afp(-FLT_MAX);
+        int si0 = oz * p.cstep + ih0 * p.w + iw0;
+        afp mv = buffer_ld1(bottom_blob_data, si0); // seed the max with the window's first element
 
-        int v_offset = gz * psc(cstep) + sy * psc(w) + sx;
-
-        for (int y = 0; y < kernel_h; y++)
+        for (int iy = ih0; iy < ih1; iy++)
         {
-            for (int x = 0; x < kernel_w; x++)
+            int base = oz * p.cstep + iy * p.w;
+            for (int ix = iw0; ix < iw1; ix++)
             {
-                afp v = buffer_ld1(bottom_blob_data, v_offset + x);
-                res = max(res, v);
+                afp v = buffer_ld1(bottom_blob_data, base + ix);
+                mv = max(mv, v);
             }
-
-            v_offset += psc(w);
         }
+
+        int gi = oz * p.outcstep + oy * p.outw + ox;
+        buffer_st1(top_blob_data, gi, mv);
     }
-    if (pooling_type == 1)
+    else // any non-zero pooling_type: average over the adaptive window
     {
-        float res_fp32 = 0.f; // force accumulation in fp32
-
-        int v_offset = gz * psc(cstep) + sy * psc(w) + sx;
+        float sum = 0.f; // force accumulation in fp32, as before this change, even when afp is fp16
+        int hk = ih1 - ih0;
+        int wk = iw1 - iw0;
+        int area = hk * wk;
 
-        for (int y = 0; y < kernel_h; y++)
+        for (int iy = ih0; iy < ih1; iy++)
         {
-            for (int x = 0; x < kernel_w; x++)
+            int base = oz * p.cstep + iy * p.w;
+            for (int ix = iw0; ix < iw1; ix++)
             {
-                res_fp32 += buffer_ld1(bottom_blob_data, v_offset + x);
+                sum += float(buffer_ld1(bottom_blob_data, base + ix));
             }
-
-            v_offset += psc(w);
         }
 
-        res_fp32 /= float(kernel_h * kernel_w);
-        res = afp(res_fp32); // cast to fp16 if possible
+        sum /= float(area); // divide in fp32 so the mean is rounded only once
+        int gi = oz * p.outcstep + oy * p.outw + ox;
+        buffer_st1(top_blob_data, gi, afp(sum)); // narrow to the arithmetic type at the store
     }
-
-    const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
-
-    buffer_st1(top_blob_data, gi, res);
 }
diff --git a/src/layer/vulkan/shader/pooling_adaptive_pack4.comp b/src/layer/vulkan/shader/pooling_adaptive_pack4.comp
deleted file mode 100644
index f0d5c66a0567..000000000000
--- a/src/layer/vulkan/shader/pooling_adaptive_pack4.comp
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright 2019 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define FLT_MAX 3.402823466e+38
-
-layout(constant_id = 0) const int pooling_type = 0;
-
-#define shape_constant_id_offset 1
-layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int dims;
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outdims;
-    int outw;
-    int outh;
-    int outc;
-    int outcstep;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
-        return;
-
-    afpvec4 res;
-
-    // calculate adaptive kernel size
-    const int sx = psc(w) * gx / psc(outw);
-    const int ex = (psc(w) * (gx + 1) + psc(outw) - 1) / psc(outw);
-    const int kernel_w = ex - sx;
-    const int sy = psc(h) * gy / psc(outh);
-    const int ey = (psc(h) * (gy + 1) + psc(outh) - 1) / psc(outh);
-    const int kernel_h = ey - sy;
-
-    if (pooling_type == 0)
-    {
-        res = afpvec4(-FLT_MAX);
-
-        int v_offset = gz * psc(cstep) + sy * psc(w) + sx;
-
-        for (int y = 0; y < kernel_h; y++)
-        {
-            for (int x = 0; x < kernel_w; x++)
-            {
-                afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x);
-                res = max(res, v);
-            }
-
-            v_offset += psc(w);
-        }
-    }
-    else if (pooling_type == 1)
-    {
-        vec4 res_fp32 = vec4(0.f); // force accumulation in fp32
-
-        int v_offset = gz * psc(cstep) + sy * psc(w) + sx;
-
-        for (int y = 0; y < kernel_h; y++)
-        {
-            for (int x = 0; x < kernel_w; x++)
-            {
-                res_fp32 += buffer_ld4(bottom_blob_data, v_offset + x);
-            }
-
-            v_offset += psc(w);
-        }
-
-        res_fp32 /= float(kernel_h * kernel_w);
-        res = afpvec4(res_fp32); // cast to fp16 if possible
-    }
-
-    const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
-
-    buffer_st4(top_blob_data, gi, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_global.comp b/src/layer/vulkan/shader/pooling_global.comp
new file mode 100644
index 000000000000..557221cda933
--- /dev/null
+++ b/src/layer/vulkan/shader/pooling_global.comp
@@ -0,0 +1,78 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+// Global pooling in a single pass: each workgroup reduces one channel plane
+// (p.w * p.h values starting at cid * p.cstep) down to one output value.
+// pooling_type == 0 selects max; every other value selects average.
+layout(constant_id = 0) const int pooling_type = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
+} p;
+
+// Partials are held in fp32 so a sum over a large plane neither overflows nor
+// loses precision when afp is a 16-bit type.
+shared float sdata[256];
+
+void main()
+{
+    int cid = int(gl_WorkGroupID.x);
+    int tid = int(gl_LocalInvocationID.x);
+
+    // cid is identical for every invocation of a workgroup, so this exit is
+    // uniform and nobody is left waiting at the barriers below.
+    if (cid >= p.c)
+        return;
+
+    int size = p.w * p.h;
+    int base = cid * p.cstep;
+
+    float acc = (pooling_type == 0) ? -3.402823466e38 : 0.f;
+
+    // NOTE(review): the stride and sdata size assume a local size of 256 in x,
+    // which is configured outside this shader - confirm against create_pipeline.
+    for (int i = tid; i < size; i += 256)
+    {
+        // iy * p.w + ix with iy = i / p.w and ix = i - iy * p.w is just i
+        float v = float(buffer_ld1(bottom_blob_data, base + i));
+
+        if (pooling_type == 0)
+            acc = max(acc, v);
+        else
+            acc += v;
+    }
+
+    sdata[tid] = acc;
+    barrier();
+
+    // tree reduction: halve the number of active invocations each step
+    for (int offset = 128; offset > 0; offset >>= 1)
+    {
+        if (tid < offset)
+        {
+            float a = sdata[tid];
+            float b = sdata[tid + offset];
+            sdata[tid] = (pooling_type == 0) ? max(a, b) : (a + b);
+        }
+        barrier();
+    }
+
+    if (tid == 0)
+    {
+        float outv = sdata[0];
+        if (pooling_type != 0)
+            outv *= 1.f / float(size);
+
+        // narrow to the arithmetic type only once, at the very end
+        buffer_st1(top_blob_data, cid, afp(outv));
+    }
+}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max.comp b/src/layer/vulkan/shader/pooling_global_reduce_max.comp
deleted file mode 100644
index 352bd9b44410..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_max.comp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define FLT_MAX 3.402823466e+38
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 3) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 4) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int c;
-    int cstep;
-
-    int outw;
-    int outcstep;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int size_1 = psc(w) - 1;
-
-    const int v_offset = gz * psc(cstep);
-
-    afp res = afp(-FLT_MAX);
-
-    for (int ii = 0; ii < 8; ii++)
-    {
-        int i = min(gx + ii * psc(outw), size_1);
-
-        afp v = buffer_ld1(bottom_blob_data, v_offset + i);
-        res = max(res, v);
-    }
-
-    buffer_st1(top_blob_data, gz * psc(outcstep) + gx, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_first.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_first.comp
deleted file mode 100644
index 1b560eed567e..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_max_first.comp
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define FLT_MAX 3.402823466e+38
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 5) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outcstep;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int size_1 = psc(w) * psc(h) - 1;
-
-    const int v_offset = gz * psc(cstep);
-
-    afp res = afp(-FLT_MAX);
-
-    for (int ii = 0; ii < 8; ii++)
-    {
-        int i = min(gx + ii * psc(outw), size_1);
-
-        afp v = buffer_ld1(bottom_blob_data, v_offset + i);
-        res = max(res, v);
-    }
-
-    buffer_st1(top_blob_data, gz * psc(outcstep) + gx, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_first_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_first_pack4.comp
deleted file mode 100644
index 882c9269a221..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_max_first_pack4.comp
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define FLT_MAX 3.402823466e+38
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 5) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outcstep;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int size_1 = psc(w) * psc(h) - 1;
-
-    const int v_offset = gz * psc(cstep);
-
-    afpvec4 res = afpvec4(-FLT_MAX);
-
-    for (int ii = 0; ii < 8; ii++)
-    {
-        int i = min(gx + ii * psc(outw), size_1);
-
-        afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i);
-        res = max(res, v);
-    }
-
-    buffer_st4(top_blob_data, gz * psc(outcstep) + gx, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_last.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_last.comp
deleted file mode 100644
index 78196494e35e..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_max_last.comp
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define FLT_MAX 3.402823466e+38
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int c;
-    int cstep;
-    int size;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= 1 || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int v_offset = gz * psc(cstep);
-
-    afp res = afp(-FLT_MAX);
-
-    for (int i = 0; i < psc(w); i++)
-    {
-        afp v = buffer_ld1(bottom_blob_data, v_offset + i);
-        res = max(res, v);
-    }
-
-    buffer_st1(top_blob_data, gz, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_last_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_last_pack4.comp
deleted file mode 100644
index c1c592bd8f22..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_max_last_pack4.comp
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define FLT_MAX 3.402823466e+38
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int c;
-    int cstep;
-    int size;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= 1 || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int v_offset = gz * psc(cstep);
-
-    afpvec4 res = afpvec4(-FLT_MAX);
-
-    for (int i = 0; i < psc(w); i++)
-    {
-        afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i);
-        res = max(res, v);
-    }
-
-    buffer_st4(top_blob_data, gz, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_pack4.comp
deleted file mode 100644
index 421c99025386..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_max_pack4.comp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define FLT_MAX 3.402823466e+38
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 3) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 4) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int c;
-    int cstep;
-
-    int outw;
-    int outcstep;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int size_1 = psc(w) - 1;
-
-    const int v_offset = gz * psc(cstep);
-
-    afpvec4 res = afpvec4(-FLT_MAX);
-
-    for (int ii = 0; ii < 8; ii++)
-    {
-        int i = min(gx + ii * psc(outw), size_1);
-
-        afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i);
-        res = max(res, v);
-    }
-
-    buffer_st4(top_blob_data, gz * psc(outcstep) + gx, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum.comp
deleted file mode 100644
index a1f77a519475..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_sum.comp
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 3) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 4) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int c;
-    int cstep;
-
-    int outw;
-    int outcstep;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int end = min(8, (psc(w) - gx - 1) / psc(outw) + 1);
-
-    const int v_offset = gz * psc(cstep);
-
-    float sum = 0.f;
-
-    for (int ii = 0; ii < end; ii++)
-    {
-        int i = gx + ii * psc(outw);
-
-        float v = bottom_blob_data[v_offset + i];
-        sum += v;
-    }
-
-    top_blob_data[gz * psc(outcstep) + gx] = sum;
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_first.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum_first.comp
deleted file mode 100644
index 272f464d86f1..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_sum_first.comp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 5) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outcstep;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int end = min(8, (psc(w) * psc(h) - gx - 1) / psc(outw) + 1);
-
-    const int v_offset = gz * psc(cstep);
-
-    float sum = 0.f;
-
-    for (int ii = 0; ii < end; ii++)
-    {
-        int i = gx + ii * psc(outw);
-
-        afp v = buffer_ld1(bottom_blob_data, v_offset + i);
-        sum += float(v);
-    }
-
-    top_blob_data[gz * psc(outcstep) + gx] = sum;
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_first_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum_first_pack4.comp
deleted file mode 100644
index 4b75f4b95da5..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_sum_first_pack4.comp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 5) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outcstep;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int end = min(8, (psc(w) * psc(h) - gx - 1) / psc(outw) + 1);
-
-    const int v_offset = gz * psc(cstep);
-
-    vec4 sum = vec4(0.f);
-
-    for (int ii = 0; ii < end; ii++)
-    {
-        int i = gx + ii * psc(outw);
-
-        afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i);
-        sum += vec4(v);
-    }
-
-    top_blob_data[gz * psc(outcstep) + gx] = sum;
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_last.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum_last.comp
deleted file mode 100644
index be8100ede20f..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_sum_last.comp
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int c;
-    int cstep;
-    int size;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= 1 || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int v_offset = gz * psc(cstep);
-
-    float sum = 0.f;
-
-    for (int i = 0; i < psc(w); i++)
-    {
-        float v = bottom_blob_data[v_offset + i];
-        sum += v;
-    }
-
-    afp res = afp(sum / p.size);
-
-    buffer_st1(top_blob_data, gz, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_last_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum_last_pack4.comp
deleted file mode 100644
index 8b5dd8b92ccf..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_sum_last_pack4.comp
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int c;
-    int cstep;
-    int size;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= 1 || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int v_offset = gz * psc(cstep);
-
-    vec4 sum = vec4(0.f);
-
-    for (int i = 0; i < psc(w); i++)
-    {
-        vec4 v = bottom_blob_data[v_offset + i];
-        sum += v;
-    }
-
-    afpvec4 res = afpvec4(sum / p.size);
-
-    buffer_st4(top_blob_data, gz, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum_pack4.comp
deleted file mode 100644
index c5f3184350c4..000000000000
--- a/src/layer/vulkan/shader/pooling_global_reduce_sum_pack4.comp
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2023 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define shape_constant_id_offset 0
-layout(constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 3) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 4) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int w;
-    int c;
-    int cstep;
-
-    int outw;
-    int outcstep;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= 1 || gz >= psc(c))
-        return;
-
-    const int end = min(8, (psc(w) - gx - 1) / psc(outw) + 1);
-
-    const int v_offset = gz * psc(cstep);
-
-    vec4 sum = vec4(0.f);
-
-    for (int ii = 0; ii < end; ii++)
-    {
-        int i = gx + ii * psc(outw);
-
-        vec4 v = bottom_blob_data[v_offset + i];
-        sum += v;
-    }
-
-    top_blob_data[gz * psc(outcstep) + gx] = sum;
-}
diff --git a/src/layer/vulkan/shader/pooling_global_stage1.comp b/src/layer/vulkan/shader/pooling_global_stage1.comp
new file mode 100644
index 000000000000..385981853cbc
--- /dev/null
+++ b/src/layer/vulkan/shader/pooling_global_stage1.comp
@@ -0,0 +1,84 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+// Global pooling, stage 1: every workgroup folds one chunk of up to 256 * 4
+// elements of one channel into a single partial value. pooling_type == 0
+// selects max; any other value selects a plain sum.
+// NOTE(review): the sdata size, the 256 strides and the reduction start offset
+// assume 256 invocations along x - confirm against the host-side local size.
+
+layout(constant_id = 0) const int pooling_type = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout(binding = 1) writeonly buffer partial_blob { sfp partial_blob_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
+    int partial_w;
+} p;
+
+// float rather than lfp: sums must not be accumulated in reduced precision
+shared float sdata[256];
+
+void main()
+{
+    int cid = int(gl_WorkGroupID.y);
+    int chunk = int(gl_WorkGroupID.x);
+    int tid = int(gl_LocalInvocationID.x);
+
+    // depends only on gl_WorkGroupID, so a whole workgroup leaves together and
+    // the barriers below are never reached by only part of it
+    if (cid >= p.c)
+        return;
+
+    int size = p.w * p.h;
+
+    int base = chunk * (256 * 4);
+    int idx = base + tid;
+
+    const int v_offset = cid * p.cstep;
+
+    float acc = (pooling_type == 0) ? -3.402823466e38 : 0.f;
+
+    for (int u = 0; u < 4; u++)
+    {
+        int k = idx + u * 256;
+        if (k < size)
+        {
+            // k already is the linear offset inside the channel plane
+            afp v = buffer_ld1(bottom_blob_data, v_offset + k);
+
+            if (pooling_type == 0)
+                acc = max(acc, float(v));
+            else
+                acc += float(v);
+        }
+    }
+
+    sdata[tid] = acc;
+    barrier();
+
+    // pairwise reduction of the per-invocation accumulators into sdata[0]
+    for (int offset = 128; offset > 0; offset >>= 1)
+    {
+        if (tid < offset)
+        {
+            float a = sdata[tid];
+            float b = sdata[tid + offset];
+            sdata[tid] = (pooling_type == 0) ? max(a, b) : (a + b);
+        }
+        barrier();
+    }
+
+    if (tid == 0)
+    {
+        int oi = cid * p.partial_w + chunk;
+        buffer_st1(partial_blob_data, oi, afp(sdata[0]));
+    }
+}
diff --git a/src/layer/vulkan/shader/pooling_global_stage2.comp b/src/layer/vulkan/shader/pooling_global_stage2.comp
new file mode 100644
index 000000000000..596c7e4091c1
--- /dev/null
+++ b/src/layer/vulkan/shader/pooling_global_stage2.comp
@@ -0,0 +1,75 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+// Global pooling, stage 2: one workgroup per channel folds that channel's
+// partial_w partial values into the final output. pooling_type == 0 selects
+// max; any other value sums the partials and scales by 1 / in_size.
+// NOTE(review): the sdata size, the 256 stride and the reduction start offset
+// assume 256 invocations along x - confirm against the host-side local size.
+
+layout(constant_id = 0) const int pooling_type = 0;
+
+layout(binding = 0) readonly buffer partial_blob { sfp partial_blob_data[]; };
+layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int partial_w;
+    int c;
+    int in_size;
+} p;
+
+// float rather than lfp: the running sum can be far larger than the average
+shared float sdata[256];
+
+void main()
+{
+    int cid = int(gl_WorkGroupID.x);
+    int tid = int(gl_LocalInvocationID.x);
+
+    // depends only on gl_WorkGroupID, so a whole workgroup leaves together and
+    // the barriers below are never reached by only part of it
+    if (cid >= p.c)
+        return;
+
+    const int v_offset = cid * p.partial_w;
+
+    float acc = (pooling_type == 0) ? -3.402823466e38 : 0.f;
+
+    for (int i = tid; i < p.partial_w; i += 256)
+    {
+        afp v = buffer_ld1(partial_blob_data, v_offset + i);
+
+        if (pooling_type == 0)
+            acc = max(acc, float(v));
+        else
+            acc += float(v);
+    }
+
+    sdata[tid] = acc;
+    barrier();
+
+    // pairwise reduction of the per-invocation accumulators into sdata[0]
+    for (int offset = 128; offset > 0; offset >>= 1)
+    {
+        if (tid < offset)
+        {
+            float a = sdata[tid];
+            float b = sdata[tid + offset];
+            sdata[tid] = (pooling_type == 0) ? max(a, b) : (a + b);
+        }
+        barrier();
+    }
+
+    if (tid == 0)
+    {
+        float outv = sdata[0];
+        if (pooling_type != 0)
+            outv *= 1.f / float(p.in_size);
+
+        // narrow to the storage type only after averaging
+        buffer_st1(top_blob_data, cid, afp(outv));
+    }
+}
diff --git a/src/layer/vulkan/shader/pooling_pack4.comp b/src/layer/vulkan/shader/pooling_pack4.comp
deleted file mode 100644
index a14377946a0b..000000000000
--- a/src/layer/vulkan/shader/pooling_pack4.comp
+++ /dev/null
@@ -1,145 +0,0 @@
-// Copyright 2019 Tencent
-// SPDX-License-Identifier: BSD-3-Clause
-
-#version 450
-
-#define FLT_MAX 3.402823466e+38
-
-layout(constant_id = 0) const int pooling_type = 0;
-layout(constant_id = 1) const int kernel_w = 1;
-layout(constant_id = 2) const int kernel_h = 1;
-layout(constant_id = 3) const int stride_w = 1;
-layout(constant_id = 4) const int stride_h = 1;
-layout(constant_id = 5) const int pad_left = 0;
-layout(constant_id = 6) const int pad_right = 0;
-layout(constant_id = 7) const int pad_top = 0;
-layout(constant_id = 8) const int pad_bottom = 0;
-layout(constant_id = 9) const int global_pooling = 0;
-layout(constant_id = 10) const int pad_mode = 0;
-layout(constant_id = 11) const int avgpool_count_include_pad = 0;
-
-#define shape_constant_id_offset 12
-layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout(constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout(constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout(constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
-
-layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
-layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-
-layout(push_constant) uniform parameter
-{
-    int dims;
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outdims;
-    int outw;
-    int outh;
-    int outc;
-    int outcstep;
-
-    int wtailpad;
-    int htailpad;
-} p;
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x);
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
-        return;
-
-    afpvec4 res;
-
-    if (pooling_type == 0)
-    {
-        res = afpvec4(-FLT_MAX);
-
-        int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w;
-
-        for (int y = 0; y < kernel_h; y++)
-        {
-            for (int x = 0; x < kernel_w; x++)
-            {
-                afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x);
-                res = max(res, v);
-            }
-
-            v_offset += psc(w);
-        }
-    }
-    else if (pooling_type == 1 && avgpool_count_include_pad == 0)
-    {
-        res = afpvec4(0.f);
-        int area = 0;
-
-        int sx = gx * stride_w;
-        int sy = gy * stride_h;
-
-        int v_offset = gz * psc(cstep) + sy * psc(w) + sx;
-
-        for (int y = 0; y < kernel_h; y++)
-        {
-            if (sy + y < pad_top)
-            {
-                v_offset += psc(w);
-                continue;
-            }
-
-            if (sy + y >= psc(h) - pad_bottom - p.htailpad)
-                break;
-
-            for (int x = 0; x < kernel_w; x++)
-            {
-                if (sx + x < pad_left)
-                {
-                    continue;
-                }
-
-                if (sx + x >= psc(w) - pad_right - p.wtailpad)
-                    break;
-
-                res += buffer_ld4(bottom_blob_data, v_offset + x);
-                area += 1;
-            }
-
-            v_offset += psc(w);
-        }
-
-        res /= afp(area);
-    }
-    else if (pooling_type == 1 && avgpool_count_include_pad == 1)
-    {
-        res = afpvec4(0.f);
-
-        int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w;
-
-        for (int y = 0; y < kernel_h; y++)
-        {
-            for (int x = 0; x < kernel_w; x++)
-            {
-                res += buffer_ld4(bottom_blob_data, v_offset + x);
-            }
-
-            v_offset += psc(w);
-        }
-
-        res /= afp(kernel_w * kernel_h);
-    }
-
-    const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
-
-    buffer_st4(top_blob_data, gi, res);
-}
diff --git a/src/layer/vulkan/shader/pooling_tile.comp b/src/layer/vulkan/shader/pooling_tile.comp
new file mode 100644
index 000000000000..0fa989926e27
--- /dev/null
+++ b/src/layer/vulkan/shader/pooling_tile.comp
@@ -0,0 +1,224 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+// 2D pooling through a shared-memory input tile: a workgroup first stages the
+// input window covering one tile_out_w x tile_out_h patch of outputs, then
+// each invocation reduces its own kernel window out of that tile.
+// pooling_type == 0 selects max; any other value selects average.
+// NOTE(review): the cooperative load assumes an 8 x 8 local size (lsize) -
+// confirm against the host-side pipeline setup.
+
+layout(constant_id = 0) const int pooling_type = 0;
+layout(constant_id = 1) const int kernel_w = 0;
+layout(constant_id = 2) const int kernel_h = 0;
+layout(constant_id = 3) const int stride_w = 1;
+layout(constant_id = 4) const int stride_h = 1;
+layout(constant_id = 5) const int pad_left = 0;
+layout(constant_id = 6) const int pad_right = 0;
+layout(constant_id = 7) const int pad_top = 0;
+layout(constant_id = 8) const int pad_bottom = 0;
+layout(constant_id = 9) const int pad_mode = 0;
+layout(constant_id = 10) const int avgpool_count_include_pad = 0;
+
+#define shape_constant_id_offset 11
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int d = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outd = 0;
+layout(constant_id = shape_constant_id_offset + 10) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int d;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outd;
+    int outc;
+    int outcstep;
+} p;
+
+const int tile_out_w = 8;
+const int tile_out_h = 8;
+// NOTE(review): tile[][] is indexed up to (tile_out - 1) * stride + kernel - 1
+// per axis; presumably the host only picks this shader when that stays below
+// tile_in_max - confirm against the pipeline-selection code.
+const int tile_in_max = 36;
+
+shared lfp tile[tile_in_max][tile_in_max];
+
+void main()
+{
+    int lid_x = int(gl_LocalInvocationID.x);
+    int lid_y = int(gl_LocalInvocationID.y);
+
+    int gx0 = int(gl_WorkGroupID.x) * tile_out_w;
+    int gy0 = int(gl_WorkGroupID.y) * tile_out_h;
+    int gz = int(gl_GlobalInvocationID.z);
+
+    // NOTE(review): this return precedes barrier(); it is uniform within a
+    // workgroup only if the local size along z is 1 - confirm.
+    if (gz >= psc(outc))
+        return;
+
+    // Only the left/top padding is needed, to locate where each window starts.
+    // Right/bottom padding never has to be computed: every tile load below is
+    // bounds-checked against w/h and falls back to padv.
+    int pl;
+    int pt;
+
+    if (pad_mode == 0 || pad_mode == 1)
+    {
+        pl = pad_left;
+        pt = pad_top;
+    }
+    else
+    {
+        int wpad = kernel_w + (psc(w) - 1) / stride_w * stride_w - psc(w);
+        int hpad = kernel_h + (psc(h) - 1) / stride_h * stride_h - psc(h);
+        if (wpad < 0) wpad = 0;
+        if (hpad < 0) hpad = 0;
+
+        if (pad_mode == 2)
+        {
+            pl = wpad / 2;
+            pt = hpad / 2;
+        }
+        else
+        {
+            pl = wpad - wpad / 2;
+            pt = hpad - hpad / 2;
+        }
+    }
+
+    int tile_in_w = (tile_out_w - 1) * stride_w + kernel_w;
+    int tile_in_h = (tile_out_h - 1) * stride_h + kernel_h;
+
+    int in_x0 = gx0 * stride_w - pl;
+    int in_y0 = gy0 * stride_h - pt;
+
+    lfp padv = (pooling_type == 0) ? sfp2lfp(afp(-3.402823466e38)) : sfp2lfp(afp(0.f));
+
+    int l = lid_y * tile_out_w + lid_x;
+    int lsize = tile_out_w * tile_out_h;
+    int tilesz = tile_in_w * tile_in_h;
+
+    // cooperative load: the invocations stride over the flattened tile
+    for (int i = l; i < tilesz; i += lsize)
+    {
+        int ty = i / tile_in_w;
+        int tx = i - ty * tile_in_w;
+
+        int ix = in_x0 + tx;
+        int iy = in_y0 + ty;
+
+        lfp v = padv;
+        if (ix >= 0 && ix < psc(w) && iy >= 0 && iy < psc(h))
+        {
+            int si = gz * psc(cstep) + iy * psc(w) + ix;
+            v = sfp2lfp(buffer_ld1(bottom_blob_data, si));
+        }
+
+        tile[ty][tx] = v;
+    }
+
+    barrier();
+
+    int ox = gx0 + lid_x;
+    int oy = gy0 + lid_y;
+
+    if (ox >= psc(outw) || oy >= psc(outh))
+        return;
+
+    int tx0 = lid_x * stride_w;
+    int ty0 = lid_y * stride_h;
+
+    afp outv;
+
+    if (pooling_type == 0)
+    {
+        afp mv = lfp2afp(tile[ty0][tx0]);
+
+        for (int ky = 0; ky < kernel_h; ky++)
+        {
+            for (int kx = 0; kx < kernel_w; kx++)
+            {
+                afp v = lfp2afp(tile[ty0 + ky][tx0 + kx]);
+                mv = max(mv, v);
+            }
+        }
+
+        outv = mv;
+    }
+    else
+    {
+        afp sum = afp(0.f);
+
+        if (avgpool_count_include_pad == 1)
+        {
+            for (int ky = 0; ky < kernel_h; ky++)
+            {
+                for (int kx = 0; kx < kernel_w; kx++)
+                {
+                    sum += lfp2afp(tile[ty0 + ky][tx0 + kx]);
+                }
+            }
+
+            sum *= afp(1.f / float(kernel_w * kernel_h));
+            outv = sum;
+        }
+        else
+        {
+            // clip the kernel window to the real input so that padding
+            // contributes neither to the sum nor to the divisor
+            int inx0 = ox * stride_w - pl;
+            int iny0 = oy * stride_h - pt;
+
+            int vx0 = max(0, -inx0);
+            int vy0 = max(0, -iny0);
+            int vx1 = min(kernel_w, psc(w) - inx0);
+            int vy1 = min(kernel_h, psc(h) - iny0);
+
+            int area = (vx1 - vx0) * (vy1 - vy0);
+            if (area <= 0)
+            {
+                outv = afp(0.f);
+            }
+            else
+            {
+                for (int ky = vy0; ky < vy1; ky++)
+                {
+                    for (int kx = vx0; kx < vx1; kx++)
+                    {
+                        sum += lfp2afp(tile[ty0 + ky][tx0 + kx]);
+                    }
+                }
+
+                sum *= afp(1.f / float(area));
+                outv = sum;
+            }
+        }
+    }
+
+    int gi = gz * psc(outcstep) + oy * psc(outw) + ox;
+    buffer_st1(top_blob_data, gi, outv);
+}