diff --git a/src/layer/vulkan/pooling_vulkan.cpp b/src/layer/vulkan/pooling_vulkan.cpp index 8e38c9a5b23b..69855240b68a 100644 --- a/src/layer/vulkan/pooling_vulkan.cpp +++ b/src/layer/vulkan/pooling_vulkan.cpp @@ -1,75 +1,61 @@ -// Copyright 2019 Tencent +// Copyright 2026 Futz12 // SPDX-License-Identifier: BSD-3-Clause #include "pooling_vulkan.h" #include "layer_shader_type.h" -#include "layer_type.h" - -#include namespace ncnn { -Pooling_vulkan::Pooling_vulkan() +static inline void calc_same_pad(int w, int h, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_mode, int& pl, int& pr, int& pt, int& pb) { - support_vulkan = true; - support_vulkan_packing = true; - - padding = 0; - pipeline_pooling = 0; - pipeline_pooling_pack4 = 0; + int wpad = kernel_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_h + (h - 1) / stride_h * stride_h - h; + if (wpad < 0) wpad = 0; + if (hpad < 0) hpad = 0; - pipeline_pooling_adaptive = 0; - pipeline_pooling_adaptive_pack4 = 0; - - pipeline_pooling_global_reduce_first = 0; - pipeline_pooling_global_reduce_first_pack4 = 0; - pipeline_pooling_global_reduce = 0; - pipeline_pooling_global_reduce_pack4 = 0; - pipeline_pooling_global_reduce_last = 0; - pipeline_pooling_global_reduce_last_pack4 = 0; + if (pad_mode == 2) + { + pl = wpad / 2; + pr = wpad - pl; + pt = hpad / 2; + pb = hpad - pt; + } + else + { + pl = wpad - wpad / 2; + pr = wpad / 2; + pt = hpad - hpad / 2; + pb = hpad / 2; + } } -int Pooling_vulkan::create_pipeline(const Option& _opt) +static inline void calc_output_and_pad(int w, int h, + int kernel_w, int kernel_h, + int stride_w, int stride_h, + int pad_left, int pad_right, int pad_top, int pad_bottom, + int pad_mode, int& outw, int& outh, + int& pl, int& pr, int& pt, int& pb) { - Option opt = _opt; - const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; - const Mat& out_shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; + pl = 0; + pr = 0; + pt = 0; + pb = 0; - // the shape after padding - Mat shape_bordered; - if (shape.dims != 0) + if (pad_mode == 0 || pad_mode == 1) { + pl = pad_left; + pr = pad_right; + pt = pad_top; + pb = pad_bottom; + if (pad_mode == 0) { - int wtail = (shape.w + pad_left + pad_right - kernel_w) % stride_w; - int htail = (shape.h + pad_top + pad_bottom - kernel_h) % stride_h; + int wtail = (w + pl + pr - kernel_w) % stride_w; + int htail = (h + pt + pb - kernel_h) % stride_h; - int wtailpad = 0; - int htailpad = 0; - if (wtail != 0) - wtailpad = stride_w - wtail; - if (htail != 0) - htailpad = stride_h - htail; - - shape_bordered = Mat(shape.w + pad_left + pad_right + wtailpad, shape.h + pad_top + pad_bottom + htailpad, shape.c, (void*)0); - } - else if (pad_mode == 1) - { - shape_bordered = Mat(shape.w + pad_left + pad_right, shape.h + pad_top + pad_bottom, shape.c, (void*)0); - } - else if (pad_mode == 2 || pad_mode == 3) - { - int wpad = kernel_w + (shape.w - 1) / stride_w * stride_w - shape.w; - int hpad = kernel_h + (shape.h - 1) / stride_h * stride_h - shape.h; - if (wpad > 0 || hpad > 0) - { - shape_bordered = Mat(shape.w + wpad, shape.h + hpad, shape.c, (void*)0); - } - } - else - { - shape_bordered = shape; + if (wtail != 0) pr += stride_w - wtail; + if (htail != 0) pb += stride_h - htail; } } @@ -85,395 +71,274 @@ int Pooling_vulkan::create_pipeline(const Option& _opt) } else { - elemsize = elempack * 4u; - out_elemsize = out_elempack * 4u; + calc_same_pad(w, h, kernel_w, kernel_h, stride_w, stride_h, pad_mode, pl, pr, pt, pb); } - Mat shape_bordered_packed; - if (shape_bordered.dims == 1) shape_bordered_packed = Mat(shape_bordered.w / elempack, (void*)0, elemsize, elempack); - if (shape_bordered.dims == 2) shape_bordered_packed = Mat(shape_bordered.w, shape_bordered.h / elempack, (void*)0, elemsize, elempack); - if (shape_bordered.dims == 3) shape_bordered_packed = Mat(shape_bordered.w, shape_bordered.h, shape_bordered.c / 
elempack, (void*)0, elemsize, elempack); + outw = (w + pl + pr - kernel_w) / stride_w + 1; + outh = (h + pt + pb - kernel_h) / stride_h + 1; +} - Mat out_shape_packed; - if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); - if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); - if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); +Pooling_vulkan::Pooling_vulkan() +{ + support_vulkan = true; - { - padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding); - padding->vkdev = vkdev; - - padding->bottom_shapes.resize(1); - padding->bottom_shapes[0] = shape; - padding->top_shapes.resize(1); - padding->top_shapes[0] = shape_bordered; - - ncnn::ParamDict pd; - pd.set(0, pad_top); - pd.set(1, pad_bottom); - pd.set(2, pad_left); - pd.set(3, pad_right); - pd.set(4, 0); - - if (pooling_type == PoolMethod_MAX) - { - // FLT_MAX becomes NaN during fp16 conversion in shader with swiftshader - // use a proper fp16-representable max as workaround --- nihui - if (opt.use_fp16_packed || opt.use_fp16_storage || opt.use_fp16_arithmetic) - pd.set(5, -65000.f); - else - pd.set(5, -FLT_MAX); - } - else if (pooling_type == PoolMethod_AVE) - { - pd.set(5, 0.f); - } + support_vulkan_packing = false; + support_vulkan_any_packing = false; - padding->load_param(pd); + pipeline_pooling = 0; + pipeline_pooling_tile = 0; - padding->create_pipeline(opt); - } + pipeline_pooling_global = 0; + pipeline_pooling_global_stage1 = 0; + pipeline_pooling_global_stage2 = 0; + + pipeline_pooling_adaptive = 0; +} + +int Pooling_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + + const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Mat& out_shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; + + int elempack = 1; + int out_elempack = 1; + + size_t elemsize = (opt.use_fp16_storage || opt.use_fp16_packed) ? 2u : 4u; + size_t out_elemsize = elemsize; + + Mat shape_packed; + Mat out_shape_packed; + + if (shape.dims == 1) shape_packed = Mat(shape.w, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c, (void*)0, elemsize, elempack); + + if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c, (void*)0, out_elemsize, out_elempack); if (global_pooling) { - // reduce first { - std::vector specializations(6); - specializations[0].i = shape_bordered_packed.w; - specializations[1].i = shape_bordered_packed.h; - specializations[2].i = shape_bordered_packed.c; - specializations[3].i = shape_bordered_packed.cstep; - specializations[4].i = 0; - specializations[5].i = 0; - - Mat local_size_xyz(64, 1, 1, (void*)0); - - // pack1 - if (shape.dims == 0 || elempack == 1) - { - int layer_shader_type = pooling_type == 0 ? LayerShaderType::pooling_global_reduce_max_first : LayerShaderType::pooling_global_reduce_sum_first; - - pipeline_pooling_global_reduce_first = new Pipeline(vkdev); - pipeline_pooling_global_reduce_first->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling_global_reduce_first->create(layer_shader_type, opt, specializations); - } - - // pack4 - if (shape.dims == 0 || elempack == 4) - { - int layer_shader_type = pooling_type == 0 ? 
LayerShaderType::pooling_global_reduce_max_first_pack4 : LayerShaderType::pooling_global_reduce_sum_first_pack4; - - pipeline_pooling_global_reduce_first_pack4 = new Pipeline(vkdev); - pipeline_pooling_global_reduce_first_pack4->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling_global_reduce_first_pack4->create(layer_shader_type, opt, specializations); - } + std::vector specializations(1); + specializations[0].i = pooling_type; + + pipeline_pooling_global = new Pipeline(vkdev); + pipeline_pooling_global->set_local_size_xyz(256, 1, 1); + pipeline_pooling_global->create(LayerShaderType::pooling_global, opt, specializations); } - // reduce more { - std::vector specializations(5); - specializations[0].i = 0; - specializations[1].i = shape_bordered_packed.c; - specializations[2].i = 0; - specializations[3].i = 0; - specializations[4].i = 0; - - Mat local_size_xyz(64, 1, 1, (void*)0); - - // pack1 - if (shape.dims == 0 || elempack == 1) - { - int layer_shader_type = pooling_type == 0 ? LayerShaderType::pooling_global_reduce_max : LayerShaderType::pooling_global_reduce_sum; - - pipeline_pooling_global_reduce = new Pipeline(vkdev); - pipeline_pooling_global_reduce->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling_global_reduce->create(layer_shader_type, opt, specializations); - } - - // pack4 - if (shape.dims == 0 || elempack == 4) - { - int layer_shader_type = pooling_type == 0 ? 
LayerShaderType::pooling_global_reduce_max_pack4 : LayerShaderType::pooling_global_reduce_sum_pack4; - - pipeline_pooling_global_reduce_pack4 = new Pipeline(vkdev); - pipeline_pooling_global_reduce_pack4->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling_global_reduce_pack4->create(layer_shader_type, opt, specializations); - } + std::vector specializations(1); + specializations[0].i = pooling_type; + + pipeline_pooling_global_stage1 = new Pipeline(vkdev); + pipeline_pooling_global_stage1->set_local_size_xyz(256, 1, 1); + pipeline_pooling_global_stage1->create(LayerShaderType::pooling_global_stage1, opt, specializations); } - // reduce last { - std::vector specializations(3); - specializations[0].i = 0; - specializations[1].i = shape_bordered_packed.c; - specializations[2].i = 0; - - Mat local_size_xyz(1, 1, 64, (void*)0); - - // pack1 - if (shape.dims == 0 || elempack == 1) - { - int layer_shader_type = pooling_type == 0 ? LayerShaderType::pooling_global_reduce_max_last : LayerShaderType::pooling_global_reduce_sum_last; - - pipeline_pooling_global_reduce_last = new Pipeline(vkdev); - pipeline_pooling_global_reduce_last->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling_global_reduce_last->create(layer_shader_type, opt, specializations); - } - - // pack4 - if (shape.dims == 0 || elempack == 4) - { - int layer_shader_type = pooling_type == 0 ? 
LayerShaderType::pooling_global_reduce_max_last_pack4 : LayerShaderType::pooling_global_reduce_sum_last_pack4; - - pipeline_pooling_global_reduce_last_pack4 = new Pipeline(vkdev); - pipeline_pooling_global_reduce_last_pack4->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling_global_reduce_last_pack4->create(layer_shader_type, opt, specializations); - } + std::vector specializations(1); + specializations[0].i = pooling_type; + + pipeline_pooling_global_stage2 = new Pipeline(vkdev); + pipeline_pooling_global_stage2->set_local_size_xyz(256, 1, 1); + pipeline_pooling_global_stage2->create(LayerShaderType::pooling_global_stage2, opt, specializations); } + + return 0; } - else if (adaptive_pooling) + + if (adaptive_pooling) { - std::vector specializations(1 + 10); + std::vector specializations(5); specializations[0].i = pooling_type; - specializations[1 + 0].i = shape_bordered_packed.dims; - specializations[1 + 1].i = shape_bordered_packed.w; - specializations[1 + 2].i = shape_bordered_packed.h; - specializations[1 + 3].i = shape_bordered_packed.c; - specializations[1 + 4].i = shape_bordered_packed.cstep; - specializations[1 + 5].i = out_shape_packed.dims; - specializations[1 + 6].i = out_shape_packed.w; - specializations[1 + 7].i = out_shape_packed.h; - specializations[1 + 8].i = out_shape_packed.c; - specializations[1 + 9].i = out_shape_packed.cstep; - - Mat local_size_xyz; - if (out_shape_packed.dims != 0) - { - local_size_xyz.w = std::min(4, out_shape_packed.w); - local_size_xyz.h = std::min(4, out_shape_packed.h); - local_size_xyz.c = std::min(4, out_shape_packed.c); - } + specializations[1].i = out_w; + specializations[2].i = out_h; + specializations[3].i = 0; + specializations[4].i = 0; + + pipeline_pooling_adaptive = new Pipeline(vkdev); + pipeline_pooling_adaptive->set_local_size_xyz(8, 8, 1); + pipeline_pooling_adaptive->create(LayerShaderType::pooling_adaptive, opt, specializations); + return 0; + } - // pack1 - if (shape.dims == 0 || elempack == 
1) - { - pipeline_pooling_adaptive = new Pipeline(vkdev); - pipeline_pooling_adaptive->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling_adaptive->create(LayerShaderType::pooling_adaptive, opt, specializations); - } + bool use_tile = true; + { + const int tile_out_w = 8; + const int tile_out_h = 8; + const int tile_in_w = (tile_out_w - 1) * stride_w + kernel_w; + const int tile_in_h = (tile_out_h - 1) * stride_h + kernel_h; + + if (tile_in_w > 36 || tile_in_h > 36) use_tile = false; + if (kernel_w <= 0 || kernel_h <= 0) use_tile = false; + if (stride_w <= 0 || stride_h <= 0) use_tile = false; + } - // pack4 - if (shape.dims == 0 || elempack == 4) - { - pipeline_pooling_adaptive_pack4 = new Pipeline(vkdev); - pipeline_pooling_adaptive_pack4->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling_adaptive_pack4->create(LayerShaderType::pooling_adaptive_pack4, opt, specializations); - } + std::vector specializations(11 + 12); + specializations[0].i = pooling_type; + specializations[1].i = kernel_w; + specializations[2].i = kernel_h; + specializations[3].i = stride_w; + specializations[4].i = stride_h; + specializations[5].i = pad_left; + specializations[6].i = pad_right; + specializations[7].i = pad_top; + specializations[8].i = pad_bottom; + specializations[9].i = pad_mode; + specializations[10].i = avgpool_count_include_pad; + + specializations[11 + 0].i = shape_packed.dims; + specializations[11 + 1].i = shape_packed.w; + specializations[11 + 2].i = shape_packed.h; + specializations[11 + 3].i = shape_packed.d; + specializations[11 + 4].i = shape_packed.c; + specializations[11 + 5].i = shape_packed.cstep; + + specializations[11 + 6].i = out_shape_packed.dims; + specializations[11 + 7].i = out_shape_packed.w; + specializations[11 + 8].i = out_shape_packed.h; + specializations[11 + 9].i = out_shape_packed.d; + specializations[11 + 10].i = out_shape_packed.c; + specializations[11 + 11].i = out_shape_packed.cstep; + + if (use_tile) + { + 
pipeline_pooling_tile = new Pipeline(vkdev); + pipeline_pooling_tile->set_local_size_xyz(8, 8, 1); + pipeline_pooling_tile->create(LayerShaderType::pooling_tile, opt, specializations); } else { - std::vector specializations(12 + 10); - specializations[0].i = pooling_type; - specializations[1].i = kernel_w; - specializations[2].i = kernel_h; - specializations[3].i = stride_w; - specializations[4].i = stride_h; - specializations[5].i = pad_left; - specializations[6].i = pad_right; - specializations[7].i = pad_top; - specializations[8].i = pad_bottom; - specializations[9].i = global_pooling; - specializations[10].i = pad_mode; - specializations[11].i = avgpool_count_include_pad; - specializations[12 + 0].i = shape_bordered_packed.dims; - specializations[12 + 1].i = shape_bordered_packed.w; - specializations[12 + 2].i = shape_bordered_packed.h; - specializations[12 + 3].i = shape_bordered_packed.c; - specializations[12 + 4].i = shape_bordered_packed.cstep; - specializations[12 + 5].i = out_shape_packed.dims; - specializations[12 + 6].i = out_shape_packed.w; - specializations[12 + 7].i = out_shape_packed.h; - specializations[12 + 8].i = out_shape_packed.c; - specializations[12 + 9].i = out_shape_packed.cstep; - - Mat local_size_xyz; - if (out_shape_packed.dims != 0) - { - local_size_xyz.w = std::min(4, out_shape_packed.w); - local_size_xyz.h = std::min(4, out_shape_packed.h); - local_size_xyz.c = std::min(4, out_shape_packed.c); - } - - // pack1 - if (shape.dims == 0 || elempack == 1) - { - pipeline_pooling = new Pipeline(vkdev); - pipeline_pooling->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling->create(LayerShaderType::pooling, opt, specializations); - } - - // pack4 - if (shape.dims == 0 || elempack == 4) - { - pipeline_pooling_pack4 = new Pipeline(vkdev); - pipeline_pooling_pack4->set_optimal_local_size_xyz(local_size_xyz); - pipeline_pooling_pack4->create(LayerShaderType::pooling_pack4, opt, specializations); - } + pipeline_pooling = new 
Pipeline(vkdev); + pipeline_pooling->set_local_size_xyz(8, 8, 1); + pipeline_pooling->create(LayerShaderType::pooling, opt, specializations); } return 0; } -int Pooling_vulkan::destroy_pipeline(const Option& opt) +int Pooling_vulkan::destroy_pipeline(const Option& /*opt*/) { - if (padding) - { - padding->destroy_pipeline(opt); - delete padding; - padding = 0; - } - delete pipeline_pooling; pipeline_pooling = 0; - delete pipeline_pooling_pack4; - pipeline_pooling_pack4 = 0; - - delete pipeline_pooling_adaptive; - pipeline_pooling_adaptive = 0; - - delete pipeline_pooling_adaptive_pack4; - pipeline_pooling_adaptive_pack4 = 0; - - delete pipeline_pooling_global_reduce_first; - pipeline_pooling_global_reduce_first = 0; + delete pipeline_pooling_tile; + pipeline_pooling_tile = 0; - delete pipeline_pooling_global_reduce_first_pack4; - pipeline_pooling_global_reduce_first_pack4 = 0; + delete pipeline_pooling_global; + pipeline_pooling_global = 0; - delete pipeline_pooling_global_reduce; - pipeline_pooling_global_reduce = 0; + delete pipeline_pooling_global_stage1; + pipeline_pooling_global_stage1 = 0; - delete pipeline_pooling_global_reduce_pack4; - pipeline_pooling_global_reduce_pack4 = 0; + delete pipeline_pooling_global_stage2; + pipeline_pooling_global_stage2 = 0; - delete pipeline_pooling_global_reduce_last; - pipeline_pooling_global_reduce_last = 0; - - delete pipeline_pooling_global_reduce_last_pack4; - pipeline_pooling_global_reduce_last_pack4 = 0; + delete pipeline_pooling_adaptive; + pipeline_pooling_adaptive = 0; return 0; } -int Pooling_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const { - if (padding) + const int dims = bottom_blob.dims; + + if (dims == 1) { - padding->upload_model(cmd, opt); + top_blob = bottom_blob; + return 0; } - return 0; -} + if (dims != 2 && dims != 3) + return -100; -int Pooling_vulkan::forward(const VkMat& 
bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = (dims == 3) ? bottom_blob.c : 1; + const size_t elemsize = bottom_blob.elemsize; if (global_pooling) { - // reduce first - VkMat reduced_blob; - { - int reduced_size = (w * h + 7) / 8; - size_t reduced_elemsize = pooling_type == 0 ? elemsize : 4u * elempack; - reduced_blob.create(reduced_size, 1, channels, reduced_elemsize, elempack, opt.workspace_vkallocator); - if (reduced_blob.empty()) - return -100; + top_blob.create(channels, elemsize, 1, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + const int size = w * h; + const bool use_two_stage = (channels < 8 && size >= 4096); + + if (!use_two_stage) + { std::vector bindings(2); bindings[0] = bottom_blob; - bindings[1] = reduced_blob; + bindings[1] = top_blob; - std::vector constants(6); - constants[0].i = bottom_blob.w; - constants[1].i = bottom_blob.h; - constants[2].i = bottom_blob.c; + std::vector constants(4); + constants[0].i = w; + constants[1].i = h; + constants[2].i = channels; constants[3].i = bottom_blob.cstep; - constants[4].i = reduced_blob.w; - constants[5].i = reduced_blob.cstep; - - const Pipeline* pipeline = elempack == 4 ? pipeline_pooling_global_reduce_first_pack4 : pipeline_pooling_global_reduce_first; VkMat dispatcher; - dispatcher.w = reduced_blob.w; + dispatcher.w = channels * 256; dispatcher.h = 1; - dispatcher.c = bottom_blob.c; + dispatcher.c = 1; - cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + cmd.record_pipeline(pipeline_pooling_global, bindings, constants, dispatcher); + return 0; } - // reduce more - while (reduced_blob.w > 32) - { - int reduced_size = (reduced_blob.w + 7) / 8; - size_t reduced_elemsize = pooling_type == 0 ? 
elemsize : 4u * elempack; - VkMat reduced_blob2; - reduced_blob2.create(reduced_size, 1, channels, reduced_elemsize, elempack, opt.workspace_vkallocator); - if (reduced_blob2.empty()) - return -100; + const int wg = 256; + const int unroll = 4; + const int chunk = wg * unroll; + const int partial_w = (size + chunk - 1) / chunk; + VkMat partial; + partial.create(partial_w, channels, elemsize, 1, opt.workspace_vkallocator); + if (partial.empty()) + return -100; + + { std::vector bindings(2); - bindings[0] = reduced_blob; - bindings[1] = reduced_blob2; + bindings[0] = bottom_blob; + bindings[1] = partial; std::vector constants(5); - constants[0].i = reduced_blob.w; - constants[1].i = reduced_blob.c; - constants[2].i = reduced_blob.cstep; - constants[3].i = reduced_blob2.w; - constants[4].i = reduced_blob2.cstep; - - const Pipeline* pipeline = elempack == 4 ? pipeline_pooling_global_reduce_pack4 : pipeline_pooling_global_reduce; + constants[0].i = w; + constants[1].i = h; + constants[2].i = channels; + constants[3].i = bottom_blob.cstep; + constants[4].i = partial_w; VkMat dispatcher; - dispatcher.w = reduced_blob2.w; - dispatcher.h = 1; - dispatcher.c = reduced_blob2.c; - - cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + dispatcher.w = partial_w * 256; + dispatcher.h = channels; + dispatcher.c = 1; - reduced_blob = reduced_blob2; + cmd.record_pipeline(pipeline_pooling_global_stage1, bindings, constants, dispatcher); } - // reduce last { - top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator); - if (top_blob.empty()) - return -100; - std::vector bindings(2); - bindings[0] = reduced_blob; + bindings[0] = partial; bindings[1] = top_blob; - std::vector constants(4); - constants[0].i = reduced_blob.w; - constants[1].i = reduced_blob.c; - constants[2].i = reduced_blob.cstep; - constants[3].i = w * h; - - const Pipeline* pipeline = elempack == 4 ? 
pipeline_pooling_global_reduce_last_pack4 : pipeline_pooling_global_reduce_last; + std::vector constants(3); + constants[0].i = partial_w; + constants[1].i = channels; + constants[2].i = size; VkMat dispatcher; - dispatcher.w = 1; + dispatcher.w = channels * 256; dispatcher.h = 1; - dispatcher.c = top_blob.w; + dispatcher.c = 1; - cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + cmd.record_pipeline(pipeline_pooling_global_stage2, bindings, constants, dispatcher); } return 0; @@ -481,16 +346,16 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute if (adaptive_pooling) { - int _out_w = out_w == -233 ? w : out_w; - int _out_h = out_h == -233 ? h : out_h; + int outw = out_w == -233 ? w : out_w; + int outh = out_h == -233 ? h : out_h; - if (_out_w == w && _out_h == h) + if (outw == w && outh == h) { top_blob = bottom_blob; return 0; } - top_blob.create(_out_w, _out_h, channels, elemsize, elempack, opt.blob_vkallocator); + top_blob.create(outw, outh, channels, elemsize, 1, opt.blob_vkallocator); if (top_blob.empty()) return -100; @@ -498,155 +363,64 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute bindings[0] = bottom_blob; bindings[1] = top_blob; - std::vector constants(10); + std::vector constants(12); constants[0].i = bottom_blob.dims; constants[1].i = bottom_blob.w; constants[2].i = bottom_blob.h; - constants[3].i = bottom_blob.c; - constants[4].i = bottom_blob.cstep; - constants[5].i = top_blob.dims; - constants[6].i = top_blob.w; - constants[7].i = top_blob.h; - constants[8].i = top_blob.c; - constants[9].i = top_blob.cstep; - - const Pipeline* pipeline = elempack == 4 ? pipeline_pooling_adaptive_pack4 : pipeline_pooling_adaptive; - - cmd.record_pipeline(pipeline, bindings, constants, top_blob); - + constants[3].i = bottom_blob.d; + constants[4].i = (dims == 3) ? 
bottom_blob.c : 1; + constants[5].i = bottom_blob.cstep; + constants[6].i = top_blob.dims; + constants[7].i = top_blob.w; + constants[8].i = top_blob.h; + constants[9].i = top_blob.d; + constants[10].i = (dims == 3) ? top_blob.c : 1; + constants[11].i = top_blob.cstep; + + cmd.record_pipeline(pipeline_pooling_adaptive, bindings, constants, top_blob); return 0; } - VkMat bottom_blob_bordered = bottom_blob; - - int wtailpad = 0; - int htailpad = 0; - - if (pad_mode == 0) // full padding + if (kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0 && pad_mode == 1) { - int wtail = (w + pad_left + pad_right - kernel_w) % stride_w; - int htail = (h + pad_top + pad_bottom - kernel_h) % stride_h; - - if (wtail != 0) - wtailpad = stride_w - wtail; - if (htail != 0) - htailpad = stride_h - htail; - - Option opt_pad = opt; - opt_pad.blob_vkallocator = opt.workspace_vkallocator; - - VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); - int* padding_params = padding_param_blob.mapped(); - - padding_params[0] = pad_top; - padding_params[1] = pad_bottom + htailpad; - padding_params[2] = pad_left; - padding_params[3] = pad_right + wtailpad; - padding_params[4] = 0; - padding_params[5] = 0; - - std::vector padding_inputs(2); - padding_inputs[0] = bottom_blob; - padding_inputs[1] = padding_param_blob; - - std::vector padding_outputs(1); - padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); - bottom_blob_bordered = padding_outputs[0]; + top_blob = bottom_blob; + return 0; } - else if (pad_mode == 1) // valid padding - { - Option opt_pad = opt; - opt_pad.blob_vkallocator = opt.workspace_vkallocator; - padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); - } - else if (pad_mode == 2) // tensorflow padding=SAME or onnx padding=SAME_UPPER + int outw, outh; + int pl, pr, pt, pb; + calc_output_and_pad(w, h, kernel_w, kernel_h, stride_w, stride_h, pad_left, pad_right, 
pad_top, pad_bottom, pad_mode, outw, outh, pl, pr, pt, pb); + + if (dims == 2) { - int wpad = kernel_w + (w - 1) / stride_w * stride_w - w; - int hpad = kernel_h + (h - 1) / stride_h * stride_h - h; - if (wpad > 0 || hpad > 0) - { - Option opt_pad = opt; - opt_pad.blob_vkallocator = opt.workspace_vkallocator; - - VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); - int* padding_params = padding_param_blob.mapped(); - - padding_params[0] = hpad / 2; - padding_params[1] = hpad - hpad / 2; - padding_params[2] = wpad / 2; - padding_params[3] = wpad - wpad / 2; - padding_params[4] = 0; - padding_params[5] = 0; - - std::vector padding_inputs(2); - padding_inputs[0] = bottom_blob; - padding_inputs[1] = padding_param_blob; - - std::vector padding_outputs(1); - padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); - bottom_blob_bordered = padding_outputs[0]; - } + top_blob.create(outw, outh, elemsize, 1, opt.blob_vkallocator); } - else if (pad_mode == 3) // onnx padding=SAME_LOWER + else { - int wpad = kernel_w + (w - 1) / stride_w * stride_w - w; - int hpad = kernel_h + (h - 1) / stride_h * stride_h - h; - if (wpad > 0 || hpad > 0) - { - Option opt_pad = opt; - opt_pad.blob_vkallocator = opt.workspace_vkallocator; - - VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); - int* padding_params = padding_param_blob.mapped(); - - padding_params[0] = hpad - hpad / 2; - padding_params[1] = hpad / 2; - padding_params[2] = wpad - wpad / 2; - padding_params[3] = wpad / 2; - padding_params[4] = 0; - padding_params[5] = 0; - - std::vector padding_inputs(2); - padding_inputs[0] = bottom_blob; - padding_inputs[1] = padding_param_blob; - - std::vector padding_outputs(1); - padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); - bottom_blob_bordered = padding_outputs[0]; - } + top_blob.create(outw, outh, channels, elemsize, 1, opt.blob_vkallocator); } - - w = bottom_blob_bordered.w; - h = bottom_blob_bordered.h; - - int outw = (w - 
kernel_w) / stride_w + 1; - int outh = (h - kernel_h) / stride_h + 1; - - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; std::vector bindings(2); - bindings[0] = bottom_blob_bordered; + bindings[0] = bottom_blob; bindings[1] = top_blob; std::vector constants(12); - constants[0].i = bottom_blob_bordered.dims; - constants[1].i = bottom_blob_bordered.w; - constants[2].i = bottom_blob_bordered.h; - constants[3].i = bottom_blob_bordered.c; - constants[4].i = bottom_blob_bordered.cstep; - constants[5].i = top_blob.dims; - constants[6].i = top_blob.w; - constants[7].i = top_blob.h; - constants[8].i = top_blob.c; - constants[9].i = top_blob.cstep; - constants[10].i = wtailpad; - constants[11].i = htailpad; - - const Pipeline* pipeline = elempack == 4 ? pipeline_pooling_pack4 : pipeline_pooling; - + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.d; + constants[4].i = (dims == 3) ? bottom_blob.c : 1; + constants[5].i = bottom_blob.cstep; + constants[6].i = top_blob.dims; + constants[7].i = top_blob.w; + constants[8].i = top_blob.h; + constants[9].i = top_blob.d; + constants[10].i = (dims == 3) ? top_blob.c : 1; + constants[11].i = top_blob.cstep; + + const Pipeline* pipeline = pipeline_pooling_tile ? 
pipeline_pooling_tile : pipeline_pooling; cmd.record_pipeline(pipeline, bindings, constants, top_blob); return 0; diff --git a/src/layer/vulkan/pooling_vulkan.h b/src/layer/vulkan/pooling_vulkan.h index 273d9aa7f2ff..bb11414c7f9a 100644 --- a/src/layer/vulkan/pooling_vulkan.h +++ b/src/layer/vulkan/pooling_vulkan.h @@ -1,4 +1,4 @@ -// Copyright 2019 Tencent +// Copyright 2026 Futz12 // SPDX-License-Identifier: BSD-3-Clause #ifndef LAYER_POOLING_VULKAN_H @@ -16,26 +16,18 @@ class Pooling_vulkan : public Pooling virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); - virtual int upload_model(VkTransfer& cmd, const Option& opt); - using Pooling::forward; virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; public: - ncnn::Layer* padding; - Pipeline* pipeline_pooling; - Pipeline* pipeline_pooling_pack4; + Pipeline* pipeline_pooling_tile; + + Pipeline* pipeline_pooling_global; + Pipeline* pipeline_pooling_global_stage1; + Pipeline* pipeline_pooling_global_stage2; Pipeline* pipeline_pooling_adaptive; - Pipeline* pipeline_pooling_adaptive_pack4; - - Pipeline* pipeline_pooling_global_reduce_first; - Pipeline* pipeline_pooling_global_reduce_first_pack4; - Pipeline* pipeline_pooling_global_reduce; - Pipeline* pipeline_pooling_global_reduce_pack4; - Pipeline* pipeline_pooling_global_reduce_last; - Pipeline* pipeline_pooling_global_reduce_last_pack4; }; } // namespace ncnn diff --git a/src/layer/vulkan/shader/pooling.comp b/src/layer/vulkan/shader/pooling.comp index b52ac3c8b7dd..fd2e9dc1157f 100644 --- a/src/layer/vulkan/shader/pooling.comp +++ b/src/layer/vulkan/shader/pooling.comp @@ -1,35 +1,34 @@ -// Copyright 2018 Tencent +// Copyright 2026 Futz12 // SPDX-License-Identifier: BSD-3-Clause #version 450 -#define FLT_MAX 3.402823466e+38 - layout(constant_id = 0) const int pooling_type = 0; -layout(constant_id = 1) const int kernel_w = 1; -layout(constant_id = 2) const int kernel_h 
= 1; +layout(constant_id = 1) const int kernel_w = 0; +layout(constant_id = 2) const int kernel_h = 0; layout(constant_id = 3) const int stride_w = 1; layout(constant_id = 4) const int stride_h = 1; layout(constant_id = 5) const int pad_left = 0; layout(constant_id = 6) const int pad_right = 0; layout(constant_id = 7) const int pad_top = 0; layout(constant_id = 8) const int pad_bottom = 0; -layout(constant_id = 9) const int global_pooling = 0; -layout(constant_id = 10) const int pad_mode = 0; -layout(constant_id = 11) const int avgpool_count_include_pad = 0; +layout(constant_id = 9) const int pad_mode = 0; +layout(constant_id = 10) const int avgpool_count_include_pad = 0; -#define shape_constant_id_offset 12 +#define shape_constant_id_offset 11 layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; layout(constant_id = shape_constant_id_offset + 1) const int w = 0; layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int c = 0; -layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; +layout(constant_id = shape_constant_id_offset + 3) const int d = 0; +layout(constant_id = shape_constant_id_offset + 4) const int c = 0; +layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; -layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; +layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; +layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; +layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; +layout(constant_id = 
shape_constant_id_offset + 10) const int outc = 0; +layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; @@ -39,107 +38,150 @@ layout(push_constant) uniform parameter int dims; int w; int h; + int d; int c; int cstep; int outdims; int outw; int outh; + int outd; int outc; int outcstep; - - int wtailpad; - int htailpad; } p; void main() { - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); + int ox = int(gl_GlobalInvocationID.x); + int oy = int(gl_GlobalInvocationID.y); + int oz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (ox >= psc(outw) || oy >= psc(outh) || oz >= psc(outc)) return; - afp res; + int pl; + int pr; + int pt; + int pb; - if (pooling_type == 0) + if (pad_mode == 0 || pad_mode == 1) { - res = afp(-FLT_MAX); + pl = pad_left; + pr = pad_right; + pt = pad_top; + pb = pad_bottom; - int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; - - for (int y = 0; y < kernel_h; y++) + if (pad_mode == 0) { - for (int x = 0; x < kernel_w; x++) - { - afp v = buffer_ld1(bottom_blob_data, v_offset + x); - res = max(res, v); - } - - v_offset += psc(w); + int wtail = (psc(w) + pl + pr - kernel_w) % stride_w; + int htail = (psc(h) + pt + pb - kernel_h) % stride_h; + if (wtail != 0) pr += stride_w - wtail; + if (htail != 0) pb += stride_h - htail; } } - if (pooling_type == 1 && avgpool_count_include_pad == 0) + else { - res = afp(0.f); - int area = 0; + int wpad = kernel_w + (psc(w) - 1) / stride_w * stride_w - psc(w); + int hpad = kernel_h + (psc(h) - 1) / stride_h * stride_h - psc(h); + if (wpad < 0) wpad = 0; + if (hpad < 0) hpad = 0; - int sx = gx * stride_w; - int sy = gy * stride_h; + if (pad_mode == 2) + { + pl = wpad / 2; + pr = wpad - pl; + pt = hpad / 2; 
+ pb = hpad - pt; + } + else + { + pl = wpad - wpad / 2; + pr = wpad / 2; + pt = hpad - hpad / 2; + pb = hpad / 2; + } + } + + int inx0 = ox * stride_w - pl; + int iny0 = oy * stride_h - pt; - int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + if (pooling_type == 0) + { + afp mv = afp(-3.402823466e38); - for (int y = 0; y < kernel_h; y++) + for (int ky = 0; ky < kernel_h; ky++) { - if (sy + y < pad_top) + int iy = iny0 + ky; + for (int kx = 0; kx < kernel_w; kx++) { - v_offset += psc(w); - continue; - } - - if (sy + y >= psc(h) - pad_bottom - p.htailpad) - break; + int ix = inx0 + kx; - for (int x = 0; x < kernel_w; x++) - { - if (sx + x < pad_left) - { + if (ix < 0 || ix >= psc(w) || iy < 0 || iy >= psc(h)) continue; - } - - if (sx + x >= psc(w) - pad_right - p.wtailpad) - break; - res += buffer_ld1(bottom_blob_data, v_offset + x); - area += 1; + int si = oz * psc(cstep) + iy * psc(w) + ix; + afp v = buffer_ld1(bottom_blob_data, si); + mv = max(mv, v); } - - v_offset += psc(w); } - res /= afp(area); + int gi = oz * psc(outcstep) + oy * psc(outw) + ox; + buffer_st1(top_blob_data, gi, mv); } - if (pooling_type == 1 && avgpool_count_include_pad == 1) + else { - res = afp(0.f); + afp sum = afp(0.f); - int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; - - for (int y = 0; y < kernel_h; y++) + if (avgpool_count_include_pad == 1) { - for (int x = 0; x < kernel_w; x++) + for (int ky = 0; ky < kernel_h; ky++) { - res += buffer_ld1(bottom_blob_data, v_offset + x); + int iy = iny0 + ky; + for (int kx = 0; kx < kernel_w; kx++) + { + int ix = inx0 + kx; + + if (ix < 0 || ix >= psc(w) || iy < 0 || iy >= psc(h)) + continue; + + int si = oz * psc(cstep) + iy * psc(w) + ix; + sum += buffer_ld1(bottom_blob_data, si); + } } - v_offset += psc(w); + sum *= afp(1.f / float(kernel_w * kernel_h)); + int gi = oz * psc(outcstep) + oy * psc(outw) + ox; + buffer_st1(top_blob_data, gi, sum); } + else + { + int vx0 = max(0, -inx0); + int vy0 = max(0, -iny0); + int vx1 = 
min(kernel_w, psc(w) - inx0); + int vy1 = min(kernel_h, psc(h) - iny0); - res /= afp(kernel_w * kernel_h); - } + int area = (vx1 - vx0) * (vy1 - vy0); + if (area <= 0) + { + int gi = oz * psc(outcstep) + oy * psc(outw) + ox; + buffer_st1(top_blob_data, gi, afp(0.f)); + return; + } - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + for (int ky = vy0; ky < vy1; ky++) + { + int iy = iny0 + ky; + for (int kx = vx0; kx < vx1; kx++) + { + int ix = inx0 + kx; + int si = oz * psc(cstep) + iy * psc(w) + ix; + sum += buffer_ld1(bottom_blob_data, si); + } + } - buffer_st1(top_blob_data, gi, res); + sum *= afp(1.f / float(area)); + int gi = oz * psc(outcstep) + oy * psc(outw) + ox; + buffer_st1(top_blob_data, gi, sum); + } + } } diff --git a/src/layer/vulkan/shader/pooling_adaptive.comp b/src/layer/vulkan/shader/pooling_adaptive.comp index 003a0cb8c903..4cca86add74a 100644 --- a/src/layer/vulkan/shader/pooling_adaptive.comp +++ b/src/layer/vulkan/shader/pooling_adaptive.comp @@ -1,24 +1,13 @@ -// Copyright 2018 Tencent +// Copyright 2026 Futz12 // SPDX-License-Identifier: BSD-3-Clause #version 450 -#define FLT_MAX 3.402823466e+38 - layout(constant_id = 0) const int pooling_type = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int c = 0; -layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout(constant_id = 1) const int 
out_w_param = 0; +layout(constant_id = 2) const int out_h_param = 0; +layout(constant_id = 3) const int reserved0 = 0; +layout(constant_id = 4) const int reserved1 = 0; layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; @@ -28,73 +17,69 @@ layout(push_constant) uniform parameter int dims; int w; int h; + int d; int c; int cstep; int outdims; int outw; int outh; + int outd; int outc; int outcstep; } p; void main() { - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); + int ox = int(gl_GlobalInvocationID.x); + int oy = int(gl_GlobalInvocationID.y); + int oz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (ox >= p.outw || oy >= p.outh || oz >= p.outc) return; - afp res; + int ih0 = p.h * oy / p.outh; + int ih1 = (p.h * (oy + 1) + p.outh - 1) / p.outh; - // calculate adaptive kernel size - const int sx = psc(w) * gx / psc(outw); - const int ex = (psc(w) * (gx + 1) + psc(outw) - 1) / psc(outw); - const int kernel_w = ex - sx; - const int sy = psc(h) * gy / psc(outh); - const int ey = (psc(h) * (gy + 1) + psc(outh) - 1) / psc(outh); - const int kernel_h = ey - sy; + int iw0 = p.w * ox / p.outw; + int iw1 = (p.w * (ox + 1) + p.outw - 1) / p.outw; if (pooling_type == 0) { - res = afp(-FLT_MAX); + int si0 = oz * p.cstep + ih0 * p.w + iw0; + afp mv = buffer_ld1(bottom_blob_data, si0); - int v_offset = gz * psc(cstep) + sy * psc(w) + sx; - - for (int y = 0; y < kernel_h; y++) + for (int iy = ih0; iy < ih1; iy++) { - for (int x = 0; x < kernel_w; x++) + int base = oz * p.cstep + iy * p.w; + for (int ix = iw0; ix < iw1; ix++) { - afp v = buffer_ld1(bottom_blob_data, v_offset + x); - res = max(res, v); + afp v = buffer_ld1(bottom_blob_data, base + ix); + mv = max(mv, v); } - - v_offset += psc(w); } + + int gi = oz * p.outcstep + oy * p.outw + ox; + 
buffer_st1(top_blob_data, gi, mv); } - if (pooling_type == 1) + else { - float res_fp32 = 0.f; // force accumulation in fp32 - - int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + afp sum = afp(0.f); + int hk = ih1 - ih0; + int wk = iw1 - iw0; + int area = hk * wk; - for (int y = 0; y < kernel_h; y++) + for (int iy = ih0; iy < ih1; iy++) { - for (int x = 0; x < kernel_w; x++) + int base = oz * p.cstep + iy * p.w; + for (int ix = iw0; ix < iw1; ix++) { - res_fp32 += buffer_ld1(bottom_blob_data, v_offset + x); + sum += buffer_ld1(bottom_blob_data, base + ix); } - - v_offset += psc(w); } - res_fp32 /= float(kernel_h * kernel_w); - res = afp(res_fp32); // cast to fp16 if possible + sum *= afp(1.f / float(area)); + int gi = oz * p.outcstep + oy * p.outw + ox; + buffer_st1(top_blob_data, gi, sum); } - - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - - buffer_st1(top_blob_data, gi, res); } diff --git a/src/layer/vulkan/shader/pooling_adaptive_pack4.comp b/src/layer/vulkan/shader/pooling_adaptive_pack4.comp deleted file mode 100644 index f0d5c66a0567..000000000000 --- a/src/layer/vulkan/shader/pooling_adaptive_pack4.comp +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2019 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define FLT_MAX 3.402823466e+38 - -layout(constant_id = 0) const int pooling_type = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int c = 0; -layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout(constant_id = 
shape_constant_id_offset + 8) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) - return; - - afpvec4 res; - - // calculate adaptive kernel size - const int sx = psc(w) * gx / psc(outw); - const int ex = (psc(w) * (gx + 1) + psc(outw) - 1) / psc(outw); - const int kernel_w = ex - sx; - const int sy = psc(h) * gy / psc(outh); - const int ey = (psc(h) * (gy + 1) + psc(outh) - 1) / psc(outh); - const int kernel_h = ey - sy; - - if (pooling_type == 0) - { - res = afpvec4(-FLT_MAX); - - int v_offset = gz * psc(cstep) + sy * psc(w) + sx; - - for (int y = 0; y < kernel_h; y++) - { - for (int x = 0; x < kernel_w; x++) - { - afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x); - res = max(res, v); - } - - v_offset += psc(w); - } - } - else if (pooling_type == 1) - { - vec4 res_fp32 = vec4(0.f); // force accumulation in fp32 - - int v_offset = gz * psc(cstep) + sy * psc(w) + sx; - - for (int y = 0; y < kernel_h; y++) - { - for (int x = 0; x < kernel_w; x++) - { - res_fp32 += buffer_ld4(bottom_blob_data, v_offset + x); - } - - v_offset += psc(w); - } - - res_fp32 /= float(kernel_h * kernel_w); - res = afpvec4(res_fp32); // cast to fp16 if possible - } - - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - - buffer_st4(top_blob_data, gi, res); -} diff --git a/src/layer/vulkan/shader/pooling_global.comp b/src/layer/vulkan/shader/pooling_global.comp new file mode 100644 index 
000000000000..557221cda933 --- /dev/null +++ b/src/layer/vulkan/shader/pooling_global.comp @@ -0,0 +1,69 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +layout(constant_id = 0) const int pooling_type = 0; + +layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; + +layout(push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; +} p; + +shared lfp sdata[256]; + +void main() +{ + int cid = int(gl_WorkGroupID.x); + int tid = int(gl_LocalInvocationID.x); + + if (cid >= p.c) + return; + + int size = p.w * p.h; + + afp acc = (pooling_type == 0) ? afp(-3.402823466e38) : afp(0.f); + + for (int i = tid; i < size; i += 256) + { + int iy = i / p.w; + int ix = i - iy * p.w; + int si = cid * p.cstep + iy * p.w + ix; + afp v = buffer_ld1(bottom_blob_data, si); + + if (pooling_type == 0) + acc = max(acc, v); + else + acc += v; + } + + sdata[tid] = sfp2lfp(acc); + barrier(); + + for (int offset = 128; offset > 0; offset >>= 1) + { + if (tid < offset) + { + afp a = lfp2afp(sdata[tid]); + afp b = lfp2afp(sdata[tid + offset]); + afp r = (pooling_type == 0) ? 
max(a, b) : (a + b); + sdata[tid] = sfp2lfp(r); + } + barrier(); + } + + if (tid == 0) + { + afp outv = lfp2afp(sdata[0]); + if (pooling_type != 0) + outv *= afp(1.f / float(size)); + + buffer_st1(top_blob_data, cid, outv); + } +} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max.comp b/src/layer/vulkan/shader/pooling_global_reduce_max.comp deleted file mode 100644 index 352bd9b44410..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_max.comp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define FLT_MAX 3.402823466e+38 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int c = 0; -layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 3) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int c; - int cstep; - - int outw; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= 1 || gz >= psc(c)) - return; - - const int size_1 = psc(w) - 1; - - const int v_offset = gz * psc(cstep); - - afp res = afp(-FLT_MAX); - - for (int ii = 0; ii < 8; ii++) - { - int i = min(gx + ii * psc(outw), size_1); - - afp v = buffer_ld1(bottom_blob_data, v_offset + i); - res = max(res, v); - } - - buffer_st1(top_blob_data, gz * psc(outcstep) + gx, res); -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_first.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_first.comp deleted file mode 100644 index 
1b560eed567e..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_max_first.comp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define FLT_MAX 3.402823466e+38 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int h = 0; -layout(constant_id = shape_constant_id_offset + 2) const int c = 0; -layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 5) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= 1 || gz >= psc(c)) - return; - - const int size_1 = psc(w) * psc(h) - 1; - - const int v_offset = gz * psc(cstep); - - afp res = afp(-FLT_MAX); - - for (int ii = 0; ii < 8; ii++) - { - int i = min(gx + ii * psc(outw), size_1); - - afp v = buffer_ld1(bottom_blob_data, v_offset + i); - res = max(res, v); - } - - buffer_st1(top_blob_data, gz * psc(outcstep) + gx, res); -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_first_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_first_pack4.comp deleted file mode 100644 index 882c9269a221..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_max_first_pack4.comp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define FLT_MAX 3.402823466e+38 - -#define 
shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int h = 0; -layout(constant_id = shape_constant_id_offset + 2) const int c = 0; -layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 5) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= 1 || gz >= psc(c)) - return; - - const int size_1 = psc(w) * psc(h) - 1; - - const int v_offset = gz * psc(cstep); - - afpvec4 res = afpvec4(-FLT_MAX); - - for (int ii = 0; ii < 8; ii++) - { - int i = min(gx + ii * psc(outw), size_1); - - afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i); - res = max(res, v); - } - - buffer_st4(top_blob_data, gz * psc(outcstep) + gx, res); -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_last.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_last.comp deleted file mode 100644 index 78196494e35e..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_max_last.comp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define FLT_MAX 3.402823466e+38 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int c = 0; -layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; - -layout(binding = 0) readonly buffer 
bottom_blob { sfp bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int c; - int cstep; - int size; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= 1 || gy >= 1 || gz >= psc(c)) - return; - - const int v_offset = gz * psc(cstep); - - afp res = afp(-FLT_MAX); - - for (int i = 0; i < psc(w); i++) - { - afp v = buffer_ld1(bottom_blob_data, v_offset + i); - res = max(res, v); - } - - buffer_st1(top_blob_data, gz, res); -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_last_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_last_pack4.comp deleted file mode 100644 index c1c592bd8f22..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_max_last_pack4.comp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define FLT_MAX 3.402823466e+38 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int c = 0; -layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int c; - int cstep; - int size; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= 1 || gy >= 1 || gz >= psc(c)) - return; - - const int v_offset = gz * psc(cstep); - - afpvec4 res = afpvec4(-FLT_MAX); - - for (int i = 0; i < psc(w); i++) - { - afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i); - res = max(res, v); - } - - buffer_st4(top_blob_data, 
gz, res); -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_max_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_max_pack4.comp deleted file mode 100644 index 421c99025386..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_max_pack4.comp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define FLT_MAX 3.402823466e+38 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int c = 0; -layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 3) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int c; - int cstep; - - int outw; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= 1 || gz >= psc(c)) - return; - - const int size_1 = psc(w) - 1; - - const int v_offset = gz * psc(cstep); - - afpvec4 res = afpvec4(-FLT_MAX); - - for (int ii = 0; ii < 8; ii++) - { - int i = min(gx + ii * psc(outw), size_1); - - afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i); - res = max(res, v); - } - - buffer_st4(top_blob_data, gz * psc(outcstep) + gx, res); -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum.comp deleted file mode 100644 index a1f77a519475..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_sum.comp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - 
-#version 450 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int c = 0; -layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 3) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { float top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int c; - int cstep; - - int outw; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= 1 || gz >= psc(c)) - return; - - const int end = min(8, (psc(w) - gx - 1) / psc(outw) + 1); - - const int v_offset = gz * psc(cstep); - - float sum = 0.f; - - for (int ii = 0; ii < end; ii++) - { - int i = gx + ii * psc(outw); - - float v = bottom_blob_data[v_offset + i]; - sum += v; - } - - top_blob_data[gz * psc(outcstep) + gx] = sum; -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_first.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum_first.comp deleted file mode 100644 index 272f464d86f1..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_sum_first.comp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int h = 0; -layout(constant_id = shape_constant_id_offset + 2) const int c = 0; -layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout(constant_id = 
shape_constant_id_offset + 5) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { float top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= 1 || gz >= psc(c)) - return; - - const int end = min(8, (psc(w) * psc(h) - gx - 1) / psc(outw) + 1); - - const int v_offset = gz * psc(cstep); - - float sum = 0.f; - - for (int ii = 0; ii < end; ii++) - { - int i = gx + ii * psc(outw); - - afp v = buffer_ld1(bottom_blob_data, v_offset + i); - sum += float(v); - } - - top_blob_data[gz * psc(outcstep) + gx] = sum; -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_first_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum_first_pack4.comp deleted file mode 100644 index 4b75f4b95da5..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_sum_first_pack4.comp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int h = 0; -layout(constant_id = shape_constant_id_offset + 2) const int c = 0; -layout(constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 5) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - 
int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= 1 || gz >= psc(c)) - return; - - const int end = min(8, (psc(w) * psc(h) - gx - 1) / psc(outw) + 1); - - const int v_offset = gz * psc(cstep); - - vec4 sum = vec4(0.f); - - for (int ii = 0; ii < end; ii++) - { - int i = gx + ii * psc(outw); - - afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i); - sum += vec4(v); - } - - top_blob_data[gz * psc(outcstep) + gx] = sum; -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_last.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum_last.comp deleted file mode 100644 index be8100ede20f..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_sum_last.comp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int c = 0; -layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int c; - int cstep; - int size; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= 1 || gy >= 1 || gz >= psc(c)) - return; - - const int v_offset = gz * psc(cstep); - - float sum = 0.f; - - for (int i = 0; i < psc(w); i++) - { - float v = bottom_blob_data[v_offset + i]; - sum += v; - } - - afp res = afp(sum / p.size); - - buffer_st1(top_blob_data, gz, res); -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_last_pack4.comp 
b/src/layer/vulkan/shader/pooling_global_reduce_sum_last_pack4.comp deleted file mode 100644 index 8b5dd8b92ccf..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_sum_last_pack4.comp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int c = 0; -layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int c; - int cstep; - int size; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= 1 || gy >= 1 || gz >= psc(c)) - return; - - const int v_offset = gz * psc(cstep); - - vec4 sum = vec4(0.f); - - for (int i = 0; i < psc(w); i++) - { - vec4 v = bottom_blob_data[v_offset + i]; - sum += v; - } - - afpvec4 res = afpvec4(sum / p.size); - - buffer_st4(top_blob_data, gz, res); -} diff --git a/src/layer/vulkan/shader/pooling_global_reduce_sum_pack4.comp b/src/layer/vulkan/shader/pooling_global_reduce_sum_pack4.comp deleted file mode 100644 index c5f3184350c4..000000000000 --- a/src/layer/vulkan/shader/pooling_global_reduce_sum_pack4.comp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2023 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define shape_constant_id_offset 0 -layout(constant_id = shape_constant_id_offset + 0) const int w = 0; -layout(constant_id = shape_constant_id_offset + 1) const int c = 0; -layout(constant_id = shape_constant_id_offset + 2) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 3) const int outw = 0; -layout(constant_id = 
shape_constant_id_offset + 4) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int w; - int c; - int cstep; - - int outw; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= 1 || gz >= psc(c)) - return; - - const int end = min(8, (psc(w) - gx - 1) / psc(outw) + 1); - - const int v_offset = gz * psc(cstep); - - vec4 sum = vec4(0.f); - - for (int ii = 0; ii < end; ii++) - { - int i = gx + ii * psc(outw); - - vec4 v = bottom_blob_data[v_offset + i]; - sum += v; - } - - top_blob_data[gz * psc(outcstep) + gx] = sum; -}
diff --git a/src/layer/vulkan/shader/pooling_global_stage1.comp b/src/layer/vulkan/shader/pooling_global_stage1.comp
new file mode 100644
index 000000000000..385981853cbc
--- /dev/null
+++ b/src/layer/vulkan/shader/pooling_global_stage1.comp
@@ -0,0 +1,81 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+// Global pooling, stage 1: every workgroup reduces one chunk of up to
+// 256 * 4 elements of a single channel into one partial result.
+// The tree reduction below is written for 256 invocations along x.
+
+// 0 selects max pooling, any other value selects a running sum
+layout(constant_id = 0) const int pooling_type = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout(binding = 1) writeonly buffer partial_blob { sfp partial_blob_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
+    int partial_w; // stride between channels in partial_blob
+} p;
+
+// Scratch is kept in fp32 so that summing many elements does not overflow
+// or lose precision when the afp/lfp macros resolve to a half type.
+shared float sdata[256];
+
+void main()
+{
+    int cid = int(gl_WorkGroupID.y);
+    int chunk = int(gl_WorkGroupID.x);
+    int tid = int(gl_LocalInvocationID.x);
+
+    // cid comes from gl_WorkGroupID, so the whole workgroup leaves together
+    // and no invocation is left waiting at barrier()
+    if (cid >= p.c)
+        return;
+
+    int size = p.w * p.h;
+
+    int base = chunk * (256 * 4);
+    int idx = base + tid;
+
+    float acc = (pooling_type == 0) ? -3.402823466e38 : 0.f;
+
+    for (int u = 0; u < 4; u++)
+    {
+        int k = idx + u * 256;
+        if (k < size)
+        {
+            // k already is the offset inside the channel (iy * w + ix)
+            float v = float(buffer_ld1(bottom_blob_data, cid * p.cstep + k));
+
+            if (pooling_type == 0)
+                acc = max(acc, v);
+            else
+                acc += v;
+        }
+    }
+
+    sdata[tid] = acc;
+    barrier();
+
+    for (int offset = 128; offset > 0; offset >>= 1)
+    {
+        if (tid < offset)
+        {
+            float a = sdata[tid];
+            float b = sdata[tid + offset];
+            sdata[tid] = (pooling_type == 0) ? max(a, b) : (a + b);
+        }
+        barrier();
+    }
+
+    if (tid == 0)
+    {
+        int oi = cid * p.partial_w + chunk;
+        buffer_st1(partial_blob_data, oi, afp(sdata[0]));
+    }
+}
diff --git a/src/layer/vulkan/shader/pooling_global_stage2.comp b/src/layer/vulkan/shader/pooling_global_stage2.comp
new file mode 100644
index 000000000000..596c7e4091c1
--- /dev/null
+++ b/src/layer/vulkan/shader/pooling_global_stage2.comp
@@ -0,0 +1,72 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+// Global pooling, stage 2: one workgroup per channel folds the partial
+// results of that channel into the final value. For non-max pooling the
+// total is scaled by 1 / in_size.
+// The tree reduction below is written for 256 invocations along x.
+
+// 0 selects max pooling, any other value selects a running sum
+layout(constant_id = 0) const int pooling_type = 0;
+
+layout(binding = 0) readonly buffer partial_blob { sfp partial_blob_data[]; };
+layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int partial_w;
+    int c;
+    int in_size;
+} p;
+
+// Scratch is kept in fp32: the total of a whole channel is formed here
+// before it is scaled, so it must not be held in a half type.
+shared float sdata[256];
+
+void main()
+{
+    int cid = int(gl_WorkGroupID.x);
+    int tid = int(gl_LocalInvocationID.x);
+
+    // cid comes from gl_WorkGroupID, so the early exit is uniform per workgroup
+    if (cid >= p.c)
+        return;
+
+    float acc = (pooling_type == 0) ? -3.402823466e38 : 0.f;
+
+    for (int i = tid; i < p.partial_w; i += 256)
+    {
+        int si = cid * p.partial_w + i;
+        float v = float(buffer_ld1(partial_blob_data, si));
+
+        if (pooling_type == 0)
+            acc = max(acc, v);
+        else
+            acc += v;
+    }
+
+    sdata[tid] = acc;
+    barrier();
+
+    for (int offset = 128; offset > 0; offset >>= 1)
+    {
+        if (tid < offset)
+        {
+            float a = sdata[tid];
+            float b = sdata[tid + offset];
+            sdata[tid] = (pooling_type == 0) ? max(a, b) : (a + b);
+        }
+        barrier();
+    }
+
+    if (tid == 0)
+    {
+        float outv = sdata[0];
+        if (pooling_type != 0)
+            outv *= 1.f / float(p.in_size); // scale in fp32, narrow only on store
+
+        buffer_st1(top_blob_data, cid, afp(outv));
+    }
+}
diff --git a/src/layer/vulkan/shader/pooling_pack4.comp b/src/layer/vulkan/shader/pooling_pack4.comp deleted file mode 100644 index a14377946a0b..000000000000 --- a/src/layer/vulkan/shader/pooling_pack4.comp +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2019 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -#define FLT_MAX 3.402823466e+38 - -layout(constant_id = 0) const int pooling_type = 0; -layout(constant_id = 1) const int kernel_w = 1; -layout(constant_id = 2) const int kernel_h = 1; -layout(constant_id = 3) const int stride_w = 1; -layout(constant_id = 4) const int stride_h = 1; -layout(constant_id = 5) const int pad_left = 0; -layout(constant_id = 6) const int pad_right = 0; -layout(constant_id = 7) const int pad_top = 0; -layout(constant_id = 8) const int pad_bottom = 0; -layout(constant_id = 9) const int global_pooling = 0; -layout(constant_id = 10) const int pad_mode = 0; -layout(constant_id = 11) const int avgpool_count_include_pad = 0; - -#define shape_constant_id_offset 12 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout(constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; -layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outc; - int outcstep; - - int wtailpad; - int htailpad; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) - return; - - afpvec4 res; - - if (pooling_type == 0) - { - res = afpvec4(-FLT_MAX); - - int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; - - for (int y = 0; y < kernel_h; y++) - { - for (int x = 0; x < kernel_w; x++) - { - afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x); - res = max(res, v); - } - - v_offset += psc(w); - } - } - else if (pooling_type == 1 && avgpool_count_include_pad == 0) - { - res = afpvec4(0.f); - int area = 0; - - int sx = gx * stride_w; - int sy = gy * stride_h; - - int v_offset = gz * psc(cstep) + sy * psc(w) + sx; - - for (int y = 0; y < kernel_h; y++) - { - if (sy + y < pad_top) - { - v_offset += psc(w); - continue; - } - - if (sy + y >= psc(h) - pad_bottom - p.htailpad) - break; - - for (int x = 0; x < kernel_w; x++) - { - if (sx + x < pad_left) - { - continue; - } - - if (sx + x >= psc(w) - pad_right - p.wtailpad) - break; - - res += buffer_ld4(bottom_blob_data, v_offset + x); - area += 1; - } - - v_offset += 
psc(w); - } - - res /= afp(area); - } - else if (pooling_type == 1 && avgpool_count_include_pad == 1) - { - res = afpvec4(0.f); - - int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; - - for (int y = 0; y < kernel_h; y++) - { - for (int x = 0; x < kernel_w; x++) - { - res += buffer_ld4(bottom_blob_data, v_offset + x); - } - - v_offset += psc(w); - } - - res /= afp(kernel_w * kernel_h); - } - - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; - - buffer_st4(top_blob_data, gi, res); -}
diff --git a/src/layer/vulkan/shader/pooling_tile.comp b/src/layer/vulkan/shader/pooling_tile.comp
new file mode 100644
index 000000000000..0fa989926e27
--- /dev/null
+++ b/src/layer/vulkan/shader/pooling_tile.comp
@@ -0,0 +1,236 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+// Tiled 2D pooling: a workgroup stages the input patch needed by an
+// 8x8 block of outputs in shared memory, then each invocation pools
+// its own window out of that patch.
+
+layout(constant_id = 0) const int pooling_type = 0;
+layout(constant_id = 1) const int kernel_w = 0;
+layout(constant_id = 2) const int kernel_h = 0;
+layout(constant_id = 3) const int stride_w = 1;
+layout(constant_id = 4) const int stride_h = 1;
+layout(constant_id = 5) const int pad_left = 0;
+layout(constant_id = 6) const int pad_right = 0;
+layout(constant_id = 7) const int pad_top = 0;
+layout(constant_id = 8) const int pad_bottom = 0;
+layout(constant_id = 9) const int pad_mode = 0;
+layout(constant_id = 10) const int avgpool_count_include_pad = 0;
+
+#define shape_constant_id_offset 11
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int d = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0;
+
+layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0;
+layout(constant_id = shape_constant_id_offset + 7) const int outw = 0;
+layout(constant_id = shape_constant_id_offset + 8) const int outh = 0;
+layout(constant_id = shape_constant_id_offset + 9) const int outd = 0;
+layout(constant_id = shape_constant_id_offset + 10) const int outc = 0;
+layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int d;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outd;
+    int outc;
+    int outcstep;
+} p;
+
+const int tile_out_w = 8;
+const int tile_out_h = 8;
+// NOTE(review): the staged patch is (tile_out - 1) * stride + kernel cells per axis and
+// nothing in this shader checks it against tile_in_max - confirm the host enforces it.
+const int tile_in_max = 36;
+
+shared lfp tile[tile_in_max][tile_in_max];
+
+void main()
+{
+    int lid_x = int(gl_LocalInvocationID.x);
+    int lid_y = int(gl_LocalInvocationID.y);
+
+    int gx0 = int(gl_WorkGroupID.x) * tile_out_w;
+    int gy0 = int(gl_WorkGroupID.y) * tile_out_h;
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gz >= psc(outc))
+        return;
+
+    // resolve the effective padding on each side from pad_mode
+    int pl;
+    int pr;
+    int pt;
+    int pb;
+
+    if (pad_mode == 0 || pad_mode == 1)
+    {
+        pl = pad_left;
+        pr = pad_right;
+        pt = pad_top;
+        pb = pad_bottom;
+
+        if (pad_mode == 0)
+        {
+            // grow right/bottom padding so the trailing partial window is complete
+            int wtail = (psc(w) + pl + pr - kernel_w) % stride_w;
+            int htail = (psc(h) + pt + pb - kernel_h) % stride_h;
+            if (wtail != 0) pr += stride_w - wtail;
+            if (htail != 0) pb += stride_h - htail;
+        }
+    }
+    else
+    {
+        int wpad = kernel_w + (psc(w) - 1) / stride_w * stride_w - psc(w);
+        int hpad = kernel_h + (psc(h) - 1) / stride_h * stride_h - psc(h);
+        if (wpad < 0) wpad = 0;
+        if (hpad < 0) hpad = 0;
+
+        if (pad_mode == 2)
+        {
+            pl = wpad / 2;
+            pr = wpad - pl;
+            pt = hpad / 2;
+            pb = hpad - pt;
+        }
+        else
+        {
+            pl = wpad - wpad / 2;
+            pr = wpad / 2;
+            pt = hpad - hpad / 2;
+            pb = hpad / 2;
+        }
+    }
+
+    int tile_in_w = (tile_out_w - 1) * stride_w + kernel_w;
+    int tile_in_h = (tile_out_h - 1) * stride_h + kernel_h;
+
+    int in_x0 = gx0 * stride_w - pl;
+    int in_y0 = gy0 * stride_h - pt;
+
+    // cells outside the input hold a value that is neutral for the reduction
+    lfp padv = (pooling_type == 0) ? sfp2lfp(afp(-3.402823466e38)) : sfp2lfp(afp(0.f));
+
+    int l = lid_y * tile_out_w + lid_x;
+    int lsize = tile_out_w * tile_out_h;
+    int tilesz = tile_in_w * tile_in_h;
+
+    // invocations walk the patch lsize cells at a time until it is filled
+    for (int i = l; i < tilesz; i += lsize)
+    {
+        int ty = i / tile_in_w;
+        int tx = i - ty * tile_in_w;
+
+        int ix = in_x0 + tx;
+        int iy = in_y0 + ty;
+
+        lfp v = padv;
+        if (ix >= 0 && ix < psc(w) && iy >= 0 && iy < psc(h))
+        {
+            int si = gz * psc(cstep) + iy * psc(w) + ix;
+            v = sfp2lfp(buffer_ld1(bottom_blob_data, si));
+        }
+
+        tile[ty][tx] = v;
+    }
+
+    barrier();
+
+    int ox = gx0 + lid_x;
+    int oy = gy0 + lid_y;
+
+    // no barrier follows, so invocations past the output edge can leave now
+    if (ox >= psc(outw) || oy >= psc(outh))
+        return;
+
+    int tx0 = lid_x * stride_w;
+    int ty0 = lid_y * stride_h;
+
+    afp outv;
+
+    if (pooling_type == 0)
+    {
+        afp mv = lfp2afp(tile[ty0][tx0]);
+
+        for (int ky = 0; ky < kernel_h; ky++)
+        {
+            for (int kx = 0; kx < kernel_w; kx++)
+            {
+                afp v = lfp2afp(tile[ty0 + ky][tx0 + kx]);
+                mv = max(mv, v);
+            }
+        }
+
+        outv = mv;
+    }
+    else
+    {
+        afp sum = afp(0.f);
+
+        if (avgpool_count_include_pad == 1)
+        {
+            for (int ky = 0; ky < kernel_h; ky++)
+            {
+                for (int kx = 0; kx < kernel_w; kx++)
+                {
+                    sum += lfp2afp(tile[ty0 + ky][tx0 + kx]);
+                }
+            }
+
+            sum *= afp(1.f / float(kernel_w * kernel_h));
+            outv = sum;
+        }
+        else
+        {
+            int inx0 = ox * stride_w - pl;
+            int iny0 = oy * stride_h - pt;
+
+            // clip the window to the cells that lie inside the input
+            int vx0 = max(0, -inx0);
+            int vy0 = max(0, -iny0);
+            int vx1 = min(kernel_w, psc(w) - inx0);
+            int vy1 = min(kernel_h, psc(h) - iny0);
+
+            // clamp each extent on its own: two negative extents must not
+            // multiply into a positive area
+            int area = max(vx1 - vx0, 0) * max(vy1 - vy0, 0);
+            if (area <= 0)
+            {
+                outv = afp(0.f);
+            }
+            else
+            {
+                for (int ky = vy0; ky < vy1; ky++)
+                {
+                    for (int kx = vx0; kx < vx1; kx++)
+                    {
+                        sum += lfp2afp(tile[ty0 + ky][tx0 + kx]);
+                    }
+                }
+
+                sum *= afp(1.f / float(area));
+                outv = sum;
+            }
+        }
+    }
+
+    int gi = gz * psc(outcstep) + oy * psc(outw) + ox;
+    buffer_st1(top_blob_data, gi, outv);
+}