diff --git a/src/layer/vulkan/shader/softmax.comp b/src/layer/vulkan/shader/softmax.comp
new file mode 100644
index 000000000000..9c05cb7e15d1
--- /dev/null
+++ b/src/layer/vulkan/shader/softmax.comp
@@ -0,0 +1,261 @@
+// softmax.comp
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+// Softmax along one axis of an unpacked blob.
+// One workgroup processes one slice (a 1-D run along the softmax axis): its
+// invocations cooperatively compute the slice maximum, then the sum of
+// exp(v - max), and finally store exp(v - max) / sum to the output blob.
+
+layout(constant_id = 0) const int axis = 0;
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int d = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout(binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int d;
+    int c;
+    int cstep;
+    int outcstep;
+} p;
+
+// Per-invocation partial results for the two workgroup reductions.
+// NOTE(review): capacity is fixed at 256 entries, so this shader must not be
+// launched with gl_WorkGroupSize.x > 256 - confirm against the host pipeline.
+shared lfp smaxv[256];
+shared lfp ssumv[256];
+
+// Folds smaxv[0 .. lsize) into smaxv[0] using max().
+// Every invocation of the workgroup must call this (it executes barriers), and
+// the partials must already be published by a barrier(). The live range is
+// halved rounding up, so no element is dropped when lsize is not a power of
+// two; for power-of-two sizes the pairing equals the classic halving tree.
+void reduce_maxv(int lid, int lsize)
+{
+    for (int n = lsize; n > 1; n = (n + 1) >> 1)
+    {
+        int half_n = (n + 1) >> 1;
+        if (lid + half_n < n)
+        {
+            afp a = lfp2afp(smaxv[lid]);
+            afp b = lfp2afp(smaxv[lid + half_n]);
+            smaxv[lid] = max(a, b);
+        }
+        barrier();
+    }
+}
+
+// Folds ssumv[0 .. lsize) into ssumv[0] using addition.
+// Same calling contract and pairing scheme as reduce_maxv().
+void reduce_sumv(int lid, int lsize)
+{
+    for (int n = lsize; n > 1; n = (n + 1) >> 1)
+    {
+        int half_n = (n + 1) >> 1;
+        if (lid + half_n < n)
+        {
+            afp a = lfp2afp(ssumv[lid]);
+            afp b = lfp2afp(ssumv[lid + half_n]);
+            ssumv[lid] = a + b;
+        }
+        barrier();
+    }
+}
+
+void main()
+{
+    int slice = int(gl_WorkGroupID.x);
+    int lid = int(gl_LocalInvocationID.x);
+    int lsize = int(gl_WorkGroupSize.x);
+
+    int dims_ = psc(dims);
+    int w_ = psc(w);
+    int h_ = psc(h);
+    int d_ = psc(d);
+    int c_ = psc(c);
+    int cstep_ = psc(cstep);
+    int outcstep_ = psc(outcstep);
+
+    // map a negative axis onto its non-negative equivalent
+    int pa = axis < 0 ? dims_ + axis : axis;
+
+    // Slice addressing: element i of this workgroup's slice is read from
+    // base_in + i * stride_in and written to base_out + i * stride_out.
+    int base_in = 0;
+    int base_out = 0;
+    int size = 0;
+    int stride_in = 1;
+    int stride_out = 1;
+
+    if (dims_ == 1)
+    {
+        // a single slice covering all w_ elements; the slice index is unused
+        base_in = 0;
+        base_out = 0;
+        size = w_;
+        stride_in = 1;
+        stride_out = 1;
+    }
+    else if (dims_ == 2)
+    {
+        if (pa == 0)
+        {
+            // slice = column x, stepping through the h_ rows
+            int x = slice;
+            base_in = x;
+            base_out = x;
+            size = h_;
+            stride_in = w_;
+            stride_out = w_;
+        }
+        else
+        {
+            // slice = row y, stepping along its w_ elements
+            int y = slice;
+            base_in = y * w_;
+            base_out = y * w_;
+            size = w_;
+            stride_in = 1;
+            stride_out = 1;
+        }
+    }
+    else if (dims_ == 3)
+    {
+        if (pa == 0)
+        {
+            // slice = offset inside a channel, stepping through the c_ channels
+            int xy = slice;
+            base_in = xy;
+            base_out = xy;
+            size = c_;
+            stride_in = cstep_;
+            stride_out = outcstep_;
+        }
+        else if (pa == 1)
+        {
+            // slice = (channel q, column x), stepping through the h_ rows
+            int q = slice / w_;
+            int x = slice - q * w_;
+            base_in = q * cstep_ + x;
+            base_out = q * outcstep_ + x;
+            size = h_;
+            stride_in = w_;
+            stride_out = w_;
+        }
+        else
+        {
+            // slice = (channel q, row y), stepping along its w_ elements
+            int q = slice / h_;
+            int y = slice - q * h_;
+            base_in = q * cstep_ + y * w_;
+            base_out = q * outcstep_ + y * w_;
+            size = w_;
+            stride_in = 1;
+            stride_out = 1;
+        }
+    }
+    else
+    {
+        // 4-D addressing (any dims_ value other than 1, 2, 3 lands here)
+        int plane = w_ * h_;
+
+        if (pa == 0)
+        {
+            // slice = offset inside a channel, stepping through the c_ channels
+            int xyd = slice;
+            base_in = xyd;
+            base_out = xyd;
+            size = c_;
+            stride_in = cstep_;
+            stride_out = outcstep_;
+        }
+        else if (pa == 1)
+        {
+            // slice = (channel q, in-plane offset xy), stepping through d_
+            int q = slice / plane;
+            int xy = slice - q * plane;
+            base_in = q * cstep_ + xy;
+            base_out = q * outcstep_ + xy;
+            size = d_;
+            stride_in = plane;
+            stride_out = plane;
+        }
+        else if (pa == 2)
+        {
+            // slice = (channel q, depth z, column x), stepping through h_ rows
+            int t = d_ * w_;
+            int q = slice / t;
+            int rem = slice - q * t;
+            int z = rem / w_;
+            int x = rem - z * w_;
+            base_in = q * cstep_ + z * plane + x;
+            base_out = q * outcstep_ + z * plane + x;
+            size = h_;
+            stride_in = w_;
+            stride_out = w_;
+        }
+        else
+        {
+            // slice = (channel q, depth z, row y), stepping along w_ elements
+            int t = d_ * h_;
+            int q = slice / t;
+            int rem = slice - q * t;
+            int z = rem / h_;
+            int y = rem - z * h_;
+            base_in = q * cstep_ + z * plane + y * w_;
+            base_out = q * outcstep_ + z * plane + y * w_;
+            size = w_;
+            stride_in = 1;
+            stride_out = 1;
+        }
+    }
+
+    // pass 1: slice maximum, seeded with the most negative finite fp32 value
+    afp lmax = afp(-3.402823466e38);
+    for (int i = lid; i < size; i += lsize)
+    {
+        afp v = buffer_ld1(bottom_blob_data, base_in + i * stride_in);
+        lmax = max(lmax, v);
+    }
+
+    smaxv[lid] = lmax;
+    barrier();
+    reduce_maxv(lid, lsize);
+    afp maxv = lfp2afp(smaxv[0]);
+
+    // pass 2: sum of exp(v - max); subtracting the max keeps every term <= 1
+    afp lsum = afp(0.f);
+    for (int i = lid; i < size; i += lsize)
+    {
+        afp v = buffer_ld1(bottom_blob_data, base_in + i * stride_in);
+        lsum += exp(v - maxv);
+    }
+
+    ssumv[lid] = lsum;
+    barrier();
+    reduce_sumv(lid, lsize);
+    afp invsum = afp(1.f) / lfp2afp(ssumv[0]);
+
+    // pass 3: recompute exp(v - max) and store the normalized value
+    for (int i = lid; i < size; i += lsize)
+    {
+        afp v = buffer_ld1(bottom_blob_data, base_in + i * stride_in);
+        afp e = exp(v - maxv) * invsum;
+        buffer_st1(top_blob_data, base_out + i * stride_out, e);
+    }
+}
diff --git a/src/layer/vulkan/shader/softmax_div_sum.comp b/src/layer/vulkan/shader/softmax_div_sum.comp deleted file mode 100644 index 6a3c1c371279..000000000000 --- a/src/layer/vulkan/shader/softmax_div_sum.comp +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2019 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -layout(constant_id = 0) const int axis = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int d = 0; -layout(constant_id = shape_constant_id_offset + 4) const int c = 0; -layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; -layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 11) 
const int outcstep = 0; - -layout(binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; -layout(binding = 1) readonly buffer sum_workspace { sfp sum_workspace_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int d; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outd; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(w) || gy >= psc(h) * psc(d) || gz >= psc(c)) - return; - - int positive_axis = axis < 0 ? psc(dims) + axis : axis; - - int gi = gz * psc(cstep) + gy * psc(w) + gx; - - afp v = buffer_ld1(bottom_top_blob_data, gi); - - afp sum; - - if (psc(dims) == 1) // positive_axis == 0 - { - sum = buffer_ld1(sum_workspace_data, 0); - } - else if (psc(dims) == 2 && positive_axis == 0) - { - sum = buffer_ld1(sum_workspace_data, gx); - } - else if (psc(dims) == 2 && positive_axis == 1) - { - sum = buffer_ld1(sum_workspace_data, gy); - } - else if (psc(dims) == 3 && positive_axis == 0) - { - sum = buffer_ld1(sum_workspace_data, gy * psc(w) + gx); - } - else if (psc(dims) == 3 && positive_axis == 1) - { - sum = buffer_ld1(sum_workspace_data, gz * psc(w) + gx); - } - else if (psc(dims) == 3 && positive_axis == 2) - { - sum = buffer_ld1(sum_workspace_data, gz * psc(h) + gy); - } - else // if (psc(dims) == 4) - { - int yd = gy / psc(h); - int yh = gy % psc(h); - - gi = gz * psc(cstep) + yd * psc(h) * psc(w) + yh * psc(w) + gx; - - if (positive_axis == 0) - { - sum = buffer_ld1(sum_workspace_data, yd * psc(outcstep) + yh * psc(w) + gx); - } - if (positive_axis == 1) - { - sum = buffer_ld1(sum_workspace_data, gz * psc(outcstep) + yh * psc(w) + gx); - } - if (positive_axis == 2) - { - sum = buffer_ld1(sum_workspace_data, gz * psc(outcstep) + yd * psc(w) + gx); - } - if (positive_axis == 3) - { - sum = buffer_ld1(sum_workspace_data, gz * psc(outcstep) + yd * 
psc(h) + yh); - } - } - - v /= sum; - - buffer_st1(bottom_top_blob_data, gi, v); -} diff --git a/src/layer/vulkan/shader/softmax_div_sum_pack4.comp b/src/layer/vulkan/shader/softmax_div_sum_pack4.comp deleted file mode 100644 index 92d882e38884..000000000000 --- a/src/layer/vulkan/shader/softmax_div_sum_pack4.comp +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2019 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -layout(constant_id = 0) const int axis = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int d = 0; -layout(constant_id = shape_constant_id_offset + 4) const int c = 0; -layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; -layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; - -layout(binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; -layout(binding = 1) readonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int d; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outd; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(w) || gy >= psc(h) * psc(d) || gz >= psc(c)) - return; - - int positive_axis = axis < 0 ? 
psc(dims) + axis : axis; - - int gi = gz * psc(cstep) + gy * psc(w) + gx; - - afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); - - afpvec4 sum; - - if (psc(dims) == 1) // positive_axis == 0 - { - sum = buffer_ld4(sum_workspace_data, 0); - } - else if (psc(dims) == 2 && positive_axis == 0) - { - sum = buffer_ld4(sum_workspace_data, gx); - } - else if (psc(dims) == 2 && positive_axis == 1) - { - sum = buffer_ld4(sum_workspace_data, gy); - } - else if (psc(dims) == 3 && positive_axis == 0) - { - sum = buffer_ld4(sum_workspace_data, gy * psc(w) + gx); - } - else if (psc(dims) == 3 && positive_axis == 1) - { - sum = buffer_ld4(sum_workspace_data, gz * psc(w) + gx); - } - else if (psc(dims) == 3 && positive_axis == 2) - { - sum = buffer_ld4(sum_workspace_data, gz * psc(h) + gy); - } - else // if (psc(dims) == 4) - { - int yd = gy / psc(h); - int yh = gy % psc(h); - - gi = gz * psc(cstep) + yd * psc(h) * psc(w) + yh * psc(w) + gx; - - if (positive_axis == 0) - { - sum = buffer_ld4(sum_workspace_data, yd * psc(outcstep) + yh * psc(w) + gx); - } - if (positive_axis == 1) - { - sum = buffer_ld4(sum_workspace_data, gz * psc(outcstep) + yh * psc(w) + gx); - } - if (positive_axis == 2) - { - sum = buffer_ld4(sum_workspace_data, gz * psc(outcstep) + yd * psc(w) + gx); - } - if (positive_axis == 3) - { - sum = buffer_ld4(sum_workspace_data, gz * psc(outcstep) + yd * psc(h) + yh); - } - } - -#if NCNN_fp16_packed || NCNN_fp16_storage - // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s - // TODO only enable this workaround for some nvidia driver - if (positive_axis == 0) - { - sum = afpvec4(sum.r); - } -#endif - - v /= sum; - - buffer_st4(bottom_top_blob_data, gi, v); -} diff --git a/src/layer/vulkan/shader/softmax_exp_sub_max.comp b/src/layer/vulkan/shader/softmax_exp_sub_max.comp deleted file mode 100644 index 050671d7c603..000000000000 --- a/src/layer/vulkan/shader/softmax_exp_sub_max.comp +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2019 Tencent -// 
SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -layout(constant_id = 0) const int axis = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int d = 0; -layout(constant_id = shape_constant_id_offset + 4) const int c = 0; -layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; -layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; - -layout(binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; -layout(binding = 1) readonly buffer max_workspace { sfp max_workspace_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int d; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outd; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(w) || gy >= psc(h) * psc(d) || gz >= psc(c)) - return; - - int positive_axis = axis < 0 ? 
psc(dims) + axis : axis; - - int gi = gz * psc(cstep) + gy * psc(w) + gx; - - afp v = buffer_ld1(bottom_top_blob_data, gi); - - afp max_value; - - if (psc(dims) == 1) // positive_axis == 0 - { - max_value = buffer_ld1(max_workspace_data, 0); - } - else if (psc(dims) == 2 && positive_axis == 0) - { - max_value = buffer_ld1(max_workspace_data, gx); - } - else if (psc(dims) == 2 && positive_axis == 1) - { - max_value = buffer_ld1(max_workspace_data, gy); - } - else if (psc(dims) == 3 && positive_axis == 0) - { - max_value = buffer_ld1(max_workspace_data, gy * psc(w) + gx); - } - else if (psc(dims) == 3 && positive_axis == 1) - { - max_value = buffer_ld1(max_workspace_data, gz * psc(w) + gx); - } - else if (psc(dims) == 3 && positive_axis == 2) - { - max_value = buffer_ld1(max_workspace_data, gz * psc(h) + gy); - } - else // if (psc(dims) == 4) - { - int yd = gy / psc(h); - int yh = gy % psc(h); - - gi = gz * psc(cstep) + yd * psc(h) * psc(w) + yh * psc(w) + gx; - - if (positive_axis == 0) - { - max_value = buffer_ld1(max_workspace_data, yd * psc(outcstep) + yh * psc(w) + gx); - } - if (positive_axis == 1) - { - max_value = buffer_ld1(max_workspace_data, gz * psc(outcstep) + yh * psc(w) + gx); - } - if (positive_axis == 2) - { - max_value = buffer_ld1(max_workspace_data, gz * psc(outcstep) + yd * psc(w) + gx); - } - if (positive_axis == 3) - { - max_value = buffer_ld1(max_workspace_data, gz * psc(outcstep) + yd * psc(h) + yh); - } - } - - v = exp(v - max_value); - - buffer_st1(bottom_top_blob_data, gi, v); -} diff --git a/src/layer/vulkan/shader/softmax_exp_sub_max_pack4.comp b/src/layer/vulkan/shader/softmax_exp_sub_max_pack4.comp deleted file mode 100644 index 76b1325b79d6..000000000000 --- a/src/layer/vulkan/shader/softmax_exp_sub_max_pack4.comp +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2019 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -layout(constant_id = 0) const int axis = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = 
shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int d = 0; -layout(constant_id = shape_constant_id_offset + 4) const int c = 0; -layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; -layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; - -layout(binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; -layout(binding = 1) readonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int d; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outd; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(w) || gy >= psc(h) * psc(d) || gz >= psc(c)) - return; - - int positive_axis = axis < 0 ? 
psc(dims) + axis : axis; - - int gi = gz * psc(cstep) + gy * psc(w) + gx; - - afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); - - afpvec4 max_value; - - if (psc(dims) == 1) // positive_axis == 0 - { - max_value = buffer_ld4(max_workspace_data, 0); - } - else if (psc(dims) == 2 && positive_axis == 0) - { - max_value = buffer_ld4(max_workspace_data, gx); - } - else if (psc(dims) == 2 && positive_axis == 1) - { - max_value = buffer_ld4(max_workspace_data, gy); - } - else if (psc(dims) == 3 && positive_axis == 0) - { - max_value = buffer_ld4(max_workspace_data, gy * psc(w) + gx); - } - else if (psc(dims) == 3 && positive_axis == 1) - { - max_value = buffer_ld4(max_workspace_data, gz * psc(w) + gx); - } - else if (psc(dims) == 3 && positive_axis == 2) - { - max_value = buffer_ld4(max_workspace_data, gz * psc(h) + gy); - } - else // if (psc(dims) == 4) - { - int yd = gy / psc(h); - int yh = gy % psc(h); - - gi = gz * psc(cstep) + yd * psc(h) * psc(w) + yh * psc(w) + gx; - - if (positive_axis == 0) - { - max_value = buffer_ld4(max_workspace_data, yd * psc(outcstep) + yh * psc(w) + gx); - } - if (positive_axis == 1) - { - max_value = buffer_ld4(max_workspace_data, gz * psc(outcstep) + yh * psc(w) + gx); - } - if (positive_axis == 2) - { - max_value = buffer_ld4(max_workspace_data, gz * psc(outcstep) + yd * psc(w) + gx); - } - if (positive_axis == 3) - { - max_value = buffer_ld4(max_workspace_data, gz * psc(outcstep) + yd * psc(h) + yh); - } - } - -#if NCNN_fp16_packed || NCNN_fp16_storage - // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s - // TODO only enable this workaround for some nvidia driver - if (positive_axis == 0) - { - max_value = afpvec4(max_value.r); - } -#endif - - v = exp(v - max_value); - - buffer_st4(bottom_top_blob_data, gi, v); -}
diff --git a/src/layer/vulkan/shader/softmax_pack4.comp b/src/layer/vulkan/shader/softmax_pack4.comp
new file mode 100644
index 000000000000..156d1556c35c
--- /dev/null
+++ b/src/layer/vulkan/shader/softmax_pack4.comp
@@ -0,0 +1,320 @@
+// softmax_pack4.comp
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+// Softmax along one axis of a blob whose elements are 4-wide vectors.
+// One workgroup processes one slice. For axis 0 the four lanes of every
+// vector are reduced together with the slice (scalar max / sum); for any other
+// axis each lane is an independent softmax (vector max / sum).
+
+layout(constant_id = 0) const int axis = 0;
+
+#define shape_constant_id_offset 1
+layout(constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout(constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout(constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout(constant_id = shape_constant_id_offset + 3) const int d = 0;
+layout(constant_id = shape_constant_id_offset + 4) const int c = 0;
+layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0;
+layout(constant_id = shape_constant_id_offset + 6) const int outcstep = 0;
+
+layout(binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
+layout(binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int d;
+    int c;
+    int cstep;
+    int outcstep;
+} p;
+
+// Per-invocation partials for the workgroup reductions: a scalar pair for the
+// axis-0 path and a vector pair for the per-lane path.
+// NOTE(review): capacity is fixed at 256 entries, so this shader must not be
+// launched with gl_WorkGroupSize.x > 256 - confirm against the host pipeline.
+shared afp smax1[256];
+shared afp ssum1[256];
+
+shared afpvec4 smax4[256];
+shared afpvec4 ssum4[256];
+
+// The four reduce_* helpers fold s[0 .. lsize) into s[0]. Every invocation of
+// the workgroup must call them (they execute barriers), after a barrier() has
+// published the partials. The live range is halved rounding up, so no element
+// is dropped when lsize is not a power of two; for power-of-two sizes the
+// pairing equals the classic halving tree.
+
+// scalar max, result in smax1[0]
+void reduce_max1(int lid, int lsize)
+{
+    for (int n = lsize; n > 1; n = (n + 1) >> 1)
+    {
+        int half_n = (n + 1) >> 1;
+        if (lid + half_n < n)
+            smax1[lid] = max(smax1[lid], smax1[lid + half_n]);
+        barrier();
+    }
+}
+
+// scalar sum, result in ssum1[0]
+void reduce_sum1(int lid, int lsize)
+{
+    for (int n = lsize; n > 1; n = (n + 1) >> 1)
+    {
+        int half_n = (n + 1) >> 1;
+        if (lid + half_n < n)
+            ssum1[lid] = ssum1[lid] + ssum1[lid + half_n];
+        barrier();
+    }
+}
+
+// per-lane max, result in smax4[0]
+void reduce_max4(int lid, int lsize)
+{
+    for (int n = lsize; n > 1; n = (n + 1) >> 1)
+    {
+        int half_n = (n + 1) >> 1;
+        if (lid + half_n < n)
+            smax4[lid] = max(smax4[lid], smax4[lid + half_n]);
+        barrier();
+    }
+}
+
+// per-lane sum, result in ssum4[0]
+void reduce_sum4(int lid, int lsize)
+{
+    for (int n = lsize; n > 1; n = (n + 1) >> 1)
+    {
+        int half_n = (n + 1) >> 1;
+        if (lid + half_n < n)
+            ssum4[lid] = ssum4[lid] + ssum4[lid + half_n];
+        barrier();
+    }
+}
+
+void main()
+{
+    int slice = int(gl_WorkGroupID.x);
+    int lid = int(gl_LocalInvocationID.x);
+    int lsize = int(gl_WorkGroupSize.x);
+
+    int dims_ = psc(dims);
+    int w_ = psc(w);
+    int h_ = psc(h);
+    int d_ = psc(d);
+    int c_ = psc(c);
+    int cstep_ = psc(cstep);
+    int outcstep_ = psc(outcstep);
+
+    // map a negative axis onto its non-negative equivalent
+    int pa = axis < 0 ? dims_ + axis : axis;
+
+    // Slice addressing: element i of this workgroup's slice is read from
+    // base_in + i * stride_in and written to base_out + i * stride_out.
+    int base_in = 0;
+    int base_out = 0;
+    int size = 0;
+    int stride_in = 1;
+    int stride_out = 1;
+
+    if (dims_ == 1)
+    {
+        // a single slice covering all w_ elements; the slice index is unused
+        base_in = 0;
+        base_out = 0;
+        size = w_;
+        stride_in = 1;
+        stride_out = 1;
+    }
+    else if (dims_ == 2)
+    {
+        if (pa == 0)
+        {
+            // slice = column x, stepping through the h_ rows
+            int x = slice;
+            base_in = x;
+            base_out = x;
+            size = h_;
+            stride_in = w_;
+            stride_out = w_;
+        }
+        else
+        {
+            // slice = row y, stepping along its w_ elements
+            int y = slice;
+            base_in = y * w_;
+            base_out = y * w_;
+            size = w_;
+            stride_in = 1;
+            stride_out = 1;
+        }
+    }
+    else if (dims_ == 3)
+    {
+        if (pa == 0)
+        {
+            // slice = offset inside a channel, stepping through the c_ channels
+            int xy = slice;
+            base_in = xy;
+            base_out = xy;
+            size = c_;
+            stride_in = cstep_;
+            stride_out = outcstep_;
+        }
+        else if (pa == 1)
+        {
+            // slice = (channel q, column x), stepping through the h_ rows
+            int q = slice / w_;
+            int x = slice - q * w_;
+            base_in = q * cstep_ + x;
+            base_out = q * outcstep_ + x;
+            size = h_;
+            stride_in = w_;
+            stride_out = w_;
+        }
+        else
+        {
+            // slice = (channel q, row y), stepping along its w_ elements
+            int q = slice / h_;
+            int y = slice - q * h_;
+            base_in = q * cstep_ + y * w_;
+            base_out = q * outcstep_ + y * w_;
+            size = w_;
+            stride_in = 1;
+            stride_out = 1;
+        }
+    }
+    else
+    {
+        // 4-D addressing (any dims_ value other than 1, 2, 3 lands here)
+        int plane = w_ * h_;
+
+        if (pa == 0)
+        {
+            // slice = offset inside a channel, stepping through the c_ channels
+            int xyd = slice;
+            base_in = xyd;
+            base_out = xyd;
+            size = c_;
+            stride_in = cstep_;
+            stride_out = outcstep_;
+        }
+        else if (pa == 1)
+        {
+            // slice = (channel q, in-plane offset xy), stepping through d_
+            int q = slice / plane;
+            int xy = slice - q * plane;
+            base_in = q * cstep_ + xy;
+            base_out = q * outcstep_ + xy;
+            size = d_;
+            stride_in = plane;
+            stride_out = plane;
+        }
+        else if (pa == 2)
+        {
+            // slice = (channel q, depth z, column x), stepping through h_ rows
+            int t = d_ * w_;
+            int q = slice / t;
+            int rem = slice - q * t;
+            int z = rem / w_;
+            int x = rem - z * w_;
+            base_in = q * cstep_ + z * plane + x;
+            base_out = q * outcstep_ + z * plane + x;
+            size = h_;
+            stride_in = w_;
+            stride_out = w_;
+        }
+        else
+        {
+            // slice = (channel q, depth z, row y), stepping along w_ elements
+            int t = d_ * h_;
+            int q = slice / t;
+            int rem = slice - q * t;
+            int z = rem / h_;
+            int y = rem - z * h_;
+            base_in = q * cstep_ + z * plane + y * w_;
+            base_out = q * outcstep_ + z * plane + y * w_;
+            size = w_;
+            stride_in = 1;
+            stride_out = 1;
+        }
+    }
+
+    if (pa == 0)
+    {
+        // lanes and slice elements are reduced together into scalars
+        afp lmax = afp(-3.402823466e38);
+        for (int i = lid; i < size; i += lsize)
+        {
+            afpvec4 v = buffer_ld4(bottom_blob_data, base_in + i * stride_in);
+            afp m0 = max(v.x, v.y);
+            afp m1 = max(v.z, v.w);
+            lmax = max(lmax, max(m0, m1));
+        }
+
+        smax1[lid] = lmax;
+        barrier();
+        reduce_max1(lid, lsize);
+        afp maxv = smax1[0];
+
+        afp lsum = afp(0.f);
+        for (int i = lid; i < size; i += lsize)
+        {
+            afpvec4 v = buffer_ld4(bottom_blob_data, base_in + i * stride_in);
+            afpvec4 e = exp(v - maxv);
+            lsum += e.x + e.y + e.z + e.w;
+        }
+
+        ssum1[lid] = lsum;
+        barrier();
+        reduce_sum1(lid, lsize);
+        afp invsum = afp(1.f) / ssum1[0];
+
+        for (int i = lid; i < size; i += lsize)
+        {
+            afpvec4 v = buffer_ld4(bottom_blob_data, base_in + i * stride_in);
+            afpvec4 y = exp(v - maxv) * invsum;
+            buffer_st4(top_blob_data, base_out + i * stride_out, y);
+        }
+    }
+    else
+    {
+        // each lane is its own softmax: max and sum stay 4-wide vectors
+        afpvec4 lmax4 = afpvec4(afp(-3.402823466e38));
+        for (int i = lid; i < size; i += lsize)
+        {
+            afpvec4 v = buffer_ld4(bottom_blob_data, base_in + i * stride_in);
+            lmax4 = max(lmax4, v);
+        }
+
+        smax4[lid] = lmax4;
+        barrier();
+        reduce_max4(lid, lsize);
+        afpvec4 maxv4 = smax4[0];
+
+        afpvec4 lsum4 = afpvec4(afp(0.f));
+        for (int i = lid; i < size; i += lsize)
+        {
+            afpvec4 v = buffer_ld4(bottom_blob_data, base_in + i * stride_in);
+            lsum4 += exp(v - maxv4);
+        }
+
+        ssum4[lid] = lsum4;
+        barrier();
+        reduce_sum4(lid, lsize);
+        afpvec4 invsum4 = afpvec4(afp(1.f)) / ssum4[0];
+
+        for (int i = lid; i < size; i += lsize)
+        {
+            afpvec4 v = buffer_ld4(bottom_blob_data, base_in + i * stride_in);
+            afpvec4 y = exp(v - maxv4) * invsum4;
+            buffer_st4(top_blob_data, base_out + i * stride_out, y);
+        }
+    }
+}
diff --git a/src/layer/vulkan/shader/softmax_reduce_max.comp b/src/layer/vulkan/shader/softmax_reduce_max.comp deleted file mode 100644 index 42dc549a958b..000000000000 --- a/src/layer/vulkan/shader/softmax_reduce_max.comp +++ /dev/null @@ -1,173 
+0,0 @@ -// Copyright 2019 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -layout(constant_id = 0) const int axis = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int d = 0; -layout(constant_id = shape_constant_id_offset + 4) const int c = 0; -layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; -layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; -layout(binding = 1) writeonly buffer max_workspace { sfp max_workspace_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int d; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outd; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) - return; - - int positive_axis = axis < 0 ? 
psc(dims) + axis : axis; - afp max_value = afp(-99999999.f); - - if (psc(dims) == 1) // positive_axis == 0 - { - for (int i = 0; i < psc(w); i++) - { - afp v = buffer_ld1(bottom_top_blob_data, i); - max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, 0, max_value); - return; - } - - if (psc(dims) == 2 && positive_axis == 0) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = i * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, gx, max_value); - return; - } - - if (psc(dims) == 2 && positive_axis == 1) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gx * psc(w) + i; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, gx, max_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 0) - { - for (int i = 0; i < psc(c); i++) - { - int v_offset = i * psc(cstep) + gy * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 1) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = gy * psc(cstep) + i * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 2) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gy * psc(cstep) + gx * psc(w) + i; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, gy * psc(h) + gx, max_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 0) - { - for (int i = 0; i < psc(c); i++) - { - int v_offset = i * psc(cstep) + gz * psc(h) * psc(w) + gy * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - 
max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 1) - { - for (int i = 0; i < psc(d); i++) - { - int v_offset = gz * psc(cstep) + i * psc(h) * psc(w) + gy * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 2) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = gz * psc(cstep) + gy * psc(h) * psc(w) + i * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 3) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gz * psc(cstep) + gy * psc(h) * psc(w) + gx * psc(w) + i; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st1(max_workspace_data, gz * psc(outcstep) + gy * psc(h) + gx, max_value); - return; - } -} diff --git a/src/layer/vulkan/shader/softmax_reduce_max_pack4.comp b/src/layer/vulkan/shader/softmax_reduce_max_pack4.comp deleted file mode 100644 index e1348b0e2156..000000000000 --- a/src/layer/vulkan/shader/softmax_reduce_max_pack4.comp +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2019 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -layout(constant_id = 0) const int axis = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int d = 0; -layout(constant_id = shape_constant_id_offset + 4) const int c = 0; 
-layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; -layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; -layout(binding = 1) writeonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int d; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outd; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) - return; - - int positive_axis = axis < 0 ? 
psc(dims) + axis : axis; - afpvec4 max_value = afpvec4(-99999999.f); - - if (psc(dims) == 1) // positive_axis == 0 - { - for (int i = 0; i < psc(w); i++) - { - afpvec4 v = buffer_ld4(bottom_top_blob_data, i); - max_value = max(max_value, v); - } - afpvec2 max2 = max(max_value.rg, max_value.ba); - max_value = afpvec4(max(max2.r, max2.g)); - buffer_st4(max_workspace_data, 0, max_value); - return; - } - - if (psc(dims) == 2 && positive_axis == 0) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = i * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - afpvec2 max2 = max(max_value.rg, max_value.ba); - max_value = afpvec4(max(max2.r, max2.g)); - buffer_st4(max_workspace_data, gx, max_value); - return; - } - - if (psc(dims) == 2 && positive_axis == 1) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gx * psc(w) + i; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st4(max_workspace_data, gx, max_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 0) - { - for (int i = 0; i < psc(c); i++) - { - int v_offset = i * psc(cstep) + gy * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - afpvec2 max2 = max(max_value.rg, max_value.ba); - max_value = afpvec4(max(max2.r, max2.g)); - buffer_st4(max_workspace_data, gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 1) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = gy * psc(cstep) + i * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st4(max_workspace_data, gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 2) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gy * psc(cstep) + gx * psc(w) + i; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - max_value = 
max(max_value, v); - } - buffer_st4(max_workspace_data, gy * psc(h) + gx, max_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 0) - { - for (int i = 0; i < psc(c); i++) - { - int v_offset = i * psc(cstep) + gz * psc(h) * psc(w) + gy * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - afpvec2 max2 = max(max_value.rg, max_value.ba); - max_value = afpvec4(max(max2.r, max2.g)); - buffer_st4(max_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 1) - { - for (int i = 0; i < psc(d); i++) - { - int v_offset = gz * psc(cstep) + i * psc(h) * psc(w) + gy * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st4(max_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 2) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = gz * psc(cstep) + gy * psc(h) * psc(w) + i * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st4(max_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, max_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 3) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gz * psc(cstep) + gy * psc(h) * psc(w) + gx * psc(w) + i; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - max_value = max(max_value, v); - } - buffer_st4(max_workspace_data, gz * psc(outcstep) + gy * psc(h) + gx, max_value); - return; - } -} diff --git a/src/layer/vulkan/shader/softmax_reduce_sum.comp b/src/layer/vulkan/shader/softmax_reduce_sum.comp deleted file mode 100644 index d27dd80dcdfc..000000000000 --- a/src/layer/vulkan/shader/softmax_reduce_sum.comp +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright 2019 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -layout(constant_id = 0) const 
int axis = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int d = 0; -layout(constant_id = shape_constant_id_offset + 4) const int c = 0; -layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; -layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; -layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; -layout(binding = 1) writeonly buffer sum_workspace { sfp sum_workspace_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int d; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outd; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc)) - return; - - int positive_axis = axis < 0 ? 
psc(dims) + axis : axis; - afp sum_value = afp(0.f); - - if (psc(dims) == 1) // positive_axis == 0 - { - for (int i = 0; i < psc(w); i++) - { - afp v = buffer_ld1(bottom_top_blob_data, i); - sum_value += v; - } - buffer_st1(sum_workspace_data, 0, sum_value); - return; - } - - if (psc(dims) == 2 && positive_axis == 0) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = i * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st1(sum_workspace_data, gx, sum_value); - return; - } - - if (psc(dims) == 2 && positive_axis == 1) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gx * psc(w) + i; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st1(sum_workspace_data, gx, sum_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 0) - { - for (int i = 0; i < psc(c); i++) - { - int v_offset = i * psc(cstep) + gy * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st1(sum_workspace_data, gy * psc(w) + gx, sum_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 1) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = gy * psc(cstep) + i * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st1(sum_workspace_data, gy * psc(w) + gx, sum_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 2) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gy * psc(cstep) + gx * psc(w) + i; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st1(sum_workspace_data, gy * psc(h) + gx, sum_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 0) - { - for (int i = 0; i < psc(c); i++) - { - int v_offset = i * psc(cstep) + gz * psc(h) * psc(w) + gy * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st1(sum_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, 
sum_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 1) - { - for (int i = 0; i < psc(d); i++) - { - int v_offset = gz * psc(cstep) + i * psc(h) * psc(w) + gy * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st1(sum_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, sum_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 2) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = gz * psc(cstep) + gy * psc(h) * psc(w) + i * psc(w) + gx; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st1(sum_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, sum_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 3) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gz * psc(cstep) + gy * psc(h) * psc(w) + gx * psc(w) + i; - afp v = buffer_ld1(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st1(sum_workspace_data, gz * psc(outcstep) + gy * psc(h) + gx, sum_value); - return; - } -} diff --git a/src/layer/vulkan/shader/softmax_reduce_sum_pack4.comp b/src/layer/vulkan/shader/softmax_reduce_sum_pack4.comp deleted file mode 100644 index 7b0e6fe2e2e8..000000000000 --- a/src/layer/vulkan/shader/softmax_reduce_sum_pack4.comp +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2019 Tencent -// SPDX-License-Identifier: BSD-3-Clause - -#version 450 - -layout(constant_id = 0) const int axis = 0; - -#define shape_constant_id_offset 1 -layout(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout(constant_id = shape_constant_id_offset + 1) const int w = 0; -layout(constant_id = shape_constant_id_offset + 2) const int h = 0; -layout(constant_id = shape_constant_id_offset + 3) const int d = 0; -layout(constant_id = shape_constant_id_offset + 4) const int c = 0; -layout(constant_id = shape_constant_id_offset + 5) const int cstep = 0; - -layout(constant_id = shape_constant_id_offset + 6) const int outdims = 0; 
-layout(constant_id = shape_constant_id_offset + 7) const int outw = 0; -layout(constant_id = shape_constant_id_offset + 8) const int outh = 0; -layout(constant_id = shape_constant_id_offset + 9) const int outd = 0; -layout(constant_id = shape_constant_id_offset + 10) const int outc = 0; -layout(constant_id = shape_constant_id_offset + 11) const int outcstep = 0; - -layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; -layout(binding = 1) writeonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; - -layout(push_constant) uniform parameter -{ - int dims; - int w; - int h; - int d; - int c; - int cstep; - - int outdims; - int outw; - int outh; - int outd; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) - return; - - int positive_axis = axis < 0 ? psc(dims) + axis : axis; - afpvec4 sum_value = afpvec4(0.f); - - if (psc(dims) == 1) // positive_axis == 0 - { - for (int i = 0; i < psc(w); i++) - { - afpvec4 v = buffer_ld4(bottom_top_blob_data, i); - sum_value += v; - } - afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_value = afpvec4(sum2.r + sum2.g); - buffer_st4(sum_workspace_data, 0, sum_value); - return; - } - - if (psc(dims) == 2 && positive_axis == 0) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = i * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - sum_value += v; - } - afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_value = afpvec4(sum2.r + sum2.g); - buffer_st4(sum_workspace_data, gx, sum_value); - return; - } - - if (psc(dims) == 2 && positive_axis == 1) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gx * psc(w) + i; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st4(sum_workspace_data, gx, sum_value); - return; - } - - if (psc(dims) == 3 
&& positive_axis == 0) - { - for (int i = 0; i < psc(c); i++) - { - int v_offset = i * psc(cstep) + gy * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - sum_value += v; - } - afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_value = afpvec4(sum2.r + sum2.g); - buffer_st4(sum_workspace_data, gy * psc(w) + gx, sum_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 1) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = gy * psc(cstep) + i * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st4(sum_workspace_data, gy * psc(w) + gx, sum_value); - return; - } - - if (psc(dims) == 3 && positive_axis == 2) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gy * psc(cstep) + gx * psc(w) + i; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st4(sum_workspace_data, gy * psc(h) + gx, sum_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 0) - { - for (int i = 0; i < psc(c); i++) - { - int v_offset = i * psc(cstep) + gz * psc(h) * psc(w) + gy * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - sum_value += v; - } - afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_value = afpvec4(sum2.r + sum2.g); - buffer_st4(sum_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, sum_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 1) - { - for (int i = 0; i < psc(d); i++) - { - int v_offset = gz * psc(cstep) + i * psc(h) * psc(w) + gy * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st4(sum_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, sum_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 2) - { - for (int i = 0; i < psc(h); i++) - { - int v_offset = gz * psc(cstep) + gy * psc(h) * psc(w) + i * psc(w) + gx; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - sum_value += v; - } - 
buffer_st4(sum_workspace_data, gz * psc(outcstep) + gy * psc(w) + gx, sum_value); - return; - } - - if (psc(dims) == 4 && positive_axis == 3) - { - for (int i = 0; i < psc(w); i++) - { - int v_offset = gz * psc(cstep) + gy * psc(h) * psc(w) + gx * psc(w) + i; - afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); - sum_value += v; - } - buffer_st4(sum_workspace_data, gz * psc(outcstep) + gy * psc(h) + gx, sum_value); - return; - } -} diff --git a/src/layer/vulkan/softmax_vulkan.cpp b/src/layer/vulkan/softmax_vulkan.cpp index b63f13a2328b..af0ae8cc8462 100644 --- a/src/layer/vulkan/softmax_vulkan.cpp +++ b/src/layer/vulkan/softmax_vulkan.cpp @@ -1,4 +1,4 @@ -// Copyright 2019 Tencent +// Copyright 2026 Futz12 // SPDX-License-Identifier: BSD-3-Clause #include "softmax_vulkan.h" @@ -10,23 +10,18 @@ namespace ncnn { Softmax_vulkan::Softmax_vulkan() { support_vulkan = true; - support_vulkan_packing = true; - pipeline_softmax_reduce_max = 0; - pipeline_softmax_exp_sub_max = 0; - pipeline_softmax_reduce_sum = 0; - pipeline_softmax_div_sum = 0; + support_vulkan_packing = true; - pipeline_softmax_reduce_max_pack4 = 0; - pipeline_softmax_exp_sub_max_pack4 = 0; - pipeline_softmax_reduce_sum_pack4 = 0; - pipeline_softmax_div_sum_pack4 = 0; + pipeline_softmax = 0; + pipeline_softmax_pack4 = 0; } -int Softmax_vulkan::create_pipeline(const Option& opt) +int Softmax_vulkan::create_pipeline(const Option& _opt) { - const Mat& shape = top_shapes.empty() ? Mat() : top_shapes[0]; - int positive_axis = axis < 0 ? shape.dims + axis : axis; + Option opt = _opt; + + const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; int elempack = 1; if (shape.dims == 1) elempack = shape.w % 4 == 0 ? 
4 : 1; @@ -49,159 +44,28 @@ int Softmax_vulkan::create_pipeline(const Option& opt) if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); if (shape.dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); - Mat workspace_shape_packed; - if (shape.dims == 1) // positive_axis == 0 - { - workspace_shape_packed = Mat(1, (void*)0, elemsize, elempack); - } - else if (shape.dims == 2 && positive_axis == 0) - { - workspace_shape_packed = Mat(shape.w, (void*)0, elemsize, elempack); - } - else if (shape.dims == 2 && positive_axis == 1) - { - workspace_shape_packed = Mat(shape.h / elempack, (void*)0, elemsize, elempack); - } - else if (shape.dims == 3 && positive_axis == 0) - { - workspace_shape_packed = Mat(shape.w, shape.h, (void*)0, elemsize, elempack); - } - else if (shape.dims == 3 && positive_axis == 1) - { - workspace_shape_packed = Mat(shape.w, shape.c / elempack, (void*)0, elemsize, elempack); - } - else if (shape.dims == 3 && positive_axis == 2) - { - workspace_shape_packed = Mat(shape.h, shape.c / elempack, (void*)0, elemsize, elempack); - } - else if (shape.dims == 4 && positive_axis == 0) - { - workspace_shape_packed = Mat(shape.w, shape.h, shape.d, (void*)0, elemsize, elempack); - } - else if (shape.dims == 4 && positive_axis == 1) - { - workspace_shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); - } - else if (shape.dims == 4 && positive_axis == 2) - { - workspace_shape_packed = Mat(shape.w, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); - } - else if (shape.dims == 4 && positive_axis == 3) - { - workspace_shape_packed = Mat(shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); - } - - std::vector specializations(1 + 12); + std::vector specializations(1 + 7); specializations[0].i = axis; specializations[1 + 0].i = shape_packed.dims; specializations[1 + 1].i = shape_packed.w; specializations[1 
+ 2].i = shape_packed.h; specializations[1 + 3].i = shape_packed.d; specializations[1 + 4].i = shape_packed.c; - specializations[1 + 5].i = shape_packed.cstep; - specializations[1 + 6].i = workspace_shape_packed.dims; - specializations[1 + 7].i = workspace_shape_packed.w; - specializations[1 + 8].i = workspace_shape_packed.h; - specializations[1 + 9].i = workspace_shape_packed.d; - specializations[1 + 10].i = workspace_shape_packed.c; - specializations[1 + 11].i = workspace_shape_packed.cstep; + specializations[1 + 5].i = 0; + specializations[1 + 6].i = 0; + if (shape.dims == 0 || elempack == 1) { - Mat local_size_xyz; - if (workspace_shape_packed.dims == 1) - { - local_size_xyz.w = std::min(64, workspace_shape_packed.w); - local_size_xyz.h = 1; - local_size_xyz.c = 1; - } - if (workspace_shape_packed.dims == 2) - { - local_size_xyz.w = std::min(8, workspace_shape_packed.w); - local_size_xyz.h = std::min(8, workspace_shape_packed.h); - local_size_xyz.c = 1; - } - if (workspace_shape_packed.dims != 0) - { - local_size_xyz.w = std::min(4, workspace_shape_packed.w); - local_size_xyz.h = std::min(4, workspace_shape_packed.h * workspace_shape_packed.d); - local_size_xyz.c = std::min(4, workspace_shape_packed.c); - } - - // pack1 - { - pipeline_softmax_reduce_max = new Pipeline(vkdev); - pipeline_softmax_reduce_sum = new Pipeline(vkdev); - - pipeline_softmax_reduce_max->set_optimal_local_size_xyz(local_size_xyz); - pipeline_softmax_reduce_sum->set_optimal_local_size_xyz(local_size_xyz); - - pipeline_softmax_reduce_max->create(LayerShaderType::softmax_reduce_max, opt, specializations); - pipeline_softmax_reduce_sum->create(LayerShaderType::softmax_reduce_sum, opt, specializations); - } - - // pack4 - { - pipeline_softmax_reduce_max_pack4 = new Pipeline(vkdev); - pipeline_softmax_reduce_sum_pack4 = new Pipeline(vkdev); - - pipeline_softmax_reduce_max_pack4->set_optimal_local_size_xyz(local_size_xyz); - 
pipeline_softmax_reduce_sum_pack4->set_optimal_local_size_xyz(local_size_xyz); - - pipeline_softmax_reduce_max_pack4->create(LayerShaderType::softmax_reduce_max_pack4, opt, specializations); - pipeline_softmax_reduce_sum_pack4->create(LayerShaderType::softmax_reduce_sum_pack4, opt, specializations); - } + pipeline_softmax = new Pipeline(vkdev); + pipeline_softmax->set_local_size_xyz(256, 1, 1); + pipeline_softmax->create(LayerShaderType::softmax, opt, specializations); } + if (shape.dims == 0 || elempack == 4) { - Mat local_size_xyz; - if (shape_packed.dims == 1) - { - local_size_xyz.w = std::min(64, shape_packed.w); - local_size_xyz.h = 1; - local_size_xyz.c = 1; - } - if (shape_packed.dims == 2) - { - local_size_xyz.w = std::min(8, shape_packed.w); - local_size_xyz.h = std::min(8, shape_packed.h); - local_size_xyz.c = 1; - } - if (shape_packed.dims == 3) - { - local_size_xyz.w = std::min(4, shape_packed.w); - local_size_xyz.h = std::min(4, shape_packed.h); - local_size_xyz.c = std::min(4, shape_packed.c); - } - if (shape_packed.dims == 4) - { - local_size_xyz.w = std::min(4, shape_packed.w); - local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d); - local_size_xyz.c = std::min(4, shape_packed.c); - } - - // pack1 - { - pipeline_softmax_exp_sub_max = new Pipeline(vkdev); - pipeline_softmax_div_sum = new Pipeline(vkdev); - - pipeline_softmax_exp_sub_max->set_optimal_local_size_xyz(local_size_xyz); - pipeline_softmax_div_sum->set_optimal_local_size_xyz(local_size_xyz); - - pipeline_softmax_exp_sub_max->create(LayerShaderType::softmax_exp_sub_max, opt, specializations); - pipeline_softmax_div_sum->create(LayerShaderType::softmax_div_sum, opt, specializations); - } - - // pack4 - { - pipeline_softmax_exp_sub_max_pack4 = new Pipeline(vkdev); - pipeline_softmax_div_sum_pack4 = new Pipeline(vkdev); - - pipeline_softmax_exp_sub_max_pack4->set_optimal_local_size_xyz(local_size_xyz); - pipeline_softmax_div_sum_pack4->set_optimal_local_size_xyz(local_size_xyz); - - 
pipeline_softmax_exp_sub_max_pack4->create(LayerShaderType::softmax_exp_sub_max_pack4, opt, specializations); - pipeline_softmax_div_sum_pack4->create(LayerShaderType::softmax_div_sum_pack4, opt, specializations); - } + pipeline_softmax_pack4 = new Pipeline(vkdev); + pipeline_softmax_pack4->set_local_size_xyz(256, 1, 1); + pipeline_softmax_pack4->create(LayerShaderType::softmax_pack4, opt, specializations); } return 0; @@ -209,198 +73,94 @@ int Softmax_vulkan::create_pipeline(const Option& opt) int Softmax_vulkan::destroy_pipeline(const Option& /*opt*/) { - delete pipeline_softmax_reduce_max; - pipeline_softmax_reduce_max = 0; - - delete pipeline_softmax_exp_sub_max; - pipeline_softmax_exp_sub_max = 0; - - delete pipeline_softmax_reduce_sum; - pipeline_softmax_reduce_sum = 0; + delete pipeline_softmax; + pipeline_softmax = 0; - delete pipeline_softmax_div_sum; - pipeline_softmax_div_sum = 0; - - delete pipeline_softmax_reduce_max_pack4; - pipeline_softmax_reduce_max_pack4 = 0; - - delete pipeline_softmax_exp_sub_max_pack4; - pipeline_softmax_exp_sub_max_pack4 = 0; - - delete pipeline_softmax_reduce_sum_pack4; - pipeline_softmax_reduce_sum_pack4 = 0; - - delete pipeline_softmax_div_sum_pack4; - pipeline_softmax_div_sum_pack4 = 0; + delete pipeline_softmax_pack4; + pipeline_softmax_pack4 = 0; return 0; } int Softmax_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const { - int dims = bottom_top_blob.dims; - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - size_t elemsize = bottom_top_blob.elemsize; - int elempack = bottom_top_blob.elempack; - int positive_axis = axis < 0 ? 
dims + axis : axis; + const VkMat& bottom_blob = bottom_top_blob; + + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int d = bottom_blob.d; + const int c = bottom_blob.c; + + VkMat top_blob; + if (dims == 1) + top_blob.create(w, bottom_blob.elemsize, bottom_blob.elempack, opt.blob_vkallocator); + else if (dims == 2) + top_blob.create(w, h, bottom_blob.elemsize, bottom_blob.elempack, opt.blob_vkallocator); + else if (dims == 3) + top_blob.create(w, h, c, bottom_blob.elemsize, bottom_blob.elempack, opt.blob_vkallocator); + else + top_blob.create(w, h, d, c, bottom_blob.elemsize, bottom_blob.elempack, opt.blob_vkallocator); - VkMat max_workspace; - VkMat sum_workspace; + if (top_blob.empty()) + return -100; - if (dims == 1) // positive_axis == 0 - { - max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); - } - else if (dims == 2 && positive_axis == 0) - { - max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); - } - else if (dims == 2 && positive_axis == 1) - { - max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); - } - else if (dims == 3 && positive_axis == 0) - { - max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); - } - else if (dims == 3 && positive_axis == 1) - { - max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); - } - else if (dims == 3 && positive_axis == 2) - { - max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(h, channels, elemsize, elempack, 
opt.workspace_vkallocator); - } - else if (dims == 4 && positive_axis == 0) - { - max_workspace.create(w, h, d, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(w, h, d, elemsize, elempack, opt.workspace_vkallocator); - } - else if (dims == 4 && positive_axis == 1) + const int pa = axis < 0 ? dims + axis : axis; + + int slices = 1; + if (dims == 1) { - max_workspace.create(w, h, channels, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(w, h, channels, elemsize, elempack, opt.workspace_vkallocator); + slices = 1; } - else if (dims == 4 && positive_axis == 2) + else if (dims == 2) { - max_workspace.create(w, d, channels, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(w, d, channels, elemsize, elempack, opt.workspace_vkallocator); + slices = pa == 0 ? w : h; } - else if (dims == 4 && positive_axis == 3) + else if (dims == 3) { - max_workspace.create(h, d, channels, elemsize, elempack, opt.workspace_vkallocator); - sum_workspace.create(h, d, channels, elemsize, elempack, opt.workspace_vkallocator); + if (pa == 0) + slices = w * h; + else if (pa == 1) + slices = c * w; + else + slices = c * h; } - - // reduce max + else { - std::vector bindings(2); - bindings[0] = bottom_top_blob; - bindings[1] = max_workspace; - - std::vector constants(12); - constants[0].i = bottom_top_blob.dims; - constants[1].i = bottom_top_blob.w; - constants[2].i = bottom_top_blob.h; - constants[3].i = bottom_top_blob.d; - constants[4].i = bottom_top_blob.c; - constants[5].i = bottom_top_blob.cstep; - constants[6].i = max_workspace.dims; - constants[7].i = max_workspace.w; - constants[8].i = max_workspace.h; - constants[9].i = max_workspace.d; - constants[10].i = max_workspace.c; - constants[11].i = max_workspace.cstep; - - const Pipeline* pipeline = elempack == 4 ? 
pipeline_softmax_reduce_max_pack4 : pipeline_softmax_reduce_max; - - cmd.record_pipeline(pipeline, bindings, constants, max_workspace); + const int plane = w * h; + if (pa == 0) + slices = plane * d; + else if (pa == 1) + slices = c * plane; + else if (pa == 2) + slices = c * d * w; + else + slices = c * d * h; } - // exp( v - max ) - { - std::vector bindings(2); - bindings[0] = bottom_top_blob; - bindings[1] = max_workspace; + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; - std::vector constants(12); - constants[0].i = bottom_top_blob.dims; - constants[1].i = bottom_top_blob.w; - constants[2].i = bottom_top_blob.h; - constants[3].i = bottom_top_blob.d; - constants[4].i = bottom_top_blob.c; - constants[5].i = bottom_top_blob.cstep; - constants[6].i = max_workspace.dims; - constants[7].i = max_workspace.w; - constants[8].i = max_workspace.h; - constants[9].i = max_workspace.d; - constants[10].i = max_workspace.c; - constants[11].i = max_workspace.cstep; + std::vector constants(7); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.d; + constants[4].i = bottom_blob.c; + constants[5].i = bottom_blob.cstep; + constants[6].i = top_blob.cstep; - const Pipeline* pipeline = elempack == 4 ? pipeline_softmax_exp_sub_max_pack4 : pipeline_softmax_exp_sub_max; + const Pipeline* pipeline = bottom_blob.elempack == 4 ? 
pipeline_softmax_pack4 : pipeline_softmax; - cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); - } - - // reduce sum - { - std::vector bindings(2); - bindings[0] = bottom_top_blob; - bindings[1] = sum_workspace; + VkMat dispatcher; + dispatcher.w = slices * 256; + dispatcher.h = 1; + dispatcher.c = 1; - std::vector constants(12); - constants[0].i = bottom_top_blob.dims; - constants[1].i = bottom_top_blob.w; - constants[2].i = bottom_top_blob.h; - constants[3].i = bottom_top_blob.d; - constants[4].i = bottom_top_blob.c; - constants[5].i = bottom_top_blob.cstep; - constants[6].i = sum_workspace.dims; - constants[7].i = sum_workspace.w; - constants[8].i = sum_workspace.h; - constants[9].i = sum_workspace.d; - constants[10].i = sum_workspace.c; - constants[11].i = sum_workspace.cstep; - - const Pipeline* pipeline = elempack == 4 ? pipeline_softmax_reduce_sum_pack4 : pipeline_softmax_reduce_sum; - - cmd.record_pipeline(pipeline, bindings, constants, sum_workspace); - } - - // div sum - { - std::vector bindings(2); - bindings[0] = bottom_top_blob; - bindings[1] = sum_workspace; - - std::vector constants(12); - constants[0].i = bottom_top_blob.dims; - constants[1].i = bottom_top_blob.w; - constants[2].i = bottom_top_blob.h; - constants[3].i = bottom_top_blob.d; - constants[4].i = bottom_top_blob.c; - constants[5].i = bottom_top_blob.cstep; - constants[6].i = sum_workspace.dims; - constants[7].i = sum_workspace.w; - constants[8].i = sum_workspace.h; - constants[9].i = sum_workspace.d; - constants[10].i = sum_workspace.c; - constants[11].i = sum_workspace.cstep; - - const Pipeline* pipeline = elempack == 4 ? 
pipeline_softmax_div_sum_pack4 : pipeline_softmax_div_sum; - - cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); - } + cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + bottom_top_blob = top_blob; return 0; } diff --git a/src/layer/vulkan/softmax_vulkan.h b/src/layer/vulkan/softmax_vulkan.h index 764adb62bbf4..f7fab241b14e 100644 --- a/src/layer/vulkan/softmax_vulkan.h +++ b/src/layer/vulkan/softmax_vulkan.h @@ -1,4 +1,4 @@ -// Copyright 2019 Tencent +// Copyright 2026 Futz12 // SPDX-License-Identifier: BSD-3-Clause #ifndef LAYER_SOFTMAX_VULKAN_H @@ -20,15 +20,8 @@ class Softmax_vulkan : public Softmax virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; public: - Pipeline* pipeline_softmax_reduce_max; - Pipeline* pipeline_softmax_exp_sub_max; - Pipeline* pipeline_softmax_reduce_sum; - Pipeline* pipeline_softmax_div_sum; - - Pipeline* pipeline_softmax_reduce_max_pack4; - Pipeline* pipeline_softmax_exp_sub_max_pack4; - Pipeline* pipeline_softmax_reduce_sum_pack4; - Pipeline* pipeline_softmax_div_sum_pack4; + Pipeline* pipeline_softmax; + Pipeline* pipeline_softmax_pack4; }; } // namespace ncnn