diff --git a/src/layer/vulkan/gru_vulkan.cpp b/src/layer/vulkan/gru_vulkan.cpp
new file mode 100644
index 000000000000..28db9be06475
--- /dev/null
+++ b/src/layer/vulkan/gru_vulkan.cpp
@@ -0,0 +1,467 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "gru_vulkan.h"
+
+#include <utility> // for std::swap
+
+#include "layer_shader_type.h"
+
+namespace ncnn {
+
+GRU_vulkan::GRU_vulkan()
+{
+    support_vulkan = true;
+    support_vulkan_packing = false;
+    support_vulkan_any_packing = false;
+
+    pipeline_gru_step = 0;
+    pipeline_gru_step_pack4 = 0;
+    pipeline_gru_copy = 0;
+}
+
+int GRU_vulkan::load_param(const ParamDict& pd)
+{
+    int ret = GRU::load_param(pd);
+
+    if (int8_scale_term)
+    {
+        support_vulkan = false;
+    }
+
+    return ret;
+}
+
+int GRU_vulkan::create_pipeline(const Option& opt)
+{
+    if (!support_vulkan)
+        return 0;
+
+    {
+        pipeline_gru_step = new Pipeline(vkdev);
+        pipeline_gru_step->set_local_size_xyz(64, 1, 1);
+
+        std::vector<vk_specialization_type> specializations;
+        pipeline_gru_step->create(LayerShaderType::gru_step, opt, specializations);
+    }
+
+    if (num_output % 4 == 0)
+    {
+        pipeline_gru_step_pack4 = new Pipeline(vkdev);
+        pipeline_gru_step_pack4->set_local_size_xyz(64, 1, 1);
+
+        std::vector<vk_specialization_type> specializations;
+        pipeline_gru_step_pack4->create(LayerShaderType::gru_step_pack4, opt, specializations);
+    }
+
+    {
+        pipeline_gru_copy = new Pipeline(vkdev);
+        pipeline_gru_copy->set_local_size_xyz(64, 1, 1);
+
+        std::vector<vk_specialization_type> specializations;
+        pipeline_gru_copy->create(LayerShaderType::gru_copy, opt, specializations);
+    }
+
+    return 0;
+}
+
+int GRU_vulkan::destroy_pipeline(const Option& /*opt*/)
+{
+    delete pipeline_gru_step;
+    pipeline_gru_step = 0;
+
+    delete pipeline_gru_step_pack4;
+    pipeline_gru_step_pack4 = 0;
+
+    delete pipeline_gru_copy;
+    pipeline_gru_copy = 0;
+
+    return 0;
+}
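+
+// Repack the flat weight/bias mats into the vec4-interleaved layout consumed by
+// gru_step_pack4: four consecutive output neurons of one gate share a row, with
+// their coefficients interleaved element-by-element so the shader can fetch all
+// four lanes of a gate with a single buffer_ld4 per input element. Rows are
+// gate-major per direction: all R rows, then U rows, then N rows.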
+
+static void pack_gru_weights_bias_pack4(const Mat& weight_xc_data,
+                                        const Mat& bias_c_data,
+                                        const Mat& weight_hc_data,
+                                        Mat& weight_xc_data_pack4,
+                                        Mat& bias_c_data_pack4,
+                                        Mat& weight_hc_data_pack4,
+                                        int size,
+                                        int num_output,
+                                        int num_directions)
+{
+    const int num_output_pack = num_output / 4;
+
+    weight_xc_data_pack4.create(size, num_directions * 3 * num_output_pack, (size_t)16u, 4);
+    weight_hc_data_pack4.create(num_output, num_directions * 3 * num_output_pack, (size_t)16u, 4);
+    bias_c_data_pack4.create(num_output_pack, num_directions * 4, (size_t)16u, 4);
+
+    const float* wxc_ptr = weight_xc_data;
+    const float* whc_ptr = weight_hc_data;
+    const float* bias_ptr = bias_c_data;
+
+    for (int dir = 0; dir < num_directions; dir++)
+    {
+        for (int gate = 0; gate < 3; gate++)
+        {
+            for (int q_pack = 0; q_pack < num_output_pack; q_pack++)
+            {
+                float* wxc_row = weight_xc_data_pack4.row(dir * 3 * num_output_pack + gate * num_output_pack + q_pack);
+                float* whc_row = weight_hc_data_pack4.row(dir * 3 * num_output_pack + gate * num_output_pack + q_pack);
+
+                for (int i = 0; i < size; i++)
+                {
+                    const int src_base = (dir * 3 * num_output + gate * num_output + q_pack * 4) * size + i;
+
+                    wxc_row[i * 4 + 0] = wxc_ptr[src_base + 0 * size];
+                    wxc_row[i * 4 + 1] = wxc_ptr[src_base + 1 * size];
+                    wxc_row[i * 4 + 2] = wxc_ptr[src_base + 2 * size];
+                    wxc_row[i * 4 + 3] = wxc_ptr[src_base + 3 * size];
+                }
+
+                for (int i = 0; i < num_output; i++)
+                {
+                    const int src_base = (dir * 3 * num_output + gate * num_output + q_pack * 4) * num_output + i;
+
+                    whc_row[i * 4 + 0] = whc_ptr[src_base + 0 * num_output];
+                    whc_row[i * 4 + 1] = whc_ptr[src_base + 1 * num_output];
+                    whc_row[i * 4 + 2] = whc_ptr[src_base + 2 * num_output];
+                    whc_row[i * 4 + 3] = whc_ptr[src_base + 3 * num_output];
+                }
+            }
+        }
+
+        for (int b = 0; b < 4; b++)
+        {
+            float* bias_row = bias_c_data_pack4.row(dir * 4 + b);
+
+            for (int q_pack = 0; q_pack < num_output_pack; q_pack++)
+            {
+                const int q0 = q_pack * 4;
+                const int src_base = dir * (num_output * 4) + b * num_output + q0;
+
+                bias_row[q_pack * 4 + 0] = bias_ptr[src_base + 0];
+                bias_row[q_pack * 4 + 1] = bias_ptr[src_base + 1];
+                bias_row[q_pack * 4 + 2] = bias_ptr[src_base + 2];
+                bias_row[q_pack * 4 + 3] = bias_ptr[src_base + 3];
+            }
+        }
+    }
+}
+
+int GRU_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
+{
+    if (!support_vulkan)
+        return 0;
+
+    cmd.record_upload(weight_xc_data, weight_xc_data_gpu, opt);
+    cmd.record_upload(bias_c_data, bias_c_data_gpu, opt);
+    cmd.record_upload(weight_hc_data, weight_hc_data_gpu, opt);
+
+    if (num_output % 4 == 0)
+    {
+        const int size = weight_xc_data.w;
+        const int num_directions = direction == 2 ? 2 : 1;
+
+        Mat weight_xc_data_pack4;
+        Mat bias_c_data_pack4;
+        Mat weight_hc_data_pack4;
+
+        pack_gru_weights_bias_pack4(weight_xc_data, bias_c_data, weight_hc_data,
+                                    weight_xc_data_pack4, bias_c_data_pack4, weight_hc_data_pack4,
+                                    size, num_output, num_directions);
+
+        cmd.record_upload(weight_xc_data_pack4, weight_xc_data_gpu_pack4, opt);
+        cmd.record_upload(bias_c_data_pack4, bias_c_data_gpu_pack4, opt);
+        cmd.record_upload(weight_hc_data_pack4, weight_hc_data_gpu_pack4, opt);
+    }
+
+    if (opt.lightmode)
+    {
+        weight_xc_data.release();
+        bias_c_data.release();
+        weight_hc_data.release();
+    }
+
+    return 0;
+}
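+
+// Dispatch helpers. Each one binds the blobs, fills the push constants in the
+// exact order the matching .comp shader declares them, and dispatches the given
+// number of invocations along x (len elements, or one per output neuron / per
+// group of four neurons for the step shaders).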
+
+static inline void record_gru_copy(const Pipeline* pipeline,
+                                   VkCompute& cmd,
+                                   const VkMat& src,
+                                   VkMat& dst,
+                                   int len,
+                                   int src_offset,
+                                   int dst_offset,
+                                   int mode)
+{
+    std::vector<VkMat> bindings(2);
+    bindings[0] = src;
+    bindings[1] = dst;
+
+    std::vector<vk_constant_type> constants(4);
+    constants[0].i = len;
+    constants[1].i = src_offset;
+    constants[2].i = dst_offset;
+    constants[3].i = mode;
+
+    VkMat dispatcher;
+    dispatcher.w = len;
+    dispatcher.h = 1;
+    dispatcher.c = 1;
+
+    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+}
+
+static inline void record_gru_step_pack1(const Pipeline* pipeline,
+                                         VkCompute& cmd,
+                                         const VkMat& bottom_blob,
+                                         const VkMat& weight_xc,
+                                         const VkMat& bias_c,
+                                         const VkMat& weight_hc,
+                                         const VkMat& hidden_prev,
+                                         VkMat& hidden_next,
+                                         VkMat& top_blob,
+                                         int size,
+                                         int num_output,
+                                         int ti,
+                                         int outw,
+                                         int out_offset,
+                                         int dir,
+                                         int wxc_dir_stride,
+                                         int whc_dir_stride,
+                                         int bias_dir_stride,
+                                         int bottom_step)
+{
+    std::vector<VkMat> bindings(7);
+    bindings[0] = bottom_blob;
+    bindings[1] = weight_xc;
+    bindings[2] = bias_c;
+    bindings[3] = weight_hc;
+    bindings[4] = hidden_prev;
+    bindings[5] = hidden_next;
+    bindings[6] = top_blob;
+
+    std::vector<vk_constant_type> constants(10);
+    constants[0].i = size;
+    constants[1].i = num_output;
+    constants[2].i = ti;
+    constants[3].i = outw;
+    constants[4].i = out_offset;
+    constants[5].i = dir;
+    constants[6].i = wxc_dir_stride;
+    constants[7].i = whc_dir_stride;
+    constants[8].i = bias_dir_stride;
+    constants[9].i = bottom_step;
+
+    VkMat dispatcher;
+    dispatcher.w = num_output;
+    dispatcher.h = 1;
+    dispatcher.c = 1;
+
+    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+}
+
+static inline void record_gru_step_pack4(const Pipeline* pipeline,
+                                         VkCompute& cmd,
+                                         const VkMat& bottom_blob,
+                                         const VkMat& weight_xc_pack4,
+                                         const VkMat& bias_c_pack4,
+                                         const VkMat& weight_hc_pack4,
+                                         const VkMat& hidden_prev,
+                                         VkMat& hidden_next,
+                                         VkMat& top_blob,
+                                         int size,
+                                         int num_output,
+                                         int ti,
+                                         int outw,
+                                         int out_offset,
+                                         int dir)
+{
+    std::vector<VkMat> bindings(7);
+    bindings[0] = bottom_blob;
+    bindings[1] = weight_xc_pack4;
+    bindings[2] = bias_c_pack4;
+    bindings[3] = weight_hc_pack4;
+    bindings[4] = hidden_prev;
+    bindings[5] = hidden_next;
+    bindings[6] = top_blob;
+
+    std::vector<vk_constant_type> constants(6);
+    constants[0].i = size;
+    constants[1].i = num_output;
+    constants[2].i = ti;
+    constants[3].i = outw;
+    constants[4].i = out_offset;
+    constants[5].i = dir;
+
+    VkMat dispatcher;
+    dispatcher.w = num_output / 4;
+    dispatcher.h = 1;
+    dispatcher.c = 1;
+
+    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+}
+
+int GRU_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+{
+    if (!support_vulkan)
+        return -1;
+
+    const VkMat& bottom_blob = bottom_blobs[0];
+
+    const int size = bottom_blob.w;
+    const int timesteps = bottom_blob.h;
+
+    const int num_directions = direction == 2 ? 2 : 1;
+
+    VkMat& top_blob = top_blobs[0];
+    top_blob.create(num_output * num_directions, timesteps, bottom_blob.elemsize, 1, opt.blob_vkallocator);
+    if (top_blob.empty())
+        return -100;
+
+    VkAllocator* hidden_vkallocator = top_blobs.size() == 2 ? opt.blob_vkallocator : opt.workspace_vkallocator;
+
+    VkMat hidden0;
+    VkMat hidden0_next;
+    hidden0.create(num_output, 1, bottom_blob.elemsize, 1, hidden_vkallocator);
+    hidden0_next.create(num_output, 1, bottom_blob.elemsize, 1, hidden_vkallocator);
+    if (hidden0.empty() || hidden0_next.empty())
+        return -100;
+
+    VkMat hidden1;
+    VkMat hidden1_next;
+    if (num_directions == 2)
+    {
+        hidden1.create(num_output, 1, bottom_blob.elemsize, 1, hidden_vkallocator);
+        hidden1_next.create(num_output, 1, bottom_blob.elemsize, 1, hidden_vkallocator);
+        if (hidden1.empty() || hidden1_next.empty())
+            return -100;
+    }
+
+    if (bottom_blobs.size() == 2)
+    {
+        const VkMat& hidden_in0 = bottom_blobs[1];
+
+        if (num_directions == 1)
+        {
+            record_gru_copy(pipeline_gru_copy, cmd, hidden_in0, hidden0, num_output, 0, 0, 1);
+        }
+        else
+        {
+            record_gru_copy(pipeline_gru_copy, cmd, hidden_in0, hidden0, num_output, 0, 0, 1);
+            record_gru_copy(pipeline_gru_copy, cmd, hidden_in0, hidden1, num_output, num_output, 0, 1);
+        }
+    }
+    else
+    {
+        record_gru_copy(pipeline_gru_copy, cmd, bottom_blob, hidden0, num_output, 0, 0, 0);
+        if (num_directions == 2)
+        {
+            record_gru_copy(pipeline_gru_copy, cmd, bottom_blob, hidden1, num_output, 0, 0, 0);
+        }
+    }
+
+    const int wxc_dir_stride = size * (num_output * 3);
+    const int whc_dir_stride = num_output * (num_output * 3);
+    const int bias_dir_stride = num_output * 4;
+    const int bottom_step = size;
+
+    const bool use_pack4 = (num_output % 4 == 0)
+                           && pipeline_gru_step_pack4
+                           && !weight_xc_data_gpu_pack4.empty()
+                           && !bias_c_data_gpu_pack4.empty()
+                           && !weight_hc_data_gpu_pack4.empty();
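+
+    // One compute dispatch per timestep: step t reads hprev and writes hnext,
+    // then the two VkMat handles are swapped so the next dispatch consumes the
+    // state just produced. The read-after-write ordering between consecutive
+    // steps is left to VkCompute's barrier handling for shared buffers.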
+    auto run_sequence = [&](int dir_index, int out_offset, int reverse, VkMat& hprev, VkMat& hnext) {
+        for (int t = 0; t < timesteps; t++)
+        {
+            const int ti = reverse ? (timesteps - 1 - t) : t;
+
+            if (use_pack4)
+            {
+                record_gru_step_pack4(pipeline_gru_step_pack4,
+                                      cmd,
+                                      bottom_blob,
+                                      weight_xc_data_gpu_pack4,
+                                      bias_c_data_gpu_pack4,
+                                      weight_hc_data_gpu_pack4,
+                                      hprev,
+                                      hnext,
+                                      top_blob,
+                                      size,
+                                      num_output,
+                                      ti,
+                                      top_blob.w,
+                                      out_offset,
+                                      dir_index);
+            }
+            else
+            {
+                record_gru_step_pack1(pipeline_gru_step,
+                                      cmd,
+                                      bottom_blob,
+                                      weight_xc_data_gpu,
+                                      bias_c_data_gpu,
+                                      weight_hc_data_gpu,
+                                      hprev,
+                                      hnext,
+                                      top_blob,
+                                      size,
+                                      num_output,
+                                      ti,
+                                      top_blob.w,
+                                      out_offset,
+                                      dir_index,
+                                      wxc_dir_stride,
+                                      whc_dir_stride,
+                                      bias_dir_stride,
+                                      bottom_step);
+            }
+
+            std::swap(hprev, hnext);
+        }
+    };
+
+    if (direction == 0 || direction == 1)
+    {
+        run_sequence(0, 0, direction, hidden0, hidden0_next);
+    }
+    else
+    {
+        run_sequence(0, 0, 0, hidden0, hidden0_next);
+        run_sequence(1, num_output, 1, hidden1, hidden1_next);
+    }
+
+    if (top_blobs.size() == 2)
+    {
+        if (num_directions == 1)
+        {
+            top_blobs[1] = hidden0;
+        }
+        else
+        {
+            VkMat& hidden_out = top_blobs[1];
+            hidden_out.create(num_output, 2, bottom_blob.elemsize, 1, opt.blob_vkallocator);
+            if (hidden_out.empty())
+                return -100;
+
+            record_gru_copy(pipeline_gru_copy, cmd, hidden0, hidden_out, num_output, 0, 0, 1);
+            record_gru_copy(pipeline_gru_copy, cmd, hidden1, hidden_out, num_output, 0, num_output, 1);
+        }
+    }
+
+    return 0;
+}
+
+int GRU_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+{
+    std::vector<VkMat> bottom_blobs(1);
+    std::vector<VkMat> top_blobs(1);
+    bottom_blobs[0] = bottom_blob;
+
+    int ret = forward(bottom_blobs, top_blobs, cmd, opt);
+    top_blob = top_blobs[0];
+    return ret;
+}
+
+} // namespace ncnn
diff --git a/src/layer/vulkan/gru_vulkan.h b/src/layer/vulkan/gru_vulkan.h
new file mode 100644
index 000000000000..30214957d853
--- /dev/null
+++ b/src/layer/vulkan/gru_vulkan.h
@@ -0,0 +1,43 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_GRU_VULKAN_H
+#define LAYER_GRU_VULKAN_H
+
+#include "gru.h"
+
+namespace ncnn {
+
+class GRU_vulkan : public GRU
+{
+public:
+    GRU_vulkan();
+
+    virtual int load_param(const ParamDict& pd);
+
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+    using GRU::forward;
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    VkMat weight_xc_data_gpu;
+    VkMat bias_c_data_gpu;
+    VkMat weight_hc_data_gpu;
+
+    VkMat weight_xc_data_gpu_pack4;
+    VkMat bias_c_data_gpu_pack4;
+    VkMat weight_hc_data_gpu_pack4;
+
+    Pipeline* pipeline_gru_step;
+    Pipeline* pipeline_gru_step_pack4;
+    Pipeline* pipeline_gru_copy;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_GRU_VULKAN_H
diff --git a/src/layer/vulkan/shader/gru_copy.comp b/src/layer/vulkan/shader/gru_copy.comp
new file mode 100644
index 000000000000..2bbd049aae3c
--- /dev/null
+++ b/src/layer/vulkan/shader/gru_copy.comp
@@ -0,0 +1,36 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout (binding = 0) readonly buffer src_blob { sfp src_data[]; };
+layout (binding = 1) writeonly buffer dst_blob { sfp dst_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int len;
+    int src_offset;
+    int dst_offset;
+    int mode;
+} p;
+
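+// mode 0: ignore src and zero-fill len elements at dst_offset (fresh hidden state)
+// mode 1: copy len elements from src_offset to dst_offset (initial/final hidden state)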
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+
+    if (gx >= p.len)
+        return;
+
+    int di = p.dst_offset + gx;
+
+    if (p.mode == 0)
+    {
+        buffer_st1(dst_data, di, afp(0.f));
+    }
+    else
+    {
+        int si = p.src_offset + gx;
+        afp v = buffer_ld1(src_data, si);
+        buffer_st1(dst_data, di, v);
+    }
+}
\ No newline at end of file
diff --git a/src/layer/vulkan/shader/gru_step.comp b/src/layer/vulkan/shader/gru_step.comp
new file mode 100644
index 000000000000..df72224b8e35
--- /dev/null
+++ b/src/layer/vulkan/shader/gru_step.comp
@@ -0,0 +1,100 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout (binding = 1) readonly buffer weight_xc_blob { sfp weight_xc_data[]; };
+layout (binding = 2) readonly buffer bias_c_blob { sfp bias_c_data[]; };
+layout (binding = 3) readonly buffer weight_hc_blob { sfp weight_hc_data[]; };
+layout (binding = 4) readonly buffer hidden_prev_blob { sfp hidden_prev_data[]; };
+layout (binding = 5) writeonly buffer hidden_next_blob { sfp hidden_next_data[]; };
+layout (binding = 6) writeonly buffer top_blob { sfp top_blob_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int size;
+    int num_output;
+    int ti;
+    int outw;
+    int out_offset;
+    int dir;
+    int wxc_dir_stride;
+    int whc_dir_stride;
+    int bias_dir_stride;
+    int bottom_step;
+} p;
+
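+// One invocation per output neuron q, computing one GRU step in ncnn's
+// gate order (R, U, N) with its four bias rows (R, U, WN, BN):
+//   R  = sigmoid(bias_R + Wxr.x + Whr.h)
+//   U  = sigmoid(bias_U + Wxu.x + Whu.h)
+//   N  = tanh(bias_WN + Wxn.x + R * (bias_BN + Whn.h))
+//   h' = (1 - U) * N + U * h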
+void main()
+{
+    int q = int(gl_GlobalInvocationID.x);
+
+    if (q >= p.num_output)
+        return;
+
+    int x_offset = p.ti * p.bottom_step;
+
+    int wxc_base = p.dir * p.wxc_dir_stride;
+    int whc_base = p.dir * p.whc_dir_stride;
+    int bias_base = p.dir * p.bias_dir_stride;
+
+    int bias_R = bias_base + 0 * p.num_output + q;
+    int bias_U = bias_base + 1 * p.num_output + q;
+    int bias_WN = bias_base + 2 * p.num_output + q;
+    int bias_BN = bias_base + 3 * p.num_output + q;
+
+    int wxc_R = wxc_base + (0 * p.num_output + q) * p.size;
+    int wxc_U = wxc_base + (1 * p.num_output + q) * p.size;
+    int wxc_N = wxc_base + (2 * p.num_output + q) * p.size;
+
+    int whc_R = whc_base + (0 * p.num_output + q) * p.num_output;
+    int whc_U = whc_base + (1 * p.num_output + q) * p.num_output;
+    int whc_N = whc_base + (2 * p.num_output + q) * p.num_output;
+
+    afp R = buffer_ld1(bias_c_data, bias_R);
+    afp U = buffer_ld1(bias_c_data, bias_U);
+
+    for (int i = 0; i < p.size; i++)
+    {
+        afp xi = buffer_ld1(bottom_blob_data, x_offset + i);
+
+        R += buffer_ld1(weight_xc_data, wxc_R + i) * xi;
+        U += buffer_ld1(weight_xc_data, wxc_U + i) * xi;
+    }
+
+    for (int i = 0; i < p.num_output; i++)
+    {
+        afp hi = buffer_ld1(hidden_prev_data, i);
+
+        R += buffer_ld1(weight_hc_data, whc_R + i) * hi;
+        U += buffer_ld1(weight_hc_data, whc_U + i) * hi;
+    }
+
+    R = afp(1.f) / (afp(1.f) + exp(-R));
+    U = afp(1.f) / (afp(1.f) + exp(-U));
+
+    afp N = buffer_ld1(bias_c_data, bias_BN);
+    for (int i = 0; i < p.num_output; i++)
+    {
+        afp hi = buffer_ld1(hidden_prev_data, i);
+        N += buffer_ld1(weight_hc_data, whc_N + i) * hi;
+    }
+
+    N = buffer_ld1(bias_c_data, bias_WN) + R * N;
+
+    for (int i = 0; i < p.size; i++)
+    {
+        afp xi = buffer_ld1(bottom_blob_data, x_offset + i);
+        N += buffer_ld1(weight_xc_data, wxc_N + i) * xi;
+    }
+
+    N = tanh(N);
+
+    afp hprev_q = buffer_ld1(hidden_prev_data, q);
+    afp H = (afp(1.f) - U) * N + U * hprev_q;
+
+    buffer_st1(hidden_next_data, q, H);
+
+    int out_index = p.ti * p.outw + p.out_offset + q;
+    buffer_st1(top_blob_data, out_index, H);
+}
\ No newline at end of file
diff --git a/src/layer/vulkan/shader/gru_step_pack4.comp b/src/layer/vulkan/shader/gru_step_pack4.comp
new file mode 100644
index 000000000000..c59ea8d97037
--- /dev/null
+++ b/src/layer/vulkan/shader/gru_step_pack4.comp
@@ -0,0 +1,107 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout (binding = 1) readonly buffer weight_xc_blob { sfpvec4 weight_xc_data[]; };
+layout (binding = 2) readonly buffer bias_c_blob { sfpvec4 bias_c_data[]; };
+layout (binding = 3) readonly buffer weight_hc_blob { sfpvec4 weight_hc_data[]; };
+layout (binding = 4) readonly buffer hidden_prev_blob { sfp hidden_prev_data[]; };
+layout (binding = 5) writeonly buffer hidden_next_blob { sfp hidden_next_data[]; };
+layout (binding = 6) writeonly buffer top_blob { sfp top_blob_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int size;
+    int num_output;
+    int ti;
+    int outw;
+    int out_offset;
+    int dir;
+} p;
+
+void main()
+{
+    int q_pack = int(gl_GlobalInvocationID.x);
+
+    int num_output_pack = p.num_output / 4;
+    if (q_pack >= num_output_pack)
+        return;
+
+    int x_offset = p.ti * p.size;
+
+    int wxc_dir_base = p.dir * (3 * num_output_pack);
+    int whc_dir_base = p.dir * (3 * num_output_pack);
+    int bias_dir_base = p.dir * (4 * num_output_pack);
+
+    int wxc_row_r = wxc_dir_base + 0 * num_output_pack + q_pack;
+    int wxc_row_u = wxc_dir_base + 1 * num_output_pack + q_pack;
+    int wxc_row_n = wxc_dir_base + 2 * num_output_pack + q_pack;
+
+    int whc_row_r = whc_dir_base + 0 * num_output_pack + q_pack;
+    int whc_row_u = whc_dir_base + 1 * num_output_pack + q_pack;
+    int whc_row_n = whc_dir_base + 2 * num_output_pack + q_pack;
+
+    afpvec4 r = buffer_ld4(bias_c_data, bias_dir_base + 0 * num_output_pack + q_pack);
+    afpvec4 u = buffer_ld4(bias_c_data, bias_dir_base + 1 * num_output_pack + q_pack);
+
+    for (int i = 0; i < p.size; i++)
+    {
+        afp xi = buffer_ld1(bottom_blob_data, x_offset + i);
+
+        r += buffer_ld4(weight_xc_data, wxc_row_r * p.size + i) * xi;
+        u += buffer_ld4(weight_xc_data, wxc_row_u * p.size + i) * xi;
+    }
+
+    for (int i = 0; i < p.num_output; i++)
+    {
+        afp hi = buffer_ld1(hidden_prev_data, i);
+
+        r += buffer_ld4(weight_hc_data, whc_row_r * p.num_output + i) * hi;
+        u += buffer_ld4(weight_hc_data, whc_row_u * p.num_output + i) * hi;
+    }
+
+    r = afpvec4(1.f) / (afpvec4(1.f) + exp(-r));
+    u = afpvec4(1.f) / (afpvec4(1.f) + exp(-u));
+
+    afpvec4 n = buffer_ld4(bias_c_data, bias_dir_base + 3 * num_output_pack + q_pack);
+
+    for (int i = 0; i < p.num_output; i++)
+    {
+        afp hi = buffer_ld1(hidden_prev_data, i);
+        n += buffer_ld4(weight_hc_data, whc_row_n * p.num_output + i) * hi;
+    }
+
+    afpvec4 wn = buffer_ld4(bias_c_data, bias_dir_base + 2 * num_output_pack + q_pack);
+    n = wn + r * n;
+
+    for (int i = 0; i < p.size; i++)
+    {
+        afp xi = buffer_ld1(bottom_blob_data, x_offset + i);
+        n += buffer_ld4(weight_xc_data, wxc_row_n * p.size + i) * xi;
+    }
+
+    n = tanh(n);
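+
+    // The hidden and top blobs stay in scalar (pack1) layout, so the four lanes
+    // of the vec4 result are loaded and stored element-wise below.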
+
+    int q0 = q_pack * 4;
+
+    afpvec4 hprev;
+    hprev.r = buffer_ld1(hidden_prev_data, q0 + 0);
+    hprev.g = buffer_ld1(hidden_prev_data, q0 + 1);
+    hprev.b = buffer_ld1(hidden_prev_data, q0 + 2);
+    hprev.a = buffer_ld1(hidden_prev_data, q0 + 3);
+
+    afpvec4 h = (afpvec4(1.f) - u) * n + u * hprev;
+
+    buffer_st1(hidden_next_data, q0 + 0, h.r);
+    buffer_st1(hidden_next_data, q0 + 1, h.g);
+    buffer_st1(hidden_next_data, q0 + 2, h.b);
+    buffer_st1(hidden_next_data, q0 + 3, h.a);
+
+    int out_index = p.ti * p.outw + p.out_offset + q0;
+    buffer_st1(top_blob_data, out_index + 0, h.r);
+    buffer_st1(top_blob_data, out_index + 1, h.g);
+    buffer_st1(top_blob_data, out_index + 2, h.b);
+    buffer_st1(top_blob_data, out_index + 3, h.a);
+}