@@ -272,38 +272,40 @@ void jit_emitter::store_context(const std::vector<size_t>& gpr_regs,
272272 }
273273 }
274274
275- // 2. SIMD and Floating-Point registers
276- // 2.1. store pair registers
277- int prev_reg_idx = -1 ;
278- size_t ignore_registers_count = 0 ;
279- for (const auto reg_idx : vec_regs) {
280- if (ignore_vec_regs.find (reg_idx) != ignore_vec_regs.end ()) {
281- ignore_registers_count++;
282- continue ;
275+ // 2. SIMD and Floating-Point registers - optimized to allocate stack space once
276+ const auto store_vec_regs_size = vec_regs.size () - ignore_vec_regs.size ();
277+ if (store_vec_regs_size > 0 ) {
278+ // Calculate total stack space needed for all vector registers (align once)
279+ const auto total_vec_shift = ov::intel_cpu::rnd_up (get_vec_length () * store_vec_regs_size, sp_alignment);
280+
281+ // Single stack allocation for all vector registers
282+ h->sub (h->sp , h->sp , total_vec_shift);
283+
284+ // Store vector registers using stack offset (preserving original order)
285+ const auto last = store_vec_regs_size % 2 ;
286+ int32_t current_offset = 0 ;
287+
288+ // Collect non-ignored registers
289+ std::vector<size_t > active_regs;
290+ for (const auto reg_idx : vec_regs) {
291+ if (ignore_vec_regs.find (reg_idx) == ignore_vec_regs.end ()) {
292+ active_regs.push_back (reg_idx);
293+ }
283294 }
284- if (prev_reg_idx == -1 ) {
285- prev_reg_idx = static_cast <int >(reg_idx);
286- continue ;
295+
296+ // Store pairs
297+ for (size_t i = 0 ; i < (active_regs.size () - last); i += 2 ) {
298+ h->stp (Xbyak_aarch64::QReg (active_regs[i]),
299+ Xbyak_aarch64::QReg (active_regs[i + 1 ]),
300+ Xbyak_aarch64::ptr (h->sp , current_offset));
301+ current_offset += static_cast <int32_t >(get_vec_length () * 2 );
287302 }
288- const auto shift = ov::intel_cpu::rnd_up (get_vec_length () * 2 , sp_alignment);
289- h->stp (Xbyak_aarch64::QReg (prev_reg_idx),
290- Xbyak_aarch64::QReg (reg_idx),
291- pre_ptr (h->sp , -static_cast <int32_t >(shift)));
292- prev_reg_idx = -1 ;
293- }
294303
295- // 2.1. store the remaining register
296- if (prev_reg_idx != -1 ) {
297- if (ignore_vec_regs.find (prev_reg_idx) == ignore_vec_regs.end ()) {
298- const auto shift = ov::intel_cpu::rnd_up (get_vec_length (), sp_alignment);
299- h->str (Xbyak_aarch64::QReg (prev_reg_idx), pre_ptr (h->sp , -static_cast <int32_t >(shift)));
300- } else {
301- ignore_registers_count++;
304+ // Store the remaining register
305+ if (last != 0 ) {
306+ h->str (Xbyak_aarch64::QReg (active_regs[active_regs.size () - 1 ]), Xbyak_aarch64::ptr (h->sp , current_offset));
302307 }
303308 }
304-
305- OPENVINO_ASSERT (ignore_registers_count == ignore_vec_regs.size (),
306- " ignored registers size is not equal actual ignored registers count" );
307309}
308310
309311void jit_emitter::restore_context (const std::unordered_set<size_t >& ignore_vec_regs) const {
@@ -313,42 +315,44 @@ void jit_emitter::restore_context(const std::unordered_set<size_t>& ignore_vec_r
313315void jit_emitter::restore_context (const std::vector<size_t >& gpr_regs,
314316 const std::vector<size_t >& vec_regs,
315317 const std::unordered_set<size_t >& ignore_vec_regs) const {
316- // 1. SIMD and Floating-Point registers
317- // 1.1. restore the remaining register
318- auto v_last = (vec_regs.size () - ignore_vec_regs.size ()) % 2 ;
319- if (v_last != 0 ) {
320- for (size_t i = 0 ; i < vec_regs.size (); i++) {
321- const auto reg_idx = vec_regs.size () - 1 - i;
322- if (ignore_vec_regs.find (reg_idx) != ignore_vec_regs.end ()) {
323- v_last++;
324- continue ;
318+ // 1. SIMD and Floating-Point registers - optimized to deallocate stack space once
319+ const auto save_vec_regs_size = vec_regs.size () - ignore_vec_regs.size ();
320+ if (save_vec_regs_size > 0 ) {
321+ // Restore vector registers using stack offset (reverse order to match original behavior)
322+ const auto last = save_vec_regs_size % 2 ;
323+ if (last != 0 ) {
324+ int32_t current_offset = get_vec_length () * save_vec_regs_size - get_vec_length ();
325+ // Find the last non-ignored register
326+ for (size_t i = 0 ; i < vec_regs.size (); i++) {
327+ const auto reg_idx = vec_regs.size () - 1 - i;
328+ if (ignore_vec_regs.find (reg_idx) != ignore_vec_regs.end ()) {
329+ continue ;
330+ }
331+ h->ldr (Xbyak_aarch64::QReg (reg_idx), Xbyak_aarch64::ptr (h->sp , current_offset));
332+ break ;
325333 }
326-
327- const auto shift = ov::intel_cpu::rnd_up (get_vec_length (), sp_alignment);
328- h->ldr (Xbyak_aarch64::QReg (reg_idx), post_ptr (h->sp , shift));
329- break ;
330334 }
331- }
332- // 1.2. restore pair registers
333- size_t ignore_registers_count = 0 ;
334- int prev_reg_idx = -1 ;
335- for (size_t i = v_last; i < vec_regs.size (); i++) {
336- const auto reg_idx = vec_regs.size () - 1 - i;
337- if (ignore_vec_regs.find (reg_idx) != ignore_vec_regs.end ()) {
338- ignore_registers_count++;
339- continue ;
335+
336+ // Collect non-ignored registers
337+ std::vector<size_t > active_regs;
338+ for (const auto reg_idx : vec_regs) {
339+ if (ignore_vec_regs.find (reg_idx) == ignore_vec_regs.end ()) {
340+ active_regs.push_back (reg_idx);
341+ }
340342 }
341- if (prev_reg_idx == -1 ) {
342- prev_reg_idx = static_cast <int >(reg_idx);
343- continue ;
343+
344+ // Restore pairs in reverse order
345+ for (size_t i = last; i < active_regs.size (); i += 2 ) {
346+ int32_t current_offset = get_vec_length () * (active_regs.size () - (i + 2 ));
347+ h->ldp (Xbyak_aarch64::QReg (active_regs[active_regs.size () - 1 - (i + 1 )]),
348+ Xbyak_aarch64::QReg (active_regs[active_regs.size () - 1 - i]),
349+ Xbyak_aarch64::ptr (h->sp , current_offset));
344350 }
345- const auto shift = ov::intel_cpu::rnd_up (get_vec_length () * 2 , sp_alignment);
346- h->ldp (Xbyak_aarch64::QReg (reg_idx), Xbyak_aarch64::QReg (prev_reg_idx), post_ptr (h->sp , shift));
347- prev_reg_idx = -1 ;
348- }
349351
350- OPENVINO_ASSERT (ignore_registers_count == ignore_vec_regs.size (),
351- " ignored registers size is not equal actual ignored registers count" );
352+ const auto total_vec_shift = ov::intel_cpu::rnd_up (get_vec_length () * save_vec_regs_size, sp_alignment);
353+ // Single stack deallocation for all vector registers
354+ h->add (h->sp , h->sp , total_vec_shift);
355+ }
352356
353357 // 2. General-purpose Registers - optimized to deallocate stack space once
354358 const auto save_gpr_regs_size = gpr_regs.size ();
0 commit comments