From dd3da1e3b9032f3ec6c59f47e03af94edd08372f Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 21:05:39 +0000 Subject: [PATCH 01/19] Vectorized MOE fusion init --- .../matmul_experts_fusion.hpp | 22 ++++ .../matmul_experts_fusion.cpp | 114 ++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp create mode 100644 src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp diff --git a/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp new file mode 100644 index 00000000000000..27eac10769899b --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API FuseVectorizedMOE; + +} // namespace pass +} // namespace ov + +class ov::pass::FuseVectorizedMOE : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("FuseVectorizedMOE"); + FuseVectorizedMOE(); +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp new file mode 100644 index 00000000000000..23e0277cea7a5e --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -0,0 +1,114 @@ +#include "transformations/common_optimizations/matmul_experts_fusion.hpp" + +#include "itt.hpp" +#include "openvino/core/graph_util.hpp" +#include "openvino/op/add.hpp" +#include 
"openvino/op/clamp.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/minimum.hpp" +#include "openvino/op/moe.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reduce_sum.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/scatter_elements_update.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/swish.hpp" +#include "openvino/op/tile.hpp" +#include "openvino/op/topk.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/manager.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +using namespace ov::pass; +ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { + MATCHER_SCOPE(FuseVectorizedMOE); + + auto experts_input = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); + auto tile = pattern::wrap_type({experts_input, pattern::any_input()}); + auto after_tile_reshape = pattern::wrap_type({tile, pattern::any_input()}); + auto gate_up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}); + auto gate_up_add = pattern::wrap_type({gate_up_matmul, pattern::any_input()}); + + // Branch 1: Slice_1 -> Clamp -> Add_1 + auto slice1 = pattern::wrap_type( + {gate_up_add, pattern::any_input(), pattern::any_input(), pattern::any_input(), pattern::any_input()}); + auto clamp = pattern::wrap_type({slice1}); + auto add1 = pattern::wrap_type({clamp, pattern::wrap_const()}); + + // Branch 2: Slice_2 -> Minimum_1 -> Swish + auto slice2 = pattern::wrap_type( + {gate_up_add, pattern::any_input(), pattern::any_input(), pattern::any_input(), pattern::any_input()}); + auto minimum1 = pattern::wrap_type({slice2, pattern::wrap_const()}); + auto swish_beta = pattern::wrap_const(); + auto swish = pattern::wrap_type({minimum1, swish_beta}); + + // Join: Multiply_2 + auto multiply2 = pattern::wrap_type({add1, swish}); + + // Down projection + auto down_proj_matmul = 
pattern::wrap_type({multiply2, pattern::any_input()}); + auto down_proj_add = pattern::wrap_type({down_proj_matmul, pattern::wrap_const()}); + auto end_reshape = pattern::wrap_type({down_proj_add, pattern::any_input()}); + + // Routing weights/mask + auto router_topk_indices = pattern::any_input(); + auto scatter_elements_update = pattern::wrap_type( + {pattern::any_input(), router_topk_indices, pattern::any_input(), pattern::any_input()}); + + auto router_transpose = pattern::wrap_type({scatter_elements_update, pattern::any_input()}); + auto router_reshape = pattern::wrap_type({router_transpose, pattern::any_input()}); + auto unsqueeze_routing_weights = pattern::wrap_type({router_reshape, pattern::any_input()}); + + auto mul3 = pattern::wrap_type({end_reshape, unsqueeze_routing_weights}); + auto reduce_sum = pattern::wrap_type({mul3, pattern::any_input()}); + auto moe_pattern = reduce_sum; + + matcher_pass_callback callback = [=](pattern::Matcher& m) { + auto& pm = m.get_pattern_value_map(); + + auto experts_input_node = pm.at(tile).get_node()->input_value(0); + auto routing_weights_node = pm.at(unsqueeze_routing_weights).get_node_shared_ptr(); + auto gate_up_weight = pm.at(gate_up_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto gate_up_bias_node = pm.at(gate_up_add).get_node()->input_value(1).get_node_shared_ptr(); + auto down_proj_weight = pm.at(down_proj_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto down_proj_bias_node = pm.at(down_proj_add).get_node()->input_value(1).get_node_shared_ptr(); + auto topk_indices_node = pm.at(router_topk_indices).get_node_shared_ptr(); + + ov::OutputVector moe_inputs = {experts_input_node, + topk_indices_node, + routing_weights_node, + gate_up_weight, + gate_up_bias_node, + down_proj_weight, + down_proj_bias_node}; + + ov::op::v16::MOE::Config config; + + // Extract expert_alpha from Swish beta attribute + auto swish_beta_const = ov::as_type_ptr(pm.at(swish_beta).get_node_shared_ptr()); + auto 
swish_beta_const_val = swish_beta_const->cast_vector()[0]; + config.expert_alpha = swish_beta_const_val; + + // Extract expert_beta from Clamp max attribute + if (auto clamp_op = ov::as_type_ptr(pm.at(clamp).get_node_shared_ptr())) { + config.expert_beta = clamp_op->get_max(); + } + + // Set expert_type + config.expert_type = ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; + + auto moe = std::make_shared(moe_inputs, config); + moe->set_friendly_name(m.get_match_root()->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), moe); + ov::replace_node(m.get_match_root(), moe); + + register_new_node(moe); + return true; + }; + + auto matcher = std::make_shared(moe_pattern, matcher_name); + this->register_matcher(matcher, callback); +} From 7b9572ebeaece8081b7b17efca3ba8a16e13262b Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 21:09:26 +0000 Subject: [PATCH 02/19] MOE op init --- src/core/include/openvino/op/moe.hpp | 88 +++++++++++++++++ src/core/include/openvino/op/ops.hpp | 1 + .../include/openvino/opsets/opset16_tbl.hpp | 1 + src/core/src/op/moe.cpp | 96 +++++++++++++++++++ 4 files changed, 186 insertions(+) create mode 100644 src/core/include/openvino/op/moe.hpp create mode 100644 src/core/src/op/moe.cpp diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/include/openvino/op/moe.hpp new file mode 100644 index 00000000000000..4320cfd368ab95 --- /dev/null +++ b/src/core/include/openvino/op/moe.hpp @@ -0,0 +1,88 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "openvino/core/node.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/op.hpp" + +namespace ov::op::v16 { +/// +/// \brief MOE experts +/// \ingroup ov_ops_cpp_api +class OPENVINO_API MOE : public ov::op::Op { +public: + OPENVINO_OP("MOE", "opset16"); + + MOE() = default; + + enum class Expert_type { + GEMM3_SWIGLU, + 
GEMM2_BIAS_SWIGLU_CLAMP + }; + + struct Config { + size_t topk{}; + size_t expert_num{}; + size_t hidden_size{}; + size_t intermediate_size{}; + size_t group_size{}; // quantized group size, 0 for no group size. same for gate/up/down + ov::element::Type weight_type{}; // same for gate/up/down + ov::element::Type scale_type{}; // same for gate/up/down + ov::element::Type zp_type{}; // same for gate/up/down + + Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; + float expert_alpha{1.0f}; // Expert attribute, e.g. sigmoid alpha (gpt-oss: 1.702) + float expert_beta{0.0f}; // Expert attribute, e.g. clamp limit (gpt-oss: 7.0) + + bool operator==(const Config& rhs) const { + return std::tie(topk, + expert_num, + hidden_size, + intermediate_size, + group_size, + weight_type, + scale_type, + zp_type) == std::tie(rhs.topk, + rhs.expert_num, + rhs.hidden_size, + rhs.intermediate_size, + rhs.group_size, + rhs.weight_type, + rhs.scale_type, + rhs.zp_type); + } + }; + + /// \brief Constructs a MOE operation with config only + /// \param args The input tensors: [hidden_states, router_logits] followed by expert weights/scales/zps + /// \param config Configuration for the MOE operation + MOE(const OutputVector& args, const Config& config); + + const Config& get_config() const; + void set_config(const Config& config); + + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + /// \brief Get expert weight/scale/zp constant for a specific expert and weight type + /// \param expert_idx Index of the expert (0 to expert_num-1) + /// \param weight_type 0=gate, 1=up, 2=down + /// \param const_type 0=weight, 1=scale, 2=zp + /// \return Constant node or nullptr if not present + std::shared_ptr get_expert_const(size_t expert_idx, + size_t weight_type, + size_t const_type) const; + +private: + Config m_config; +}; + +} // namespace ov::op::v16 
diff --git a/src/core/include/openvino/op/ops.hpp b/src/core/include/openvino/op/ops.hpp index dcb5fc0385ecde..a4150c218b1c73 100644 --- a/src/core/include/openvino/op/ops.hpp +++ b/src/core/include/openvino/op/ops.hpp @@ -119,6 +119,7 @@ #include "openvino/op/minimum.hpp" #include "openvino/op/mish.hpp" #include "openvino/op/mod.hpp" +#include "openvino/op/moe.hpp" #include "openvino/op/multiclass_nms.hpp" #include "openvino/op/multinomial.hpp" #include "openvino/op/multiply.hpp" diff --git a/src/core/include/openvino/opsets/opset16_tbl.hpp b/src/core/include/openvino/opsets/opset16_tbl.hpp index 39d3d5d1d80889..a8312c9a09dc58 100644 --- a/src/core/include/openvino/opsets/opset16_tbl.hpp +++ b/src/core/include/openvino/opsets/opset16_tbl.hpp @@ -16,6 +16,7 @@ _OPENVINO_OP_REG(ShapeOf, ov::op::v3) // New operations added in opset16 _OPENVINO_OP_REG(Identity, ov::op::v16) _OPENVINO_OP_REG(ISTFT, ov::op::v16) +_OPENVINO_OP_REG(MOE, ov::op::v16) _OPENVINO_OP_REG(SegmentMax, ov::op::v16) _OPENVINO_OP_REG(SparseFillEmptyRows, ov::op::v16) _OPENVINO_OP_REG(AvgPool, ov::op::v16) diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp new file mode 100644 index 00000000000000..80ce3a4342455c --- /dev/null +++ b/src/core/src/op/moe.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/moe.hpp" + +#include "itt.hpp" + +namespace ov { +namespace op { +namespace v16 { + +MOE::MOE(const OutputVector& args, const Config& config) : Op(args), m_config(config) { + constructor_validate_and_infer_types(); +} + +const MOE::Config& MOE::get_config() const { + return m_config; +} + +void MOE::set_config(const Config& config) { + m_config = config; +} + +std::shared_ptr MOE::clone_with_new_inputs(const ov::OutputVector& new_args) const { + OV_OP_SCOPE(v16_MOE_clone_with_new_inputs); + check_new_args_count(this, new_args); + + return std::make_shared(new_args, m_config); +} + +void 
MOE::validate_and_infer_types() { + OV_OP_SCOPE(v16_MOE_validate_and_infer_types); + // At minimum we need 2 inputs: hidden_states and router_logits + OPENVINO_ASSERT(get_input_size() >= 2, "MOE must have at least 2 inputs whereas it has ", get_input_size()); + + // For now, just do basic validation. The input layout validation can be more flexible + // to allow incremental building during pattern matching + // Expected inputs: + // 0: hidden_states + // 1: router_logits + // 2+: expert constants (flexible layout during construction) + + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { + OV_OP_SCOPE(v16_MOE_visit_attributes); + visitor.start_structure("config"); + + visitor.on_attribute("topk", m_config.topk); + visitor.on_attribute("expert_num", m_config.expert_num); + visitor.on_attribute("hidden_size", m_config.hidden_size); + visitor.on_attribute("intermediate_size", m_config.intermediate_size); + visitor.on_attribute("group_size", m_config.group_size); + visitor.on_attribute("weight_type", m_config.weight_type); + visitor.on_attribute("scale_type", m_config.scale_type); + visitor.on_attribute("zp_type", m_config.zp_type); + visitor.finish_structure(); + + return true; +} + +std::shared_ptr MOE::get_expert_const(size_t expert_idx, size_t weight_type, size_t const_type) const { + OPENVINO_ASSERT(expert_idx < m_config.expert_num, "Expert index out of range"); + OPENVINO_ASSERT(weight_type < 3, "Weight type must be 0 (gate), 1 (up), or 2 (down)"); + OPENVINO_ASSERT(const_type < 3, "Const type must be 0 (weight), 1 (scale), or 2 (zp)"); + + // Calculate input index based on expert and weight/const type + // Input layout: [hidden_states, router_logits, expert0_gate_weight, expert0_gate_scale?, expert0_gate_zp?, + // expert0_up_weight, expert0_up_scale?, expert0_up_zp?, expert0_down_weight, expert0_down_scale?, expert0_down_zp?, ...] 
+ + size_t base_idx = 2; // Start after hidden_states and router_logits + + // For now, assume simple layout: weight, scale?, zp? for each of gate, up, down + size_t constants_per_weight_type = 1; // Just weights for now, will need to extend for scales/zps + if (m_config.scale_type != ov::element::dynamic) constants_per_weight_type++; + if (m_config.zp_type != ov::element::dynamic) constants_per_weight_type++; + + size_t constants_per_expert = 3 * constants_per_weight_type; // 3 weight types * constants per type + + size_t expert_base = base_idx + expert_idx * constants_per_expert; + size_t weight_base = expert_base + weight_type * constants_per_weight_type; + size_t input_idx = weight_base + const_type; + + if (input_idx >= get_input_size()) { + return nullptr; // Constant not provided (e.g., scale or zp for non-quantized weights) + } + + auto input_node = get_input_node_shared_ptr(input_idx); + return ov::as_type_ptr(input_node); +} + +} // namespace v16 +} // namespace op +} // namespace ov From 4a50118c69417756917ba9821f05510c666310e4 Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 21:47:02 +0000 Subject: [PATCH 03/19] MOE attrs/inputs adjust --- src/core/include/openvino/op/moe.hpp | 42 ++---------------------- src/core/src/op/moe.cpp | 49 ++++------------------------ 2 files changed, 10 insertions(+), 81 deletions(-) diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/include/openvino/op/moe.hpp index 4320cfd368ab95..4f3493bc546aa0 100644 --- a/src/core/include/openvino/op/moe.hpp +++ b/src/core/include/openvino/op/moe.hpp @@ -28,40 +28,13 @@ class OPENVINO_API MOE : public ov::op::Op { }; struct Config { - size_t topk{}; - size_t expert_num{}; - size_t hidden_size{}; - size_t intermediate_size{}; - size_t group_size{}; // quantized group size, 0 for no group size. 
same for gate/up/down - ov::element::Type weight_type{}; // same for gate/up/down - ov::element::Type scale_type{}; // same for gate/up/down - ov::element::Type zp_type{}; // same for gate/up/down - Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; - float expert_alpha{1.0f}; // Expert attribute, e.g. sigmoid alpha (gpt-oss: 1.702) - float expert_beta{0.0f}; // Expert attribute, e.g. clamp limit (gpt-oss: 7.0) - - bool operator==(const Config& rhs) const { - return std::tie(topk, - expert_num, - hidden_size, - intermediate_size, - group_size, - weight_type, - scale_type, - zp_type) == std::tie(rhs.topk, - rhs.expert_num, - rhs.hidden_size, - rhs.intermediate_size, - rhs.group_size, - rhs.weight_type, - rhs.scale_type, - rhs.zp_type); - } + float expert_alpha{1.0f}; // Expert attribute, e.g. sigmoid alpha + float expert_beta{0.0f}; // Expert attribute, e.g. clamp limit }; /// \brief Constructs a MOE operation with config only - /// \param args The input tensors: [hidden_states, router_logits] followed by expert weights/scales/zps + /// \param args The input tensors /// \param config Configuration for the MOE operation MOE(const OutputVector& args, const Config& config); @@ -72,15 +45,6 @@ class OPENVINO_API MOE : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - /// \brief Get expert weight/scale/zp constant for a specific expert and weight type - /// \param expert_idx Index of the expert (0 to expert_num-1) - /// \param weight_type 0=gate, 1=up, 2=down - /// \param const_type 0=weight, 1=scale, 2=zp - /// \return Constant node or nullptr if not present - std::shared_ptr get_expert_const(size_t expert_idx, - size_t weight_type, - size_t const_type) const; - private: Config m_config; }; diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp index 80ce3a4342455c..b31a585478ad08 100644 --- a/src/core/src/op/moe.cpp +++ b/src/core/src/op/moe.cpp @@ 
-46,49 +46,14 @@ void MOE::validate_and_infer_types() { bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { OV_OP_SCOPE(v16_MOE_visit_attributes); - visitor.start_structure("config"); - - visitor.on_attribute("topk", m_config.topk); - visitor.on_attribute("expert_num", m_config.expert_num); - visitor.on_attribute("hidden_size", m_config.hidden_size); - visitor.on_attribute("intermediate_size", m_config.intermediate_size); - visitor.on_attribute("group_size", m_config.group_size); - visitor.on_attribute("weight_type", m_config.weight_type); - visitor.on_attribute("scale_type", m_config.scale_type); - visitor.on_attribute("zp_type", m_config.zp_type); - visitor.finish_structure(); - - return true; -} -std::shared_ptr MOE::get_expert_const(size_t expert_idx, size_t weight_type, size_t const_type) const { - OPENVINO_ASSERT(expert_idx < m_config.expert_num, "Expert index out of range"); - OPENVINO_ASSERT(weight_type < 3, "Weight type must be 0 (gate), 1 (up), or 2 (down)"); - OPENVINO_ASSERT(const_type < 3, "Const type must be 0 (weight), 1 (scale), or 2 (zp)"); - - // Calculate input index based on expert and weight/const type - // Input layout: [hidden_states, router_logits, expert0_gate_weight, expert0_gate_scale?, expert0_gate_zp?, - // expert0_up_weight, expert0_up_scale?, expert0_up_zp?, expert0_down_weight, expert0_down_scale?, expert0_down_zp?, ...] - - size_t base_idx = 2; // Start after hidden_states and router_logits - - // For now, assume simple layout: weight, scale?, zp? 
for each of gate, up, down - size_t constants_per_weight_type = 1; // Just weights for now, will need to extend for scales/zps - if (m_config.scale_type != ov::element::dynamic) constants_per_weight_type++; - if (m_config.zp_type != ov::element::dynamic) constants_per_weight_type++; - - size_t constants_per_expert = 3 * constants_per_weight_type; // 3 weight types * constants per type - - size_t expert_base = base_idx + expert_idx * constants_per_expert; - size_t weight_base = expert_base + weight_type * constants_per_weight_type; - size_t input_idx = weight_base + const_type; - - if (input_idx >= get_input_size()) { - return nullptr; // Constant not provided (e.g., scale or zp for non-quantized weights) - } - - auto input_node = get_input_node_shared_ptr(input_idx); - return ov::as_type_ptr(input_node); + // visitor.on_attribute("expert_type", m_config.expert_type); + // TODO: Add adapter + + visitor.on_attribute("expert_alpha", m_config.expert_alpha); + visitor.on_attribute("expert_beta", m_config.expert_beta); + + return true; } } // namespace v16 From 7e3230ef47d4e0a18f1753982e22a783353ed948 Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 22:12:49 +0000 Subject: [PATCH 04/19] Adjust inputs desc --- .../common_optimizations/matmul_experts_fusion.cpp | 2 +- src/core/include/openvino/op/moe.hpp | 11 ++++++++++- src/core/src/op/moe.cpp | 10 +--------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 23e0277cea7a5e..6ec241fd025712 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -78,8 +78,8 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto topk_indices_node = 
pm.at(router_topk_indices).get_node_shared_ptr(); ov::OutputVector moe_inputs = {experts_input_node, - topk_indices_node, routing_weights_node, + topk_indices_node, gate_up_weight, gate_up_bias_node, down_proj_weight, diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/include/openvino/op/moe.hpp index 4f3493bc546aa0..fa2a51432b068d 100644 --- a/src/core/include/openvino/op/moe.hpp +++ b/src/core/include/openvino/op/moe.hpp @@ -34,7 +34,16 @@ class OPENVINO_API MOE : public ov::op::Op { }; /// \brief Constructs a MOE operation with config only - /// \param args The input tensors + /// \param args The input tensors, in the following order: + /// 0: hidden_states - input tensor with hidden representations + /// 1: router_topk_output_weights - normalized weights for selected experts (input to final multiplication) + /// 2: router_topk_output_indices - indices of selected top-k experts + /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or [num_experts, hidden_size, 2 * inter_size] if fused + /// 4: w0_bias (optional) - expert bias for first projection, shape [num_experts, ...] or empty tensor if not needed + /// 5: w1_weight - expert weights for second projection, shape [num_experts, inter_size, hidden_size] + /// 6: w1_bias (optional) - expert bias for second projection, shape [num_experts, ...] 
or empty tensor if not needed + /// 7: w2_weight - expert weights for final projection, shape [num_experts, hidden_size, inter_size] + /// 8: w2_bias (optional/redundant) - expert bias for final projection, usually not required /// \param config Configuration for the MOE operation MOE(const OutputVector& args, const Config& config); diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp index b31a585478ad08..57c2bd67d94317 100644 --- a/src/core/src/op/moe.cpp +++ b/src/core/src/op/moe.cpp @@ -31,15 +31,7 @@ std::shared_ptr MOE::clone_with_new_inputs(const ov::OutputVector& new void MOE::validate_and_infer_types() { OV_OP_SCOPE(v16_MOE_validate_and_infer_types); - // At minimum we need 2 inputs: hidden_states and router_logits - OPENVINO_ASSERT(get_input_size() >= 2, "MOE must have at least 2 inputs whereas it has ", get_input_size()); - - // For now, just do basic validation. The input layout validation can be more flexible - // to allow incremental building during pattern matching - // Expected inputs: - // 0: hidden_states - // 1: router_logits - // 2+: expert constants (flexible layout during construction) + // TODO: Add inputs validation set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); } From 104246af146b33f6aa884e916911a7c56f75d477 Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 22:29:28 +0000 Subject: [PATCH 05/19] Add adapters for expert_type enum --- src/core/include/openvino/op/moe.hpp | 34 ++++++++++++++++++++-------- src/core/src/op/moe.cpp | 19 +++++++++++++--- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/include/openvino/op/moe.hpp index fa2a51432b068d..e61b2c686f758b 100644 --- a/src/core/include/openvino/op/moe.hpp +++ b/src/core/include/openvino/op/moe.hpp @@ -22,10 +22,7 @@ class OPENVINO_API MOE : public ov::op::Op { MOE() = default; - enum class Expert_type { - GEMM3_SWIGLU, - GEMM2_BIAS_SWIGLU_CLAMP - }; + enum class Expert_type { 
GEMM3_SWIGLU, GEMM2_BIAS_SWIGLU_CLAMP }; struct Config { Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; @@ -38,12 +35,13 @@ class OPENVINO_API MOE : public ov::op::Op { /// 0: hidden_states - input tensor with hidden representations /// 1: router_topk_output_weights - normalized weights for selected experts (input to final multiplication) /// 2: router_topk_output_indices - indices of selected top-k experts - /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or [num_experts, hidden_size, 2 * inter_size] if fused - /// 4: w0_bias (optional) - expert bias for first projection, shape [num_experts, ...] or empty tensor if not needed - /// 5: w1_weight - expert weights for second projection, shape [num_experts, inter_size, hidden_size] - /// 6: w1_bias (optional) - expert bias for second projection, shape [num_experts, ...] or empty tensor if not needed - /// 7: w2_weight - expert weights for final projection, shape [num_experts, hidden_size, inter_size] - /// 8: w2_bias (optional/redundant) - expert bias for final projection, usually not required + /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or + /// [num_experts, hidden_size, 2 * inter_size] if fused 4: w0_bias (optional) - expert bias for first projection, + /// shape [num_experts, ...] or empty tensor if not needed 5: w1_weight - expert weights for second projection, + /// shape [num_experts, inter_size, hidden_size] 6: w1_bias (optional) - expert bias for second projection, shape + /// [num_experts, ...] 
or empty tensor if not needed 7: w2_weight - expert weights for final projection, shape + /// [num_experts, hidden_size, inter_size] 8: w2_bias (optional/redundant) - expert bias for final projection, + /// usually not required /// \param config Configuration for the MOE operation MOE(const OutputVector& args, const Config& config); @@ -59,3 +57,19 @@ class OPENVINO_API MOE : public ov::op::Op { }; } // namespace ov::op::v16 + +namespace ov { +OPENVINO_API +std::ostream& operator<<(std::ostream& s, const ov::op::v16::MOE::Expert_type& type); + +template <> +class OPENVINO_API + AttributeAdapter : public EnumAttributeAdapterBase { +public: + AttributeAdapter(ov::op::v16::MOE::Expert_type& value) + : EnumAttributeAdapterBase(value) {} + + OPENVINO_RTTI("AttributeAdapter"); + ~AttributeAdapter() override = default; +}; +} // namespace ov diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp index 57c2bd67d94317..d63302128b4c40 100644 --- a/src/core/src/op/moe.cpp +++ b/src/core/src/op/moe.cpp @@ -39,9 +39,7 @@ void MOE::validate_and_infer_types() { bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { OV_OP_SCOPE(v16_MOE_visit_attributes); - // visitor.on_attribute("expert_type", m_config.expert_type); - // TODO: Add adapter - + visitor.on_attribute("expert_type", m_config.expert_type); visitor.on_attribute("expert_alpha", m_config.expert_alpha); visitor.on_attribute("expert_beta", m_config.expert_beta); @@ -50,4 +48,19 @@ bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { } // namespace v16 } // namespace op + +std::ostream& operator<<(std::ostream& s, const ov::op::v16::MOE::Expert_type& type) { + return s << as_string(type); +} + +template <> +OPENVINO_API EnumNames& EnumNames::get() { + static auto enum_names = EnumNames( + "ov::op::v16::MOE::Expert_type", + { + {"gemm2_bias_swiglu_clamp", ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}, + {"gemm2_bias_gelu", ov::op::v16::MOE::Expert_type::GEMM3_SWIGLU}, + }); + return enum_names; 
+} } // namespace ov From 6df368c47ed2621eff7fbcf705c2471870f9a533 Mon Sep 17 00:00:00 2001 From: mitruska Date: Tue, 23 Sep 2025 23:48:53 +0000 Subject: [PATCH 06/19] Fuse Multiply output before Reshape --- .../common_optimizations/matmul_experts_fusion.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 6ec241fd025712..fba0fa35d6e115 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -69,13 +69,14 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { matcher_pass_callback callback = [=](pattern::Matcher& m) { auto& pm = m.get_pattern_value_map(); - auto experts_input_node = pm.at(tile).get_node()->input_value(0); + auto experts_input_node = pm.at(experts_input).get_node()->input_value(0); + auto routing_weights_node = pm.at(unsqueeze_routing_weights).get_node_shared_ptr(); auto gate_up_weight = pm.at(gate_up_matmul).get_node()->input_value(1).get_node_shared_ptr(); auto gate_up_bias_node = pm.at(gate_up_add).get_node()->input_value(1).get_node_shared_ptr(); auto down_proj_weight = pm.at(down_proj_matmul).get_node()->input_value(1).get_node_shared_ptr(); auto down_proj_bias_node = pm.at(down_proj_add).get_node()->input_value(1).get_node_shared_ptr(); - auto topk_indices_node = pm.at(router_topk_indices).get_node_shared_ptr(); + auto topk_indices_node = pm.at(scatter_elements_update).get_node()->input_value(1); ov::OutputVector moe_inputs = {experts_input_node, routing_weights_node, From c6448d3a6d12098943754c07aa3866651a3c5e70 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 00:07:02 +0000 Subject: [PATCH 07/19] MOE fusion unit test --- .../fuse_vectorized_moe_test.cpp | 253 
++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp diff --git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp new file mode 100644 index 00000000000000..f234e4c0ffc069 --- /dev/null +++ b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -0,0 +1,253 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/graph_comparator.hpp" +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/core/node_vector.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/scatter_elements_update.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/runtime/core.hpp" +#include "ov_ops/type_relaxed.hpp" +#include "transformations/common_optimizations/matmul_experts_fusion.hpp" +#include "transformations/utils/gen_pattern.hpp" + +inline std::shared_ptr build_moe_pattern_model() { + using namespace ov; + + const size_t batch = 2; + const Dimension in_dim = Dimension::dynamic(); + const size_t hidden_size = 2048; + const size_t intermediate_size = 4096; + const size_t topk = 2; + const size_t number_of_experts = 3; + const size_t fusion_factor = 2; + const auto expert_alpha = 1.702f; + const auto expert_beta = 7.0f; + + auto input_shape = PartialShape{batch, in_dim, hidden_size}; + auto input = std::make_shared(element::f32, input_shape); + auto experts_reshape = std::make_shared( + input, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{-1, 
hidden_size}), + false); + + auto tile = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{number_of_experts, 1})); + auto after_tile_reshape = std::make_shared( + tile, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, hidden_size}), + false); + + auto gate_up_matmul = std::make_shared( + after_tile_reshape, + op::v0::Constant::create(element::f32, + Shape{number_of_experts, hidden_size, intermediate_size * fusion_factor}, + {1.0f})); + auto gate_up_add = std::make_shared( + gate_up_matmul, + op::v0::Constant::create(element::f32, Shape{number_of_experts, 1, intermediate_size * fusion_factor}, {0.0f})); + + auto slice1 = std::make_shared( + gate_up_add, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{0, 0, 0}), + op::v0::Constant::create(element::i64, + Shape{3}, + std::vector{number_of_experts, batch, intermediate_size * 2}), + op::v0::Constant::create(element::i64, Shape{3}, std::vector{1, 1, 2}), + op::v0::Constant::create(element::i64, Shape{3}, std::vector{0, 1, 2})); + auto clamp = std::make_shared(slice1, -expert_beta, expert_beta); + auto add1 = std::make_shared(clamp, op::v0::Constant::create(element::f32, Shape{1}, {1.0f})); + + auto slice2 = std::make_shared( + gate_up_add, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{0, 1, 0}), + op::v0::Constant::create(element::i64, + Shape{3}, + std::vector{number_of_experts, batch, intermediate_size * 2}), + op::v0::Constant::create(element::i64, Shape{3}, std::vector{1, 1, 2}), + op::v0::Constant::create(element::i64, Shape{3}, std::vector{0, 1, 2})); + auto minimum1 = + std::make_shared(slice2, op::v0::Constant::create(element::f32, Shape{1}, {10.0f})); + auto swish_beta = op::v0::Constant::create(element::f32, Shape{}, std::vector{expert_alpha}); + auto swish = std::make_shared(minimum1, swish_beta); + + auto multiply2 = std::make_shared(add1, swish); + + auto down_proj_matmul = 
std::make_shared( + multiply2, + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f})); + + auto down_proj_add = std::make_shared( + down_proj_matmul, + op::v0::Constant::create(element::f32, Shape{number_of_experts, 1, hidden_size}, {1.0f})); + + auto end_reshape = std::make_shared( + down_proj_add, + op::v0::Constant::create(element::i64, + Shape{4}, + std::vector{number_of_experts, batch, -1, hidden_size}), + false); + + // Router subgraph used to test correctness of routing weights connection + auto reshape_2nd_consumer_router_matmul = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size}, {1.0f}), + false, + true); + + auto router_bias = + std::make_shared(reshape_2nd_consumer_router_matmul, + op::v0::Constant::create(element::f32, Shape{1, number_of_experts}, {1.0f})); + + auto router_topk_values_and_indices = + std::make_shared(router_bias, + op::v0::Constant::create(element::i64, Shape{}, {topk}), + -1, + op::v11::TopK::Mode::MAX, + op::v11::TopK::SortType::SORT_VALUES, + element::i64); + + auto router_topk_values = router_topk_values_and_indices->output(0); + auto router_topk_indices = router_topk_values_and_indices->output(1); + + auto scatter_elements_update = std::make_shared( + router_topk_values, + router_topk_indices, + op::v0::Constant::create(element::f32, Shape{batch, topk}, {0}), + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + auto router_transpose = std::make_shared( + scatter_elements_update, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{1, 0})); + auto router_reshape = std::make_shared( + router_transpose, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), + true); + auto unsqueeze_routing_weights = + std::make_shared(router_reshape, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + + auto mul3 = std::make_shared(end_reshape, 
unsqueeze_routing_weights); + + // ReduceSum - final node of the MOE pattern to be fused + auto reduce_sum = + std::make_shared(mul3, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{0}), + true); + + return std::make_shared(ov::OutputVector{reduce_sum}, ov::ParameterVector{input}); +} + +inline std::shared_ptr build_fused_moe_reference_model() { + using namespace ov; + + const size_t batch = 2; + const Dimension in_dim = Dimension::dynamic(); + const size_t hidden_size = 2048; + const size_t intermediate_size = 4096; + const size_t topk = 2; + const size_t number_of_experts = 3; + const size_t fusion_factor = 2; + const auto expert_alpha = 1.702f; + const auto expert_beta = 7.0f; + + auto input_shape = PartialShape{batch, in_dim, hidden_size}; + auto input = std::make_shared(element::f32, input_shape); + + // Begin of Router subgraph (not fused, but valuable for testing) + auto experts_reshape = std::make_shared( + input, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{-1, hidden_size}), + false); + + auto reshape_2nd_consumer_router_matmul = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size}, {1.0f}), + false, + true); + + auto router_bias = + std::make_shared(reshape_2nd_consumer_router_matmul, + op::v0::Constant::create(element::f32, Shape{1, number_of_experts}, {1.0f})); + + auto router_topk_values_and_indices = + std::make_shared(router_bias, + op::v0::Constant::create(element::i64, Shape{}, {topk}), + -1, + op::v11::TopK::Mode::MAX, + op::v11::TopK::SortType::SORT_VALUES, + element::i64); + + auto router_topk_values = router_topk_values_and_indices->output(0); + auto router_topk_indices = router_topk_values_and_indices->output(1); + + auto scatter_elements_update = std::make_shared( + router_topk_values, + router_topk_indices, + op::v0::Constant::create(element::f32, Shape{batch, topk}, {0}), + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + 
auto router_transpose = std::make_shared( + scatter_elements_update, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{1, 0})); + auto router_reshape = std::make_shared( + router_transpose, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), + true); + auto unsqueeze_routing_weights = + std::make_shared(router_reshape, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + // End of Router subgraph + + // Expert MatMuls weights fused into MOE + auto w0_weight = op::v0::Constant::create(element::f32, + Shape{number_of_experts, hidden_size, intermediate_size * fusion_factor}, + {1.0f}); + auto w0_bias = + op::v0::Constant::create(element::f32, Shape{number_of_experts, 1, intermediate_size * fusion_factor}, {0.0f}); + auto w1_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}); + auto w1_bias = op::v0::Constant::create(element::f32, Shape{number_of_experts, 1, hidden_size}, {1.0f}); + + ov::OutputVector moe_inputs = + {input, unsqueeze_routing_weights, router_topk_indices, w0_weight, w0_bias, w1_weight, w1_bias}; + + ov::op::v16::MOE::Config config; + config.expert_type = ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; + config.expert_alpha = expert_alpha; + config.expert_beta = expert_beta; + + auto moe = std::make_shared(moe_inputs, config); + return std::make_shared(ov::OutputVector{moe}, ov::ParameterVector{input}); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE_basic) { + model = build_moe_pattern_model(); + manager.register_pass(); + model_ref = build_fused_moe_reference_model(); +} From f5c1c4187947fb2605e481e8c54a66484898b081 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 00:09:11 +0000 Subject: [PATCH 08/19] Add missing header --- .../common_optimizations/matmul_experts_fusion.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index fba0fa35d6e115..dbc35413bbf0ff 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -1,3 +1,7 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + #include "transformations/common_optimizations/matmul_experts_fusion.hpp" #include "itt.hpp" From 762bc9ae2490ad3bf00130d025f7a170d8a70097 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 01:20:38 +0000 Subject: [PATCH 09/19] Move MOE op to internal --- .../matmul_experts_fusion.cpp | 6 ++-- .../fuse_vectorized_moe_test.cpp | 6 ++-- .../{include => dev_api}/openvino/op/moe.hpp | 34 +++++++++++-------- src/core/include/openvino/op/ops.hpp | 1 - .../include/openvino/opsets/opset16_tbl.hpp | 1 - src/core/src/op/moe.cpp | 23 ++++++------- 6 files changed, 36 insertions(+), 35 deletions(-) rename src/core/{include => dev_api}/openvino/op/moe.hpp (62%) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index dbc35413bbf0ff..9cc74aff6f7dc1 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -90,7 +90,7 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { down_proj_weight, down_proj_bias_node}; - ov::op::v16::MOE::Config config; + ov::op::internal::MOE::Config config; // Extract expert_alpha from Swish beta attribute auto swish_beta_const = ov::as_type_ptr(pm.at(swish_beta).get_node_shared_ptr()); @@ -103,9 +103,9 @@ 
ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { } // Set expert_type - config.expert_type = ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; + config.expert_type = ov::op::internal::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; - auto moe = std::make_shared(moe_inputs, config); + auto moe = std::make_shared(moe_inputs, config); moe->set_friendly_name(m.get_match_root()->get_friendly_name()); ov::copy_runtime_info(m.get_matched_nodes(), moe); ov::replace_node(m.get_match_root(), moe); diff --git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp index f234e4c0ffc069..105d41efe3a6cb 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp +++ b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -237,12 +237,12 @@ inline std::shared_ptr build_fused_moe_reference_model() { ov::OutputVector moe_inputs = {input, unsqueeze_routing_weights, router_topk_indices, w0_weight, w0_bias, w1_weight, w1_bias}; - ov::op::v16::MOE::Config config; - config.expert_type = ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; + ov::op::internal::MOE::Config config; + config.expert_type = ov::op::internal::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; config.expert_alpha = expert_alpha; config.expert_beta = expert_beta; - auto moe = std::make_shared(moe_inputs, config); + auto moe = std::make_shared(moe_inputs, config); return std::make_shared(ov::OutputVector{moe}, ov::ParameterVector{input}); } diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/dev_api/openvino/op/moe.hpp similarity index 62% rename from src/core/include/openvino/op/moe.hpp rename to src/core/dev_api/openvino/op/moe.hpp index e61b2c686f758b..4a9e78b975a66d 100644 --- a/src/core/include/openvino/op/moe.hpp +++ b/src/core/dev_api/openvino/op/moe.hpp @@ -12,13 +12,13 @@ #include "openvino/op/constant.hpp" #include 
"openvino/op/op.hpp" -namespace ov::op::v16 { +namespace ov::op::internal { /// /// \brief MOE experts /// \ingroup ov_ops_cpp_api class OPENVINO_API MOE : public ov::op::Op { public: - OPENVINO_OP("MOE", "opset16"); + OPENVINO_OP("MOE"); MOE() = default; @@ -36,12 +36,16 @@ class OPENVINO_API MOE : public ov::op::Op { /// 1: router_topk_output_weights - normalized weights for selected experts (input to final multiplication) /// 2: router_topk_output_indices - indices of selected top-k experts /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or - /// [num_experts, hidden_size, 2 * inter_size] if fused 4: w0_bias (optional) - expert bias for first projection, - /// shape [num_experts, ...] or empty tensor if not needed 5: w1_weight - expert weights for second projection, - /// shape [num_experts, inter_size, hidden_size] 6: w1_bias (optional) - expert bias for second projection, shape - /// [num_experts, ...] or empty tensor if not needed 7: w2_weight - expert weights for final projection, shape - /// [num_experts, hidden_size, inter_size] 8: w2_bias (optional/redundant) - expert bias for final projection, - /// usually not required + /// [num_experts, hidden_size, 2 * inter_size] if fused + /// 4: w0_bias (optional) - expert bias for first projection, + /// shape [num_experts, ...] or empty tensor if not needed + /// 5: w1_weight - expert weights for second projection, + /// shape [num_experts, inter_size, hidden_size] + /// 6: w1_bias (optional) - expert bias for second projection, shape + /// [num_experts, ...] 
or empty tensor if not needed + /// 7: w2_weight - expert weights for final projection, shape + /// [num_experts, hidden_size, inter_size] + /// 8: w2_bias (optional) - expert bias for final projection /// \param config Configuration for the MOE operation MOE(const OutputVector& args, const Config& config); @@ -56,20 +60,20 @@ class OPENVINO_API MOE : public ov::op::Op { Config m_config; }; -} // namespace ov::op::v16 +} // namespace ov::op::internal namespace ov { OPENVINO_API -std::ostream& operator<<(std::ostream& s, const ov::op::v16::MOE::Expert_type& type); +std::ostream& operator<<(std::ostream& s, const ov::op::internal::MOE::Expert_type& type); template <> -class OPENVINO_API - AttributeAdapter : public EnumAttributeAdapterBase { +class OPENVINO_API AttributeAdapter + : public EnumAttributeAdapterBase { public: - AttributeAdapter(ov::op::v16::MOE::Expert_type& value) - : EnumAttributeAdapterBase(value) {} + AttributeAdapter(ov::op::internal::MOE::Expert_type& value) + : EnumAttributeAdapterBase(value) {} - OPENVINO_RTTI("AttributeAdapter"); + OPENVINO_RTTI("AttributeAdapter"); ~AttributeAdapter() override = default; }; } // namespace ov diff --git a/src/core/include/openvino/op/ops.hpp b/src/core/include/openvino/op/ops.hpp index a4150c218b1c73..dcb5fc0385ecde 100644 --- a/src/core/include/openvino/op/ops.hpp +++ b/src/core/include/openvino/op/ops.hpp @@ -119,7 +119,6 @@ #include "openvino/op/minimum.hpp" #include "openvino/op/mish.hpp" #include "openvino/op/mod.hpp" -#include "openvino/op/moe.hpp" #include "openvino/op/multiclass_nms.hpp" #include "openvino/op/multinomial.hpp" #include "openvino/op/multiply.hpp" diff --git a/src/core/include/openvino/opsets/opset16_tbl.hpp b/src/core/include/openvino/opsets/opset16_tbl.hpp index 98a3dcbd1912b4..6d01ab1d13cb22 100644 --- a/src/core/include/openvino/opsets/opset16_tbl.hpp +++ b/src/core/include/openvino/opsets/opset16_tbl.hpp @@ -16,7 +16,6 @@ _OPENVINO_OP_REG(ShapeOf, ov::op::v3) // New operations added in 
opset16 _OPENVINO_OP_REG(Identity, ov::op::v16) _OPENVINO_OP_REG(ISTFT, ov::op::v16) -_OPENVINO_OP_REG(MOE, ov::op::v16) _OPENVINO_OP_REG(SegmentMax, ov::op::v16) _OPENVINO_OP_REG(SparseFillEmptyRows, ov::op::v16) _OPENVINO_OP_REG(AvgPool, ov::op::v16) diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp index d63302128b4c40..6d28bf1bc52d6c 100644 --- a/src/core/src/op/moe.cpp +++ b/src/core/src/op/moe.cpp @@ -8,7 +8,7 @@ namespace ov { namespace op { -namespace v16 { +namespace internal { MOE::MOE(const OutputVector& args, const Config& config) : Op(args), m_config(config) { constructor_validate_and_infer_types(); @@ -23,22 +23,21 @@ void MOE::set_config(const Config& config) { } std::shared_ptr MOE::clone_with_new_inputs(const ov::OutputVector& new_args) const { - OV_OP_SCOPE(v16_MOE_clone_with_new_inputs); + OV_OP_SCOPE(internal_MOE_clone_with_new_inputs); check_new_args_count(this, new_args); return std::make_shared(new_args, m_config); } void MOE::validate_and_infer_types() { - OV_OP_SCOPE(v16_MOE_validate_and_infer_types); + OV_OP_SCOPE(internal_MOE_validate_and_infer_types); // TODO: Add inputs validation set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); } bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { - OV_OP_SCOPE(v16_MOE_visit_attributes); - + OV_OP_SCOPE(internal_MOE_visit_attributes); visitor.on_attribute("expert_type", m_config.expert_type); visitor.on_attribute("expert_alpha", m_config.expert_alpha); visitor.on_attribute("expert_beta", m_config.expert_beta); @@ -46,20 +45,20 @@ bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { return true; } -} // namespace v16 +} // namespace internal } // namespace op -std::ostream& operator<<(std::ostream& s, const ov::op::v16::MOE::Expert_type& type) { +std::ostream& operator<<(std::ostream& s, const ov::op::internal::MOE::Expert_type& type) { return s << as_string(type); } template <> -OPENVINO_API EnumNames& EnumNames::get() { - static auto enum_names = 
EnumNames( - "ov::op::v16::MOE::Expert_type", +OPENVINO_API EnumNames& EnumNames::get() { + static auto enum_names = EnumNames( + "ov::op::internal::MOE::Expert_type", { - {"gemm2_bias_swiglu_clamp", ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}, - {"gemm2_bias_gelu", ov::op::v16::MOE::Expert_type::GEMM3_SWIGLU}, + {"gemm2_bias_swiglu_clamp", ov::op::internal::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}, + {"gemm3_swiglu", ov::op::internal::MOE::Expert_type::GEMM3_SWIGLU}, }); return enum_names; } From 41145cf52a06ac28ef593500129653f0372438fb Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 01:21:31 +0000 Subject: [PATCH 10/19] Apply MOE transformation for CPU --- .../intel_cpu/src/transformations/transformation_pipeline.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 201f11b8587e68..7812cfbe02da1e 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -69,6 +69,7 @@ #include "transformations/common_optimizations/mark_precision_sensitive_shapeof_subgraphs.hpp" #include "transformations/common_optimizations/mark_rope_input_to_keep_in_mixed_precision.hpp" #include "transformations/common_optimizations/matmul_const_transposes_extraction.hpp" +#include "transformations/common_optimizations/matmul_experts_fusion.hpp" #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp" #include "transformations/common_optimizations/mul_fake_quantize_fusion.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -555,7 +556,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis }); }, ov::pass::KeepConstAndDecompression); - + CPU_REGISTER_PASS_COMMON(manager, ov::pass::FuseVectorizedMOE); CPU_REGISTER_PASS_COMMON(manager, 
ov::pass::AUGRUCellFusion); CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion); ov::pass::ConvertPagedAttnInputs::KVCacheConfig cacheConfig; From 5a346843dc56a7228314ebeff5c454db32e81f4e Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 01:22:43 +0000 Subject: [PATCH 11/19] Revert CPU transformation pipeline change --- .../intel_cpu/src/transformations/transformation_pipeline.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 7812cfbe02da1e..432a934431dbc7 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -69,7 +69,6 @@ #include "transformations/common_optimizations/mark_precision_sensitive_shapeof_subgraphs.hpp" #include "transformations/common_optimizations/mark_rope_input_to_keep_in_mixed_precision.hpp" #include "transformations/common_optimizations/matmul_const_transposes_extraction.hpp" -#include "transformations/common_optimizations/matmul_experts_fusion.hpp" #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp" #include "transformations/common_optimizations/mul_fake_quantize_fusion.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -556,7 +555,6 @@ void Transformations::PreLpt(const std::vector& defaultPrecis }); }, ov::pass::KeepConstAndDecompression); - CPU_REGISTER_PASS_COMMON(manager, ov::pass::FuseVectorizedMOE); CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion); CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion); ov::pass::ConvertPagedAttnInputs::KVCacheConfig cacheConfig; From b46f9608186509d787432aaaa24d87af7d0b83d8 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 13:44:34 +0000 Subject: [PATCH 12/19] Fix cast warning --- .../common_optimizations/matmul_experts_fusion.cpp | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 9cc74aff6f7dc1..2251ac6c01ee48 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -99,7 +99,7 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { // Extract expert_beta from Clamp max attribute if (auto clamp_op = ov::as_type_ptr(pm.at(clamp).get_node_shared_ptr())) { - config.expert_beta = clamp_op->get_max(); + config.expert_beta = static_cast(clamp_op->get_max()); } // Set expert_type From e343cd8b4419b5750521d6b7346a0a9f4a5e79a3 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 15:07:35 +0000 Subject: [PATCH 13/19] Remove OPENVINO_API macros --- src/core/dev_api/openvino/op/moe.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core/dev_api/openvino/op/moe.hpp b/src/core/dev_api/openvino/op/moe.hpp index 4a9e78b975a66d..0a8a90be7e2aa1 100644 --- a/src/core/dev_api/openvino/op/moe.hpp +++ b/src/core/dev_api/openvino/op/moe.hpp @@ -15,7 +15,6 @@ namespace ov::op::internal { /// /// \brief MOE experts -/// \ingroup ov_ops_cpp_api class OPENVINO_API MOE : public ov::op::Op { public: OPENVINO_OP("MOE"); @@ -63,11 +62,10 @@ class OPENVINO_API MOE : public ov::op::Op { } // namespace ov::op::internal namespace ov { -OPENVINO_API std::ostream& operator<<(std::ostream& s, const ov::op::internal::MOE::Expert_type& type); template <> -class OPENVINO_API AttributeAdapter +class AttributeAdapter : public EnumAttributeAdapterBase { public: AttributeAdapter(ov::op::internal::MOE::Expert_type& value) From 0406105e0b03cc1ddd6d638030fef188d54be6f6 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 15:52:05 +0000 Subject: [PATCH 14/19] 
Update input desc --- src/core/dev_api/openvino/op/moe.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/core/dev_api/openvino/op/moe.hpp b/src/core/dev_api/openvino/op/moe.hpp index 0a8a90be7e2aa1..cf0eb992da83cd 100644 --- a/src/core/dev_api/openvino/op/moe.hpp +++ b/src/core/dev_api/openvino/op/moe.hpp @@ -21,7 +21,7 @@ class OPENVINO_API MOE : public ov::op::Op { MOE() = default; - enum class Expert_type { GEMM3_SWIGLU, GEMM2_BIAS_SWIGLU_CLAMP }; + enum class Expert_type { GEMM2_BIAS_SWIGLU_CLAMP, GEMM3_SWIGLU }; struct Config { Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; @@ -32,8 +32,9 @@ class OPENVINO_API MOE : public ov::op::Op { /// \brief Constructs a MOE operation with config only /// \param args The input tensors, in the following order: /// 0: hidden_states - input tensor with hidden representations - /// 1: router_topk_output_weights - normalized weights for selected experts (input to final multiplication) - /// 2: router_topk_output_indices - indices of selected top-k experts + /// 1: routing_weights - [num_experts, ...] 
normalized weights for selected experts + /// (input to final multiplication) + /// 2: router_topk_output_indices - [..., topk] indices of selected top-k experts /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or /// [num_experts, hidden_size, 2 * inter_size] if fused /// 4: w0_bias (optional) - expert bias for first projection, From eaede0de925187e6d4b942426ce41b004fcdd511 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 16:12:24 +0000 Subject: [PATCH 15/19] No keep dims in Reduce --- .../common_optimizations/matmul_experts_fusion.cpp | 2 +- .../tests/common_optimizations/fuse_vectorized_moe_test.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 2251ac6c01ee48..685c579e2a7d91 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -67,7 +67,7 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto unsqueeze_routing_weights = pattern::wrap_type({router_reshape, pattern::any_input()}); auto mul3 = pattern::wrap_type({end_reshape, unsqueeze_routing_weights}); - auto reduce_sum = pattern::wrap_type({mul3, pattern::any_input()}); + auto reduce_sum = pattern::wrap_type({mul3, pattern::any_input()}, {{"keep_dims", false}}); auto moe_pattern = reduce_sum; matcher_pass_callback callback = [=](pattern::Matcher& m) { diff --git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp index 105d41efe3a6cb..a3ee9f6de550e6 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp +++ 
b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -159,7 +159,7 @@ inline std::shared_ptr build_moe_pattern_model() { auto reduce_sum = std::make_shared(mul3, op::v0::Constant::create(element::i64, Shape{1}, std::vector{0}), - true); + false); return std::make_shared(ov::OutputVector{reduce_sum}, ov::ParameterVector{input}); } From 9ce1569eb8cc2276352c6598dcdf4be7e4b96afe Mon Sep 17 00:00:00 2001 From: mitruska Date: Tue, 30 Sep 2025 10:15:07 +0000 Subject: [PATCH 16/19] Add transpose attrs to MatMul patterns --- .../common_optimizations/matmul_experts_fusion.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 685c579e2a7d91..0c6862309fb25b 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -33,7 +33,8 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto experts_input = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); auto tile = pattern::wrap_type({experts_input, pattern::any_input()}); auto after_tile_reshape = pattern::wrap_type({tile, pattern::any_input()}); - auto gate_up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}); + auto gate_up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); auto gate_up_add = pattern::wrap_type({gate_up_matmul, pattern::any_input()}); // Branch 1: Slice_1 -> Clamp -> Add_1 @@ -53,7 +54,8 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto multiply2 = pattern::wrap_type({add1, swish}); // Down projection - auto down_proj_matmul = pattern::wrap_type({multiply2, pattern::any_input()}); + auto 
down_proj_matmul = pattern::wrap_type({multiply2, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); auto down_proj_add = pattern::wrap_type({down_proj_matmul, pattern::wrap_const()}); auto end_reshape = pattern::wrap_type({down_proj_add, pattern::any_input()}); From 90f31a2d422cd531a1b5aca225ade1235ecfff74 Mon Sep 17 00:00:00 2001 From: mitruska Date: Tue, 30 Sep 2025 11:34:53 +0000 Subject: [PATCH 17/19] Switch beta with alpha to match the beta for swish naming --- .../common_optimizations/matmul_experts_fusion.cpp | 8 ++++---- src/core/dev_api/openvino/op/moe.hpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 0c6862309fb25b..2f8ccc6df706f9 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -94,14 +94,14 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { ov::op::internal::MOE::Config config; - // Extract expert_alpha from Swish beta attribute + // Extract expert_beta from Swish beta attribute auto swish_beta_const = ov::as_type_ptr(pm.at(swish_beta).get_node_shared_ptr()); auto swish_beta_const_val = swish_beta_const->cast_vector()[0]; - config.expert_alpha = swish_beta_const_val; + config.expert_beta = swish_beta_const_val; - // Extract expert_beta from Clamp max attribute + // Extract expert_alpha from Clamp max attribute if (auto clamp_op = ov::as_type_ptr(pm.at(clamp).get_node_shared_ptr())) { - config.expert_beta = static_cast(clamp_op->get_max()); + config.expert_alpha = static_cast(clamp_op->get_max()); } // Set expert_type diff --git a/src/core/dev_api/openvino/op/moe.hpp b/src/core/dev_api/openvino/op/moe.hpp index cf0eb992da83cd..5147f15fa8b184 100644 
--- a/src/core/dev_api/openvino/op/moe.hpp +++ b/src/core/dev_api/openvino/op/moe.hpp @@ -25,8 +25,8 @@ class OPENVINO_API MOE : public ov::op::Op { struct Config { Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; - float expert_alpha{1.0f}; // Expert attribute, e.g. sigmoid alpha - float expert_beta{0.0f}; // Expert attribute, e.g. clamp limit + float expert_alpha{0.0f}; // Expert attribute for clamp bounds + float expert_beta{1.0f}; // Expert attribute for swish beta }; /// \brief Constructs a MOE operation with config only From 3a968558a38e383be25f2b7bedcaa6b206300d18 Mon Sep 17 00:00:00 2001 From: mitruska Date: Thu, 2 Oct 2025 15:10:12 +0000 Subject: [PATCH 18/19] Add fusion transformation for the second expert_type (GEMM3) --- .../matmul_experts_fusion.hpp | 26 ++- .../matmul_experts_fusion.cpp | 81 ++++++- .../fuse_vectorized_moe_test.cpp | 216 +++++++++++++++++- 3 files changed, 308 insertions(+), 15 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp index 27eac10769899b..482695ff3ce9ae 100644 --- a/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp @@ -4,19 +4,37 @@ #pragma once +#include "openvino/pass/graph_rewrite.hpp" #include "openvino/pass/matcher_pass.hpp" #include "transformations_visibility.hpp" namespace ov { namespace pass { -class TRANSFORMATIONS_API FuseVectorizedMOE; +class TRANSFORMATIONS_API FuseVectorizedMOE2GEMM; +class TRANSFORMATIONS_API FuseVectorizedMOE3GEMM; +class TRANSFORMATIONS_API VectorizedExpertsFusion; } // namespace pass } // namespace ov -class ov::pass::FuseVectorizedMOE : public ov::pass::MatcherPass { +class ov::pass::FuseVectorizedMOE2GEMM : public ov::pass::MatcherPass { public: - 
OPENVINO_MATCHER_PASS_RTTI("FuseVectorizedMOE"); - FuseVectorizedMOE(); + OPENVINO_MATCHER_PASS_RTTI("FuseVectorizedMOE2GEMM"); + FuseVectorizedMOE2GEMM(); +}; + +class ov::pass::FuseVectorizedMOE3GEMM : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("FuseVectorizedMOE3GEMM"); + FuseVectorizedMOE3GEMM(); +}; + +class ov::pass::VectorizedExpertsFusion : public ov::pass::GraphRewrite { +public: + OPENVINO_GRAPH_REWRITE_RTTI("VectorizedExpertsFusion"); + VectorizedExpertsFusion() { + add_matcher(); + add_matcher(); + } }; diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 2f8ccc6df706f9..27baeaea4e6a07 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -27,8 +27,8 @@ #include "transformations/utils/utils.hpp" using namespace ov::pass; -ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { - MATCHER_SCOPE(FuseVectorizedMOE); +ov::pass::FuseVectorizedMOE2GEMM::FuseVectorizedMOE2GEMM() { + MATCHER_SCOPE(FuseVectorizedMOE2GEMM); auto experts_input = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); auto tile = pattern::wrap_type({experts_input, pattern::any_input()}); @@ -119,3 +119,80 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto matcher = std::make_shared(moe_pattern, matcher_name); this->register_matcher(matcher, callback); } + +ov::pass::FuseVectorizedMOE3GEMM::FuseVectorizedMOE3GEMM() { + MATCHER_SCOPE(FuseVectorizedMOE3GEMM); + + auto experts_input = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); + auto tile = pattern::wrap_type({experts_input, pattern::any_input()}); + auto after_tile_reshape = pattern::wrap_type({tile, pattern::any_input()}); + + // First GEMM (activation gate) + auto 
gate_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); + auto swish = pattern::wrap_type({gate_matmul}); + // Second GEMM (up_projection) + auto up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); + // Join: Multiply (SwiGLU) + auto swiglu = pattern::wrap_type({swish, up_matmul}); + + // Third GEMM (down_projection) + auto down_matmul = pattern::wrap_type({swiglu, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); + auto end_reshape = pattern::wrap_type({down_matmul, pattern::any_input()}); + + // Routing weights/mask + auto router_topk_indices = pattern::any_input(); + auto scatter_elements_update = pattern::wrap_type( + {pattern::any_input(), router_topk_indices, pattern::any_input(), pattern::any_input()}); + auto router_transpose = pattern::wrap_type({scatter_elements_update, pattern::any_input()}); + auto router_reshape = pattern::wrap_type({router_transpose, pattern::any_input()}); + auto unsqueeze_routing_weights = pattern::wrap_type({router_reshape, pattern::any_input()}); + + auto mul3 = pattern::wrap_type({end_reshape, unsqueeze_routing_weights}); + auto reduce_sum = pattern::wrap_type({mul3, pattern::any_input()}, {{"keep_dims", false}}); + auto moe_pattern = reduce_sum; + + matcher_pass_callback callback = [=](pattern::Matcher& m) { + auto& pm = m.get_pattern_value_map(); + auto experts_input_node = pm.at(experts_input).get_node()->input_value(0); + auto routing_weights_node = pm.at(unsqueeze_routing_weights).get_node_shared_ptr(); + auto gate_weight = pm.at(gate_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto up_weight = pm.at(up_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto down_weight = pm.at(down_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto topk_indices_node = pm.at(scatter_elements_update).get_node()->input_value(1); + + 
ov::OutputVector moe_inputs = { + experts_input_node, + routing_weights_node, + topk_indices_node, + gate_weight, + up_weight, + down_weight, + }; + + ov::op::internal::MOE::Config config; + config.expert_type = ov::op::internal::MOE::Expert_type::GEMM3_SWIGLU; + // Extract expert_beta if Swish has beta input provided + if (auto swish_op = ov::as_type_ptr(pm.at(swish).get_node_shared_ptr())) { + if (swish_op->get_input_size() > 1) { + if (auto swish_beta_const = + ov::as_type_ptr(swish_op->get_input_node_shared_ptr(1))) { + config.expert_beta = swish_beta_const->cast_vector()[0]; + } + } + } + + auto moe = std::make_shared(moe_inputs, config); + moe->set_friendly_name(m.get_match_root()->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), moe); + ov::replace_node(m.get_match_root(), moe); + + register_new_node(moe); + return true; + }; + + auto matcher = std::make_shared(moe_pattern, matcher_name); + this->register_matcher(matcher, callback); +} diff --git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp index a3ee9f6de550e6..037429c61c06ca 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp +++ b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -37,7 +37,7 @@ #include "transformations/common_optimizations/matmul_experts_fusion.hpp" #include "transformations/utils/gen_pattern.hpp" -inline std::shared_ptr build_moe_pattern_model() { +inline std::shared_ptr build_2gemm_moe_pattern_model() { using namespace ov; const size_t batch = 2; @@ -151,7 +151,7 @@ inline std::shared_ptr build_moe_pattern_model() { true); auto unsqueeze_routing_weights = std::make_shared(router_reshape, - op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + op::v0::Constant::create(element::i64, Shape{1}, std::vector{-1})); auto mul3 = std::make_shared(end_reshape, 
unsqueeze_routing_weights); @@ -164,7 +164,7 @@ inline std::shared_ptr build_moe_pattern_model() { return std::make_shared(ov::OutputVector{reduce_sum}, ov::ParameterVector{input}); } -inline std::shared_ptr build_fused_moe_reference_model() { +inline std::shared_ptr build_fused_2gemm_moe_reference_model() { using namespace ov; const size_t batch = 2; @@ -220,8 +220,7 @@ inline std::shared_ptr build_fused_moe_reference_model() { op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), true); auto unsqueeze_routing_weights = - std::make_shared(router_reshape, - op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + std::make_shared(router_reshape, op::v0::Constant::create(element::i64, Shape{1}, {-1})); // End of Router subgraph // Expert MatMuls weights fused into MOE @@ -246,8 +245,207 @@ inline std::shared_ptr build_fused_moe_reference_model() { return std::make_shared(ov::OutputVector{moe}, ov::ParameterVector{input}); } -TEST_F(TransformationTestsF, FuseVectorizedMOE_basic) { - model = build_moe_pattern_model(); - manager.register_pass(); - model_ref = build_fused_moe_reference_model(); +inline std::shared_ptr build_3gemm_moe_pattern_model() { + using namespace ov; + + const size_t batch = 2; + const Dimension in_dim = Dimension::dynamic(); + const size_t hidden_size = 2048; + const size_t intermediate_size = 4096; + const size_t number_of_experts = 3; + const size_t topk = 2; + + auto input_shape = PartialShape{batch, in_dim, hidden_size}; + auto input = std::make_shared(element::f32, input_shape); + auto experts_reshape = std::make_shared( + input, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{-1, hidden_size}), + false); + + auto tile = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{number_of_experts, 1})); + auto after_tile_reshape = std::make_shared( + tile, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, 
batch, hidden_size}), + false); + + // First GEMM (gate) + auto gate_matmul = std::make_shared( + after_tile_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), + false, + false); + + auto swish = std::make_shared(gate_matmul); + + // Second GEMM (up) + auto up_matmul = std::make_shared( + after_tile_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), + false, + false); + + auto swiglu = std::make_shared(swish, up_matmul); + + // Third GEMM (down) + auto down_matmul = std::make_shared( + swiglu, + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}), + false, + false); + + auto experts_out_reshape = std::make_shared( + down_matmul, + op::v0::Constant::create(element::i64, + Shape{4}, + std::vector{number_of_experts, batch, -1, hidden_size}), + false); + + // Router subgraph used to test correctness of routing weights connection + auto router_matmul = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size}, {1.0f}), + false, + true); + + auto router_topk_values_and_indices = + std::make_shared(router_matmul, + op::v0::Constant::create(element::i64, Shape{}, {topk}), + -1, + op::v11::TopK::Mode::MAX, + op::v11::TopK::SortType::SORT_VALUES, + element::i64); + + auto router_topk_values = router_topk_values_and_indices->output(0); + auto router_topk_indices = router_topk_values_and_indices->output(1); + + auto scatter_elements_update = std::make_shared( + router_topk_values, + router_topk_indices, + op::v0::Constant::create(element::f32, Shape{batch, topk}, {0}), + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + auto router_transpose = std::make_shared( + scatter_elements_update, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{1, 0})); + auto router_reshape = std::make_shared( + router_transpose, 
+ op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), + true); + auto unsqueeze_routing_weights = + std::make_shared(router_reshape, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{-1})); + + auto mul3 = std::make_shared(experts_out_reshape, unsqueeze_routing_weights); + + // ReduceSum - final node of the MOE pattern to be fused + auto reduce_sum = + std::make_shared(mul3, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{0}), + false); + + return std::make_shared(ov::OutputVector{reduce_sum}, ov::ParameterVector{input}); } + +inline std::shared_ptr build_fused_3gemm_moe_reference_model() { + using namespace ov; + + const size_t batch = 2; + const Dimension in_dim = Dimension::dynamic(); + const size_t hidden_size = 2048; + const size_t intermediate_size = 4096; + const size_t number_of_experts = 3; + const size_t topk = 2; + + auto input = std::make_shared(element::f32, PartialShape{batch, in_dim, hidden_size}); + + // Begin of Router subgraph (not fused, but valuable for testing) + auto experts_reshape = std::make_shared( + input, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{-1, hidden_size}), + false); + + auto router_matmul = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size}, {1.0f}), + false, + true); + + auto router_topk = std::make_shared(router_matmul, + op::v0::Constant::create(element::i64, Shape{}, {topk}), + -1, + op::v11::TopK::Mode::MAX, + op::v11::TopK::SortType::SORT_VALUES, + element::i64); + + auto router_topk_values = router_topk->output(0); + auto router_topk_indices = router_topk->output(1); + + auto scatter_elements_update = std::make_shared( + router_topk_values, + router_topk_indices, + op::v0::Constant::create(element::f32, Shape{batch, topk}, {0}), + op::v0::Constant::create(element::i64, Shape{1}, {1})); + + auto router_transpose = + std::make_shared(scatter_elements_update, + 
op::v0::Constant::create(element::i64, Shape{2}, {1, 0})); + auto router_reshape = std::make_shared( + router_transpose, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), + true); + + auto unsqueeze_routing_weights = + std::make_shared(router_reshape, op::v0::Constant::create(element::i64, Shape{1}, {-1})); + + // MOE fused op + auto w0_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}); + auto w1_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}); + auto w2_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}); + + ov::OutputVector moe_inputs = + {input, unsqueeze_routing_weights, router_topk_indices, w0_weight, w1_weight, w2_weight}; + + ov::op::internal::MOE::Config config; + config.expert_type = ov::op::internal::MOE::Expert_type::GEMM3_SWIGLU; + + auto moe = std::make_shared(moe_inputs, config); + return std::make_shared(ov::OutputVector{moe}, ov::ParameterVector{input}); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE2GEMM_basic) { + model = build_2gemm_moe_pattern_model(); + manager.register_pass(); + model_ref = build_fused_2gemm_moe_reference_model(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE2GEMM_VectorizedExpertsFusion) { + model = build_2gemm_moe_pattern_model(); + manager.register_pass(); + model_ref = build_fused_2gemm_moe_reference_model(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE2GEMM_no_fusion) { + model = build_3gemm_moe_pattern_model(); + manager.register_pass(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE3GEMM_basic) { + model = build_3gemm_moe_pattern_model(); + manager.register_pass(); + model_ref = build_fused_3gemm_moe_reference_model(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE3GEMM_VectorizedExpertsFusion) { + model = build_3gemm_moe_pattern_model(); + 
manager.register_pass(); + model_ref = build_fused_3gemm_moe_reference_model(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE3GEMM_no_fusion) { + model = build_2gemm_moe_pattern_model(); + manager.register_pass(); +} \ No newline at end of file From df97c220e4c280435c983fe7b75a74c6d81364f0 Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 13 Oct 2025 10:04:18 +0000 Subject: [PATCH 19/19] Update GEMM3 transpose_b attr to be true --- .../matmul_experts_fusion.cpp | 6 +++--- .../fuse_vectorized_moe_test.cpp | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 27baeaea4e6a07..76bbbef9abf8e0 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -129,17 +129,17 @@ ov::pass::FuseVectorizedMOE3GEMM::FuseVectorizedMOE3GEMM() { // First GEMM (activation gate) auto gate_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, - {{"transpose_a", false}, {"transpose_b", false}}); + {{"transpose_a", false}, {"transpose_b", true}}); auto swish = pattern::wrap_type({gate_matmul}); // Second GEMM (up_projection) auto up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, - {{"transpose_a", false}, {"transpose_b", false}}); + {{"transpose_a", false}, {"transpose_b", true}}); // Join: Multiply (SwiGLU) auto swiglu = pattern::wrap_type({swish, up_matmul}); // Third GEMM (down_projection) auto down_matmul = pattern::wrap_type({swiglu, pattern::any_input()}, - {{"transpose_a", false}, {"transpose_b", false}}); + {{"transpose_a", false}, {"transpose_b", true}}); auto end_reshape = pattern::wrap_type({down_matmul, pattern::any_input()}); // Routing weights/mask diff 
--git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp index 037429c61c06ca..90fac722910d04 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp +++ b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -273,27 +273,27 @@ inline std::shared_ptr build_3gemm_moe_pattern_model() { // First GEMM (gate) auto gate_matmul = std::make_shared( after_tile_reshape, - op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}), false, - false); + true); auto swish = std::make_shared(gate_matmul); // Second GEMM (up) auto up_matmul = std::make_shared( after_tile_reshape, - op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}), false, - false); + true); auto swiglu = std::make_shared(swish, up_matmul); // Third GEMM (down) auto down_matmul = std::make_shared( swiglu, - op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}), + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), false, - false); + true); auto experts_out_reshape = std::make_shared( down_matmul, @@ -400,11 +400,11 @@ inline std::shared_ptr build_fused_3gemm_moe_reference_model() { // MOE fused op auto w0_weight = - op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}); + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}); auto w1_weight = - op::v0::Constant::create(element::f32, Shape{number_of_experts, 
hidden_size, intermediate_size}, {1.0f}); - auto w2_weight = op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}); + auto w2_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}); ov::OutputVector moe_inputs = {input, unsqueeze_routing_weights, router_topk_indices, w0_weight, w1_weight, w2_weight};