From dd3da1e3b9032f3ec6c59f47e03af94edd08372f Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 21:05:39 +0000 Subject: [PATCH 01/19] Vectorized MOE fusion init --- .../matmul_experts_fusion.hpp | 22 ++++ .../matmul_experts_fusion.cpp | 114 ++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp create mode 100644 src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp diff --git a/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp new file mode 100644 index 00000000000000..27eac10769899b --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API FuseVectorizedMOE; + +} // namespace pass +} // namespace ov + +class ov::pass::FuseVectorizedMOE : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("FuseVectorizedMOE"); + FuseVectorizedMOE(); +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp new file mode 100644 index 00000000000000..23e0277cea7a5e --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -0,0 +1,114 @@ +#include "transformations/common_optimizations/matmul_experts_fusion.hpp" + +#include "itt.hpp" +#include "openvino/core/graph_util.hpp" +#include "openvino/op/add.hpp" +#include 
"openvino/op/clamp.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/minimum.hpp" +#include "openvino/op/moe.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reduce_sum.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/scatter_elements_update.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/swish.hpp" +#include "openvino/op/tile.hpp" +#include "openvino/op/topk.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/pass/manager.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +using namespace ov::pass; +ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { + MATCHER_SCOPE(FuseVectorizedMOE); + + auto experts_input = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); + auto tile = pattern::wrap_type({experts_input, pattern::any_input()}); + auto after_tile_reshape = pattern::wrap_type({tile, pattern::any_input()}); + auto gate_up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}); + auto gate_up_add = pattern::wrap_type({gate_up_matmul, pattern::any_input()}); + + // Branch 1: Slice_1 -> Clamp -> Add_1 + auto slice1 = pattern::wrap_type( + {gate_up_add, pattern::any_input(), pattern::any_input(), pattern::any_input(), pattern::any_input()}); + auto clamp = pattern::wrap_type({slice1}); + auto add1 = pattern::wrap_type({clamp, pattern::wrap_const()}); + + // Branch 2: Slice_2 -> Minimum_1 -> Swish + auto slice2 = pattern::wrap_type( + {gate_up_add, pattern::any_input(), pattern::any_input(), pattern::any_input(), pattern::any_input()}); + auto minimum1 = pattern::wrap_type({slice2, pattern::wrap_const()}); + auto swish_beta = pattern::wrap_const(); + auto swish = pattern::wrap_type({minimum1, swish_beta}); + + // Join: Multiply_2 + auto multiply2 = pattern::wrap_type({add1, swish}); + + // Down projection + auto down_proj_matmul = 
pattern::wrap_type({multiply2, pattern::any_input()}); + auto down_proj_add = pattern::wrap_type({down_proj_matmul, pattern::wrap_const()}); + auto end_reshape = pattern::wrap_type({down_proj_add, pattern::any_input()}); + + // Routing weights/mask + auto router_topk_indices = pattern::any_input(); + auto scatter_elements_update = pattern::wrap_type( + {pattern::any_input(), router_topk_indices, pattern::any_input(), pattern::any_input()}); + + auto router_transpose = pattern::wrap_type({scatter_elements_update, pattern::any_input()}); + auto router_reshape = pattern::wrap_type({router_transpose, pattern::any_input()}); + auto unsqueeze_routing_weights = pattern::wrap_type({router_reshape, pattern::any_input()}); + + auto mul3 = pattern::wrap_type({end_reshape, unsqueeze_routing_weights}); + auto reduce_sum = pattern::wrap_type({mul3, pattern::any_input()}); + auto moe_pattern = reduce_sum; + + matcher_pass_callback callback = [=](pattern::Matcher& m) { + auto& pm = m.get_pattern_value_map(); + + auto experts_input_node = pm.at(tile).get_node()->input_value(0); + auto routing_weights_node = pm.at(unsqueeze_routing_weights).get_node_shared_ptr(); + auto gate_up_weight = pm.at(gate_up_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto gate_up_bias_node = pm.at(gate_up_add).get_node()->input_value(1).get_node_shared_ptr(); + auto down_proj_weight = pm.at(down_proj_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto down_proj_bias_node = pm.at(down_proj_add).get_node()->input_value(1).get_node_shared_ptr(); + auto topk_indices_node = pm.at(router_topk_indices).get_node_shared_ptr(); + + ov::OutputVector moe_inputs = {experts_input_node, + topk_indices_node, + routing_weights_node, + gate_up_weight, + gate_up_bias_node, + down_proj_weight, + down_proj_bias_node}; + + ov::op::v16::MOE::Config config; + + // Extract expert_alpha from Swish beta attribute + auto swish_beta_const = ov::as_type_ptr(pm.at(swish_beta).get_node_shared_ptr()); + auto 
swish_beta_const_val = swish_beta_const->cast_vector()[0]; + config.expert_alpha = swish_beta_const_val; + + // Extract expert_beta from Clamp max attribute + if (auto clamp_op = ov::as_type_ptr(pm.at(clamp).get_node_shared_ptr())) { + config.expert_beta = clamp_op->get_max(); + } + + // Set expert_type + config.expert_type = ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; + + auto moe = std::make_shared(moe_inputs, config); + moe->set_friendly_name(m.get_match_root()->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), moe); + ov::replace_node(m.get_match_root(), moe); + + register_new_node(moe); + return true; + }; + + auto matcher = std::make_shared(moe_pattern, matcher_name); + this->register_matcher(matcher, callback); +} From 7b9572ebeaece8081b7b17efca3ba8a16e13262b Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 21:09:26 +0000 Subject: [PATCH 02/19] MOE op init --- src/core/include/openvino/op/moe.hpp | 88 +++++++++++++++++ src/core/include/openvino/op/ops.hpp | 1 + .../include/openvino/opsets/opset16_tbl.hpp | 1 + src/core/src/op/moe.cpp | 96 +++++++++++++++++++ 4 files changed, 186 insertions(+) create mode 100644 src/core/include/openvino/op/moe.hpp create mode 100644 src/core/src/op/moe.cpp diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/include/openvino/op/moe.hpp new file mode 100644 index 00000000000000..4320cfd368ab95 --- /dev/null +++ b/src/core/include/openvino/op/moe.hpp @@ -0,0 +1,88 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "openvino/core/node.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/op.hpp" + +namespace ov::op::v16 { +/// +/// \brief MOE experts +/// \ingroup ov_ops_cpp_api +class OPENVINO_API MOE : public ov::op::Op { +public: + OPENVINO_OP("MOE", "opset16"); + + MOE() = default; + + enum class Expert_type { + GEMM3_SWIGLU, + 
GEMM2_BIAS_SWIGLU_CLAMP + }; + + struct Config { + size_t topk{}; + size_t expert_num{}; + size_t hidden_size{}; + size_t intermediate_size{}; + size_t group_size{}; // quantized group size, 0 for no group size. same for gate/up/down + ov::element::Type weight_type{}; // same for gate/up/down + ov::element::Type scale_type{}; // same for gate/up/down + ov::element::Type zp_type{}; // same for gate/up/down + + Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; + float expert_alpha{1.0f}; // Expert attribute, e.g. sigmoid alpha (gpt-oss: 1.702) + float expert_beta{0.0f}; // Expert attribute, e.g. clamp limit (gpt-oss: 7.0) + + bool operator==(const Config& rhs) const { + return std::tie(topk, + expert_num, + hidden_size, + intermediate_size, + group_size, + weight_type, + scale_type, + zp_type) == std::tie(rhs.topk, + rhs.expert_num, + rhs.hidden_size, + rhs.intermediate_size, + rhs.group_size, + rhs.weight_type, + rhs.scale_type, + rhs.zp_type); + } + }; + + /// \brief Constructs a MOE operation with config only + /// \param args The input tensors: [hidden_states, router_logits] followed by expert weights/scales/zps + /// \param config Configuration for the MOE operation + MOE(const OutputVector& args, const Config& config); + + const Config& get_config() const; + void set_config(const Config& config); + + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + /// \brief Get expert weight/scale/zp constant for a specific expert and weight type + /// \param expert_idx Index of the expert (0 to expert_num-1) + /// \param weight_type 0=gate, 1=up, 2=down + /// \param const_type 0=weight, 1=scale, 2=zp + /// \return Constant node or nullptr if not present + std::shared_ptr get_expert_const(size_t expert_idx, + size_t weight_type, + size_t const_type) const; + +private: + Config m_config; +}; + +} // namespace ov::op::v16 
diff --git a/src/core/include/openvino/op/ops.hpp b/src/core/include/openvino/op/ops.hpp index dcb5fc0385ecde..a4150c218b1c73 100644 --- a/src/core/include/openvino/op/ops.hpp +++ b/src/core/include/openvino/op/ops.hpp @@ -119,6 +119,7 @@ #include "openvino/op/minimum.hpp" #include "openvino/op/mish.hpp" #include "openvino/op/mod.hpp" +#include "openvino/op/moe.hpp" #include "openvino/op/multiclass_nms.hpp" #include "openvino/op/multinomial.hpp" #include "openvino/op/multiply.hpp" diff --git a/src/core/include/openvino/opsets/opset16_tbl.hpp b/src/core/include/openvino/opsets/opset16_tbl.hpp index 39d3d5d1d80889..a8312c9a09dc58 100644 --- a/src/core/include/openvino/opsets/opset16_tbl.hpp +++ b/src/core/include/openvino/opsets/opset16_tbl.hpp @@ -16,6 +16,7 @@ _OPENVINO_OP_REG(ShapeOf, ov::op::v3) // New operations added in opset16 _OPENVINO_OP_REG(Identity, ov::op::v16) _OPENVINO_OP_REG(ISTFT, ov::op::v16) +_OPENVINO_OP_REG(MOE, ov::op::v16) _OPENVINO_OP_REG(SegmentMax, ov::op::v16) _OPENVINO_OP_REG(SparseFillEmptyRows, ov::op::v16) _OPENVINO_OP_REG(AvgPool, ov::op::v16) diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp new file mode 100644 index 00000000000000..80ce3a4342455c --- /dev/null +++ b/src/core/src/op/moe.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/moe.hpp" + +#include "itt.hpp" + +namespace ov { +namespace op { +namespace v16 { + +MOE::MOE(const OutputVector& args, const Config& config) : Op(args), m_config(config) { + constructor_validate_and_infer_types(); +} + +const MOE::Config& MOE::get_config() const { + return m_config; +} + +void MOE::set_config(const Config& config) { + m_config = config; +} + +std::shared_ptr MOE::clone_with_new_inputs(const ov::OutputVector& new_args) const { + OV_OP_SCOPE(v16_MOE_clone_with_new_inputs); + check_new_args_count(this, new_args); + + return std::make_shared(new_args, m_config); +} + +void 
MOE::validate_and_infer_types() { + OV_OP_SCOPE(v16_MOE_validate_and_infer_types); + // At minimum we need 2 inputs: hidden_states and router_logits + OPENVINO_ASSERT(get_input_size() >= 2, "MOE must have at least 2 inputs whereas it has ", get_input_size()); + + // For now, just do basic validation. The input layout validation can be more flexible + // to allow incremental building during pattern matching + // Expected inputs: + // 0: hidden_states + // 1: router_logits + // 2+: expert constants (flexible layout during construction) + + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { + OV_OP_SCOPE(v16_MOE_visit_attributes); + visitor.start_structure("config"); + + visitor.on_attribute("topk", m_config.topk); + visitor.on_attribute("expert_num", m_config.expert_num); + visitor.on_attribute("hidden_size", m_config.hidden_size); + visitor.on_attribute("intermediate_size", m_config.intermediate_size); + visitor.on_attribute("group_size", m_config.group_size); + visitor.on_attribute("weight_type", m_config.weight_type); + visitor.on_attribute("scale_type", m_config.scale_type); + visitor.on_attribute("zp_type", m_config.zp_type); + visitor.finish_structure(); + + return true; +} + +std::shared_ptr MOE::get_expert_const(size_t expert_idx, size_t weight_type, size_t const_type) const { + OPENVINO_ASSERT(expert_idx < m_config.expert_num, "Expert index out of range"); + OPENVINO_ASSERT(weight_type < 3, "Weight type must be 0 (gate), 1 (up), or 2 (down)"); + OPENVINO_ASSERT(const_type < 3, "Const type must be 0 (weight), 1 (scale), or 2 (zp)"); + + // Calculate input index based on expert and weight/const type + // Input layout: [hidden_states, router_logits, expert0_gate_weight, expert0_gate_scale?, expert0_gate_zp?, + // expert0_up_weight, expert0_up_scale?, expert0_up_zp?, expert0_down_weight, expert0_down_scale?, expert0_down_zp?, ...] 
+ + size_t base_idx = 2; // Start after hidden_states and router_logits + + // For now, assume simple layout: weight, scale?, zp? for each of gate, up, down + size_t constants_per_weight_type = 1; // Just weights for now, will need to extend for scales/zps + if (m_config.scale_type != ov::element::dynamic) constants_per_weight_type++; + if (m_config.zp_type != ov::element::dynamic) constants_per_weight_type++; + + size_t constants_per_expert = 3 * constants_per_weight_type; // 3 weight types * constants per type + + size_t expert_base = base_idx + expert_idx * constants_per_expert; + size_t weight_base = expert_base + weight_type * constants_per_weight_type; + size_t input_idx = weight_base + const_type; + + if (input_idx >= get_input_size()) { + return nullptr; // Constant not provided (e.g., scale or zp for non-quantized weights) + } + + auto input_node = get_input_node_shared_ptr(input_idx); + return ov::as_type_ptr(input_node); +} + +} // namespace v16 +} // namespace op +} // namespace ov From 4a50118c69417756917ba9821f05510c666310e4 Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 21:47:02 +0000 Subject: [PATCH 03/19] MOE attrs/inputs adjust --- src/core/include/openvino/op/moe.hpp | 42 ++---------------------- src/core/src/op/moe.cpp | 49 ++++------------------------ 2 files changed, 10 insertions(+), 81 deletions(-) diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/include/openvino/op/moe.hpp index 4320cfd368ab95..4f3493bc546aa0 100644 --- a/src/core/include/openvino/op/moe.hpp +++ b/src/core/include/openvino/op/moe.hpp @@ -28,40 +28,13 @@ class OPENVINO_API MOE : public ov::op::Op { }; struct Config { - size_t topk{}; - size_t expert_num{}; - size_t hidden_size{}; - size_t intermediate_size{}; - size_t group_size{}; // quantized group size, 0 for no group size. 
same for gate/up/down - ov::element::Type weight_type{}; // same for gate/up/down - ov::element::Type scale_type{}; // same for gate/up/down - ov::element::Type zp_type{}; // same for gate/up/down - Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; - float expert_alpha{1.0f}; // Expert attribute, e.g. sigmoid alpha (gpt-oss: 1.702) - float expert_beta{0.0f}; // Expert attribute, e.g. clamp limit (gpt-oss: 7.0) - - bool operator==(const Config& rhs) const { - return std::tie(topk, - expert_num, - hidden_size, - intermediate_size, - group_size, - weight_type, - scale_type, - zp_type) == std::tie(rhs.topk, - rhs.expert_num, - rhs.hidden_size, - rhs.intermediate_size, - rhs.group_size, - rhs.weight_type, - rhs.scale_type, - rhs.zp_type); - } + float expert_alpha{1.0f}; // Expert attribute, e.g. sigmoid alpha + float expert_beta{0.0f}; // Expert attribute, e.g. clamp limit }; /// \brief Constructs a MOE operation with config only - /// \param args The input tensors: [hidden_states, router_logits] followed by expert weights/scales/zps + /// \param args The input tensors /// \param config Configuration for the MOE operation MOE(const OutputVector& args, const Config& config); @@ -72,15 +45,6 @@ class OPENVINO_API MOE : public ov::op::Op { void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - /// \brief Get expert weight/scale/zp constant for a specific expert and weight type - /// \param expert_idx Index of the expert (0 to expert_num-1) - /// \param weight_type 0=gate, 1=up, 2=down - /// \param const_type 0=weight, 1=scale, 2=zp - /// \return Constant node or nullptr if not present - std::shared_ptr get_expert_const(size_t expert_idx, - size_t weight_type, - size_t const_type) const; - private: Config m_config; }; diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp index 80ce3a4342455c..b31a585478ad08 100644 --- a/src/core/src/op/moe.cpp +++ b/src/core/src/op/moe.cpp @@ 
-46,49 +46,14 @@ void MOE::validate_and_infer_types() { bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { OV_OP_SCOPE(v16_MOE_visit_attributes); - visitor.start_structure("config"); - - visitor.on_attribute("topk", m_config.topk); - visitor.on_attribute("expert_num", m_config.expert_num); - visitor.on_attribute("hidden_size", m_config.hidden_size); - visitor.on_attribute("intermediate_size", m_config.intermediate_size); - visitor.on_attribute("group_size", m_config.group_size); - visitor.on_attribute("weight_type", m_config.weight_type); - visitor.on_attribute("scale_type", m_config.scale_type); - visitor.on_attribute("zp_type", m_config.zp_type); - visitor.finish_structure(); - - return true; -} -std::shared_ptr MOE::get_expert_const(size_t expert_idx, size_t weight_type, size_t const_type) const { - OPENVINO_ASSERT(expert_idx < m_config.expert_num, "Expert index out of range"); - OPENVINO_ASSERT(weight_type < 3, "Weight type must be 0 (gate), 1 (up), or 2 (down)"); - OPENVINO_ASSERT(const_type < 3, "Const type must be 0 (weight), 1 (scale), or 2 (zp)"); - - // Calculate input index based on expert and weight/const type - // Input layout: [hidden_states, router_logits, expert0_gate_weight, expert0_gate_scale?, expert0_gate_zp?, - // expert0_up_weight, expert0_up_scale?, expert0_up_zp?, expert0_down_weight, expert0_down_scale?, expert0_down_zp?, ...] - - size_t base_idx = 2; // Start after hidden_states and router_logits - - // For now, assume simple layout: weight, scale?, zp? 
for each of gate, up, down - size_t constants_per_weight_type = 1; // Just weights for now, will need to extend for scales/zps - if (m_config.scale_type != ov::element::dynamic) constants_per_weight_type++; - if (m_config.zp_type != ov::element::dynamic) constants_per_weight_type++; - - size_t constants_per_expert = 3 * constants_per_weight_type; // 3 weight types * constants per type - - size_t expert_base = base_idx + expert_idx * constants_per_expert; - size_t weight_base = expert_base + weight_type * constants_per_weight_type; - size_t input_idx = weight_base + const_type; - - if (input_idx >= get_input_size()) { - return nullptr; // Constant not provided (e.g., scale or zp for non-quantized weights) - } - - auto input_node = get_input_node_shared_ptr(input_idx); - return ov::as_type_ptr(input_node); + // visitor.on_attribute("expert_type", m_config.expert_type); + // TODO: Add adapter + + visitor.on_attribute("expert_alpha", m_config.expert_alpha); + visitor.on_attribute("expert_beta", m_config.expert_beta); + + return true; } } // namespace v16 From 7e3230ef47d4e0a18f1753982e22a783353ed948 Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 22:12:49 +0000 Subject: [PATCH 04/19] Adjust inputs desc --- .../common_optimizations/matmul_experts_fusion.cpp | 2 +- src/core/include/openvino/op/moe.hpp | 11 ++++++++++- src/core/src/op/moe.cpp | 10 +--------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 23e0277cea7a5e..6ec241fd025712 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -78,8 +78,8 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto topk_indices_node = 
pm.at(router_topk_indices).get_node_shared_ptr(); ov::OutputVector moe_inputs = {experts_input_node, - topk_indices_node, routing_weights_node, + topk_indices_node, gate_up_weight, gate_up_bias_node, down_proj_weight, diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/include/openvino/op/moe.hpp index 4f3493bc546aa0..fa2a51432b068d 100644 --- a/src/core/include/openvino/op/moe.hpp +++ b/src/core/include/openvino/op/moe.hpp @@ -34,7 +34,16 @@ class OPENVINO_API MOE : public ov::op::Op { }; /// \brief Constructs a MOE operation with config only - /// \param args The input tensors + /// \param args The input tensors, in the following order: + /// 0: hidden_states - input tensor with hidden representations + /// 1: router_topk_output_weights - normalized weights for selected experts (input to final multiplication) + /// 2: router_topk_output_indices - indices of selected top-k experts + /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or [num_experts, hidden_size, 2 * inter_size] if fused + /// 4: w0_bias (optional) - expert bias for first projection, shape [num_experts, ...] or empty tensor if not needed + /// 5: w1_weight - expert weights for second projection, shape [num_experts, inter_size, hidden_size] + /// 6: w1_bias (optional) - expert bias for second projection, shape [num_experts, ...] 
or empty tensor if not needed + /// 7: w2_weight - expert weights for final projection, shape [num_experts, hidden_size, inter_size] + /// 8: w2_bias (optional/redundant) - expert bias for final projection, usually not required /// \param config Configuration for the MOE operation MOE(const OutputVector& args, const Config& config); diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp index b31a585478ad08..57c2bd67d94317 100644 --- a/src/core/src/op/moe.cpp +++ b/src/core/src/op/moe.cpp @@ -31,15 +31,7 @@ std::shared_ptr MOE::clone_with_new_inputs(const ov::OutputVector& new void MOE::validate_and_infer_types() { OV_OP_SCOPE(v16_MOE_validate_and_infer_types); - // At minimum we need 2 inputs: hidden_states and router_logits - OPENVINO_ASSERT(get_input_size() >= 2, "MOE must have at least 2 inputs whereas it has ", get_input_size()); - - // For now, just do basic validation. The input layout validation can be more flexible - // to allow incremental building during pattern matching - // Expected inputs: - // 0: hidden_states - // 1: router_logits - // 2+: expert constants (flexible layout during construction) + // TODO: Add inputs validation set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); } From 104246af146b33f6aa884e916911a7c56f75d477 Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 22 Sep 2025 22:29:28 +0000 Subject: [PATCH 05/19] Add adapters for expert_type enum --- src/core/include/openvino/op/moe.hpp | 34 ++++++++++++++++++++-------- src/core/src/op/moe.cpp | 19 +++++++++++++--- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/include/openvino/op/moe.hpp index fa2a51432b068d..e61b2c686f758b 100644 --- a/src/core/include/openvino/op/moe.hpp +++ b/src/core/include/openvino/op/moe.hpp @@ -22,10 +22,7 @@ class OPENVINO_API MOE : public ov::op::Op { MOE() = default; - enum class Expert_type { - GEMM3_SWIGLU, - GEMM2_BIAS_SWIGLU_CLAMP - }; + enum class Expert_type { 
GEMM3_SWIGLU, GEMM2_BIAS_SWIGLU_CLAMP }; struct Config { Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; @@ -38,12 +35,13 @@ class OPENVINO_API MOE : public ov::op::Op { /// 0: hidden_states - input tensor with hidden representations /// 1: router_topk_output_weights - normalized weights for selected experts (input to final multiplication) /// 2: router_topk_output_indices - indices of selected top-k experts - /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or [num_experts, hidden_size, 2 * inter_size] if fused - /// 4: w0_bias (optional) - expert bias for first projection, shape [num_experts, ...] or empty tensor if not needed - /// 5: w1_weight - expert weights for second projection, shape [num_experts, inter_size, hidden_size] - /// 6: w1_bias (optional) - expert bias for second projection, shape [num_experts, ...] or empty tensor if not needed - /// 7: w2_weight - expert weights for final projection, shape [num_experts, hidden_size, inter_size] - /// 8: w2_bias (optional/redundant) - expert bias for final projection, usually not required + /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or + /// [num_experts, hidden_size, 2 * inter_size] if fused 4: w0_bias (optional) - expert bias for first projection, + /// shape [num_experts, ...] or empty tensor if not needed 5: w1_weight - expert weights for second projection, + /// shape [num_experts, inter_size, hidden_size] 6: w1_bias (optional) - expert bias for second projection, shape + /// [num_experts, ...] 
or empty tensor if not needed 7: w2_weight - expert weights for final projection, shape + /// [num_experts, hidden_size, inter_size] 8: w2_bias (optional/redundant) - expert bias for final projection, + /// usually not required /// \param config Configuration for the MOE operation MOE(const OutputVector& args, const Config& config); @@ -59,3 +57,19 @@ class OPENVINO_API MOE : public ov::op::Op { }; } // namespace ov::op::v16 + +namespace ov { +OPENVINO_API +std::ostream& operator<<(std::ostream& s, const ov::op::v16::MOE::Expert_type& type); + +template <> +class OPENVINO_API + AttributeAdapter : public EnumAttributeAdapterBase { +public: + AttributeAdapter(ov::op::v16::MOE::Expert_type& value) + : EnumAttributeAdapterBase(value) {} + + OPENVINO_RTTI("AttributeAdapter"); + ~AttributeAdapter() override = default; +}; +} // namespace ov diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp index 57c2bd67d94317..d63302128b4c40 100644 --- a/src/core/src/op/moe.cpp +++ b/src/core/src/op/moe.cpp @@ -39,9 +39,7 @@ void MOE::validate_and_infer_types() { bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { OV_OP_SCOPE(v16_MOE_visit_attributes); - // visitor.on_attribute("expert_type", m_config.expert_type); - // TODO: Add adapter - + visitor.on_attribute("expert_type", m_config.expert_type); visitor.on_attribute("expert_alpha", m_config.expert_alpha); visitor.on_attribute("expert_beta", m_config.expert_beta); @@ -50,4 +48,19 @@ bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { } // namespace v16 } // namespace op + +std::ostream& operator<<(std::ostream& s, const ov::op::v16::MOE::Expert_type& type) { + return s << as_string(type); +} + +template <> +OPENVINO_API EnumNames& EnumNames::get() { + static auto enum_names = EnumNames( + "ov::op::v16::MOE::Expert_type", + { + {"gemm2_bias_swiglu_clamp", ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}, + {"gemm2_bias_gelu", ov::op::v16::MOE::Expert_type::GEMM3_SWIGLU}, + }); + return enum_names; 
+} } // namespace ov From 6df368c47ed2621eff7fbcf705c2471870f9a533 Mon Sep 17 00:00:00 2001 From: mitruska Date: Tue, 23 Sep 2025 23:48:53 +0000 Subject: [PATCH 06/19] Fuse Multiply output before Reshape --- .../common_optimizations/matmul_experts_fusion.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 6ec241fd025712..fba0fa35d6e115 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -69,13 +69,14 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { matcher_pass_callback callback = [=](pattern::Matcher& m) { auto& pm = m.get_pattern_value_map(); - auto experts_input_node = pm.at(tile).get_node()->input_value(0); + auto experts_input_node = pm.at(experts_input).get_node()->input_value(0); + auto routing_weights_node = pm.at(unsqueeze_routing_weights).get_node_shared_ptr(); auto gate_up_weight = pm.at(gate_up_matmul).get_node()->input_value(1).get_node_shared_ptr(); auto gate_up_bias_node = pm.at(gate_up_add).get_node()->input_value(1).get_node_shared_ptr(); auto down_proj_weight = pm.at(down_proj_matmul).get_node()->input_value(1).get_node_shared_ptr(); auto down_proj_bias_node = pm.at(down_proj_add).get_node()->input_value(1).get_node_shared_ptr(); - auto topk_indices_node = pm.at(router_topk_indices).get_node_shared_ptr(); + auto topk_indices_node = pm.at(scatter_elements_update).get_node()->input_value(1); ov::OutputVector moe_inputs = {experts_input_node, routing_weights_node, From c6448d3a6d12098943754c07aa3866651a3c5e70 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 00:07:02 +0000 Subject: [PATCH 07/19] MOE fusion unit test --- .../fuse_vectorized_moe_test.cpp | 253 
++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp diff --git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp new file mode 100644 index 00000000000000..f234e4c0ffc069 --- /dev/null +++ b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -0,0 +1,253 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/graph_comparator.hpp" +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/core/node_vector.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/scatter_elements_update.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "openvino/runtime/core.hpp" +#include "ov_ops/type_relaxed.hpp" +#include "transformations/common_optimizations/matmul_experts_fusion.hpp" +#include "transformations/utils/gen_pattern.hpp" + +inline std::shared_ptr build_moe_pattern_model() { + using namespace ov; + + const size_t batch = 2; + const Dimension in_dim = Dimension::dynamic(); + const size_t hidden_size = 2048; + const size_t intermediate_size = 4096; + const size_t topk = 2; + const size_t number_of_experts = 3; + const size_t fusion_factor = 2; + const auto expert_alpha = 1.702f; + const auto expert_beta = 7.0f; + + auto input_shape = PartialShape{batch, in_dim, hidden_size}; + auto input = std::make_shared(element::f32, input_shape); + auto experts_reshape = std::make_shared( + input, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{-1, 
hidden_size}), + false); + + auto tile = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{number_of_experts, 1})); + auto after_tile_reshape = std::make_shared( + tile, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, hidden_size}), + false); + + auto gate_up_matmul = std::make_shared( + after_tile_reshape, + op::v0::Constant::create(element::f32, + Shape{number_of_experts, hidden_size, intermediate_size * fusion_factor}, + {1.0f})); + auto gate_up_add = std::make_shared( + gate_up_matmul, + op::v0::Constant::create(element::f32, Shape{number_of_experts, 1, intermediate_size * fusion_factor}, {0.0f})); + + auto slice1 = std::make_shared( + gate_up_add, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{0, 0, 0}), + op::v0::Constant::create(element::i64, + Shape{3}, + std::vector{number_of_experts, batch, intermediate_size * 2}), + op::v0::Constant::create(element::i64, Shape{3}, std::vector{1, 1, 2}), + op::v0::Constant::create(element::i64, Shape{3}, std::vector{0, 1, 2})); + auto clamp = std::make_shared(slice1, -expert_beta, expert_beta); + auto add1 = std::make_shared(clamp, op::v0::Constant::create(element::f32, Shape{1}, {1.0f})); + + auto slice2 = std::make_shared( + gate_up_add, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{0, 1, 0}), + op::v0::Constant::create(element::i64, + Shape{3}, + std::vector{number_of_experts, batch, intermediate_size * 2}), + op::v0::Constant::create(element::i64, Shape{3}, std::vector{1, 1, 2}), + op::v0::Constant::create(element::i64, Shape{3}, std::vector{0, 1, 2})); + auto minimum1 = + std::make_shared(slice2, op::v0::Constant::create(element::f32, Shape{1}, {10.0f})); + auto swish_beta = op::v0::Constant::create(element::f32, Shape{}, std::vector{expert_alpha}); + auto swish = std::make_shared(minimum1, swish_beta); + + auto multiply2 = std::make_shared(add1, swish); + + auto down_proj_matmul = 
std::make_shared( + multiply2, + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f})); + + auto down_proj_add = std::make_shared( + down_proj_matmul, + op::v0::Constant::create(element::f32, Shape{number_of_experts, 1, hidden_size}, {1.0f})); + + auto end_reshape = std::make_shared( + down_proj_add, + op::v0::Constant::create(element::i64, + Shape{4}, + std::vector{number_of_experts, batch, -1, hidden_size}), + false); + + // Router subgraph used to test correctness of routing weights connection + auto reshape_2nd_consumer_router_matmul = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size}, {1.0f}), + false, + true); + + auto router_bias = + std::make_shared(reshape_2nd_consumer_router_matmul, + op::v0::Constant::create(element::f32, Shape{1, number_of_experts}, {1.0f})); + + auto router_topk_values_and_indices = + std::make_shared(router_bias, + op::v0::Constant::create(element::i64, Shape{}, {topk}), + -1, + op::v11::TopK::Mode::MAX, + op::v11::TopK::SortType::SORT_VALUES, + element::i64); + + auto router_topk_values = router_topk_values_and_indices->output(0); + auto router_topk_indices = router_topk_values_and_indices->output(1); + + auto scatter_elements_update = std::make_shared( + router_topk_values, + router_topk_indices, + op::v0::Constant::create(element::f32, Shape{batch, topk}, {0}), + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + auto router_transpose = std::make_shared( + scatter_elements_update, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{1, 0})); + auto router_reshape = std::make_shared( + router_transpose, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), + true); + auto unsqueeze_routing_weights = + std::make_shared(router_reshape, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + + auto mul3 = std::make_shared(end_reshape, 
unsqueeze_routing_weights); + + // ReduceSum - final node of the MOE pattern to be fused + auto reduce_sum = + std::make_shared(mul3, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{0}), + true); + + return std::make_shared(ov::OutputVector{reduce_sum}, ov::ParameterVector{input}); +} + +inline std::shared_ptr build_fused_moe_reference_model() { + using namespace ov; + + const size_t batch = 2; + const Dimension in_dim = Dimension::dynamic(); + const size_t hidden_size = 2048; + const size_t intermediate_size = 4096; + const size_t topk = 2; + const size_t number_of_experts = 3; + const size_t fusion_factor = 2; + const auto expert_alpha = 1.702f; + const auto expert_beta = 7.0f; + + auto input_shape = PartialShape{batch, in_dim, hidden_size}; + auto input = std::make_shared(element::f32, input_shape); + + // Begin of Router subgraph (not fused, but valuable for testing) + auto experts_reshape = std::make_shared( + input, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{-1, hidden_size}), + false); + + auto reshape_2nd_consumer_router_matmul = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size}, {1.0f}), + false, + true); + + auto router_bias = + std::make_shared(reshape_2nd_consumer_router_matmul, + op::v0::Constant::create(element::f32, Shape{1, number_of_experts}, {1.0f})); + + auto router_topk_values_and_indices = + std::make_shared(router_bias, + op::v0::Constant::create(element::i64, Shape{}, {topk}), + -1, + op::v11::TopK::Mode::MAX, + op::v11::TopK::SortType::SORT_VALUES, + element::i64); + + auto router_topk_values = router_topk_values_and_indices->output(0); + auto router_topk_indices = router_topk_values_and_indices->output(1); + + auto scatter_elements_update = std::make_shared( + router_topk_values, + router_topk_indices, + op::v0::Constant::create(element::f32, Shape{batch, topk}, {0}), + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + 
auto router_transpose = std::make_shared( + scatter_elements_update, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{1, 0})); + auto router_reshape = std::make_shared( + router_transpose, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), + true); + auto unsqueeze_routing_weights = + std::make_shared(router_reshape, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + // End of Router subgraph + + // Expert MatMuls weights fused into MOE + auto w0_weight = op::v0::Constant::create(element::f32, + Shape{number_of_experts, hidden_size, intermediate_size * fusion_factor}, + {1.0f}); + auto w0_bias = + op::v0::Constant::create(element::f32, Shape{number_of_experts, 1, intermediate_size * fusion_factor}, {0.0f}); + auto w1_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}); + auto w1_bias = op::v0::Constant::create(element::f32, Shape{number_of_experts, 1, hidden_size}, {1.0f}); + + ov::OutputVector moe_inputs = + {input, unsqueeze_routing_weights, router_topk_indices, w0_weight, w0_bias, w1_weight, w1_bias}; + + ov::op::v16::MOE::Config config; + config.expert_type = ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; + config.expert_alpha = expert_alpha; + config.expert_beta = expert_beta; + + auto moe = std::make_shared(moe_inputs, config); + return std::make_shared(ov::OutputVector{moe}, ov::ParameterVector{input}); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE_basic) { + model = build_moe_pattern_model(); + manager.register_pass(); + model_ref = build_fused_moe_reference_model(); +} From f5c1c4187947fb2605e481e8c54a66484898b081 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 00:09:11 +0000 Subject: [PATCH 08/19] Add missing header --- .../common_optimizations/matmul_experts_fusion.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index fba0fa35d6e115..dbc35413bbf0ff 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -1,3 +1,7 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + #include "transformations/common_optimizations/matmul_experts_fusion.hpp" #include "itt.hpp" From 762bc9ae2490ad3bf00130d025f7a170d8a70097 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 01:20:38 +0000 Subject: [PATCH 09/19] Move MOE op to internal --- .../matmul_experts_fusion.cpp | 6 ++-- .../fuse_vectorized_moe_test.cpp | 6 ++-- .../{include => dev_api}/openvino/op/moe.hpp | 34 +++++++++++-------- src/core/include/openvino/op/ops.hpp | 1 - .../include/openvino/opsets/opset16_tbl.hpp | 1 - src/core/src/op/moe.cpp | 23 ++++++------- 6 files changed, 36 insertions(+), 35 deletions(-) rename src/core/{include => dev_api}/openvino/op/moe.hpp (62%) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index dbc35413bbf0ff..9cc74aff6f7dc1 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -90,7 +90,7 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { down_proj_weight, down_proj_bias_node}; - ov::op::v16::MOE::Config config; + ov::op::internal::MOE::Config config; // Extract expert_alpha from Swish beta attribute auto swish_beta_const = ov::as_type_ptr(pm.at(swish_beta).get_node_shared_ptr()); @@ -103,9 +103,9 @@ 
ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { } // Set expert_type - config.expert_type = ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; + config.expert_type = ov::op::internal::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; - auto moe = std::make_shared(moe_inputs, config); + auto moe = std::make_shared(moe_inputs, config); moe->set_friendly_name(m.get_match_root()->get_friendly_name()); ov::copy_runtime_info(m.get_matched_nodes(), moe); ov::replace_node(m.get_match_root(), moe); diff --git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp index f234e4c0ffc069..105d41efe3a6cb 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp +++ b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -237,12 +237,12 @@ inline std::shared_ptr build_fused_moe_reference_model() { ov::OutputVector moe_inputs = {input, unsqueeze_routing_weights, router_topk_indices, w0_weight, w0_bias, w1_weight, w1_bias}; - ov::op::v16::MOE::Config config; - config.expert_type = ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; + ov::op::internal::MOE::Config config; + config.expert_type = ov::op::internal::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP; config.expert_alpha = expert_alpha; config.expert_beta = expert_beta; - auto moe = std::make_shared(moe_inputs, config); + auto moe = std::make_shared(moe_inputs, config); return std::make_shared(ov::OutputVector{moe}, ov::ParameterVector{input}); } diff --git a/src/core/include/openvino/op/moe.hpp b/src/core/dev_api/openvino/op/moe.hpp similarity index 62% rename from src/core/include/openvino/op/moe.hpp rename to src/core/dev_api/openvino/op/moe.hpp index e61b2c686f758b..4a9e78b975a66d 100644 --- a/src/core/include/openvino/op/moe.hpp +++ b/src/core/dev_api/openvino/op/moe.hpp @@ -12,13 +12,13 @@ #include "openvino/op/constant.hpp" #include 
"openvino/op/op.hpp" -namespace ov::op::v16 { +namespace ov::op::internal { /// /// \brief MOE experts /// \ingroup ov_ops_cpp_api class OPENVINO_API MOE : public ov::op::Op { public: - OPENVINO_OP("MOE", "opset16"); + OPENVINO_OP("MOE"); MOE() = default; @@ -36,12 +36,16 @@ class OPENVINO_API MOE : public ov::op::Op { /// 1: router_topk_output_weights - normalized weights for selected experts (input to final multiplication) /// 2: router_topk_output_indices - indices of selected top-k experts /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or - /// [num_experts, hidden_size, 2 * inter_size] if fused 4: w0_bias (optional) - expert bias for first projection, - /// shape [num_experts, ...] or empty tensor if not needed 5: w1_weight - expert weights for second projection, - /// shape [num_experts, inter_size, hidden_size] 6: w1_bias (optional) - expert bias for second projection, shape - /// [num_experts, ...] or empty tensor if not needed 7: w2_weight - expert weights for final projection, shape - /// [num_experts, hidden_size, inter_size] 8: w2_bias (optional/redundant) - expert bias for final projection, - /// usually not required + /// [num_experts, hidden_size, 2 * inter_size] if fused + /// 4: w0_bias (optional) - expert bias for first projection, + /// shape [num_experts, ...] or empty tensor if not needed + /// 5: w1_weight - expert weights for second projection, + /// shape [num_experts, inter_size, hidden_size] + /// 6: w1_bias (optional) - expert bias for second projection, shape + /// [num_experts, ...] 
or empty tensor if not needed + /// 7: w2_weight - expert weights for final projection, shape + /// [num_experts, hidden_size, inter_size] + /// 8: w2_bias (optional) - expert bias for final projection /// \param config Configuration for the MOE operation MOE(const OutputVector& args, const Config& config); @@ -56,20 +60,20 @@ class OPENVINO_API MOE : public ov::op::Op { Config m_config; }; -} // namespace ov::op::v16 +} // namespace ov::op::internal namespace ov { OPENVINO_API -std::ostream& operator<<(std::ostream& s, const ov::op::v16::MOE::Expert_type& type); +std::ostream& operator<<(std::ostream& s, const ov::op::internal::MOE::Expert_type& type); template <> -class OPENVINO_API - AttributeAdapter : public EnumAttributeAdapterBase { +class OPENVINO_API AttributeAdapter + : public EnumAttributeAdapterBase { public: - AttributeAdapter(ov::op::v16::MOE::Expert_type& value) - : EnumAttributeAdapterBase(value) {} + AttributeAdapter(ov::op::internal::MOE::Expert_type& value) + : EnumAttributeAdapterBase(value) {} - OPENVINO_RTTI("AttributeAdapter"); + OPENVINO_RTTI("AttributeAdapter"); ~AttributeAdapter() override = default; }; } // namespace ov diff --git a/src/core/include/openvino/op/ops.hpp b/src/core/include/openvino/op/ops.hpp index a4150c218b1c73..dcb5fc0385ecde 100644 --- a/src/core/include/openvino/op/ops.hpp +++ b/src/core/include/openvino/op/ops.hpp @@ -119,7 +119,6 @@ #include "openvino/op/minimum.hpp" #include "openvino/op/mish.hpp" #include "openvino/op/mod.hpp" -#include "openvino/op/moe.hpp" #include "openvino/op/multiclass_nms.hpp" #include "openvino/op/multinomial.hpp" #include "openvino/op/multiply.hpp" diff --git a/src/core/include/openvino/opsets/opset16_tbl.hpp b/src/core/include/openvino/opsets/opset16_tbl.hpp index 98a3dcbd1912b4..6d01ab1d13cb22 100644 --- a/src/core/include/openvino/opsets/opset16_tbl.hpp +++ b/src/core/include/openvino/opsets/opset16_tbl.hpp @@ -16,7 +16,6 @@ _OPENVINO_OP_REG(ShapeOf, ov::op::v3) // New operations added in 
opset16 _OPENVINO_OP_REG(Identity, ov::op::v16) _OPENVINO_OP_REG(ISTFT, ov::op::v16) -_OPENVINO_OP_REG(MOE, ov::op::v16) _OPENVINO_OP_REG(SegmentMax, ov::op::v16) _OPENVINO_OP_REG(SparseFillEmptyRows, ov::op::v16) _OPENVINO_OP_REG(AvgPool, ov::op::v16) diff --git a/src/core/src/op/moe.cpp b/src/core/src/op/moe.cpp index d63302128b4c40..6d28bf1bc52d6c 100644 --- a/src/core/src/op/moe.cpp +++ b/src/core/src/op/moe.cpp @@ -8,7 +8,7 @@ namespace ov { namespace op { -namespace v16 { +namespace internal { MOE::MOE(const OutputVector& args, const Config& config) : Op(args), m_config(config) { constructor_validate_and_infer_types(); @@ -23,22 +23,21 @@ void MOE::set_config(const Config& config) { } std::shared_ptr MOE::clone_with_new_inputs(const ov::OutputVector& new_args) const { - OV_OP_SCOPE(v16_MOE_clone_with_new_inputs); + OV_OP_SCOPE(internal_MOE_clone_with_new_inputs); check_new_args_count(this, new_args); return std::make_shared(new_args, m_config); } void MOE::validate_and_infer_types() { - OV_OP_SCOPE(v16_MOE_validate_and_infer_types); + OV_OP_SCOPE(internal_MOE_validate_and_infer_types); // TODO: Add inputs validation set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); } bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { - OV_OP_SCOPE(v16_MOE_visit_attributes); - + OV_OP_SCOPE(internal_MOE_visit_attributes); visitor.on_attribute("expert_type", m_config.expert_type); visitor.on_attribute("expert_alpha", m_config.expert_alpha); visitor.on_attribute("expert_beta", m_config.expert_beta); @@ -46,20 +45,20 @@ bool MOE::visit_attributes(ov::AttributeVisitor& visitor) { return true; } -} // namespace v16 +} // namespace internal } // namespace op -std::ostream& operator<<(std::ostream& s, const ov::op::v16::MOE::Expert_type& type) { +std::ostream& operator<<(std::ostream& s, const ov::op::internal::MOE::Expert_type& type) { return s << as_string(type); } template <> -OPENVINO_API EnumNames& EnumNames::get() { - static auto enum_names = 
EnumNames( - "ov::op::v16::MOE::Expert_type", +OPENVINO_API EnumNames& EnumNames::get() { + static auto enum_names = EnumNames( + "ov::op::internal::MOE::Expert_type", { - {"gemm2_bias_swiglu_clamp", ov::op::v16::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}, - {"gemm2_bias_gelu", ov::op::v16::MOE::Expert_type::GEMM3_SWIGLU}, + {"gemm2_bias_swiglu_clamp", ov::op::internal::MOE::Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}, + {"gemm3_swiglu", ov::op::internal::MOE::Expert_type::GEMM3_SWIGLU}, }); return enum_names; } From 41145cf52a06ac28ef593500129653f0372438fb Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 01:21:31 +0000 Subject: [PATCH 10/19] Apply MOE transformation for CPU --- .../intel_cpu/src/transformations/transformation_pipeline.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 201f11b8587e68..7812cfbe02da1e 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -69,6 +69,7 @@ #include "transformations/common_optimizations/mark_precision_sensitive_shapeof_subgraphs.hpp" #include "transformations/common_optimizations/mark_rope_input_to_keep_in_mixed_precision.hpp" #include "transformations/common_optimizations/matmul_const_transposes_extraction.hpp" +#include "transformations/common_optimizations/matmul_experts_fusion.hpp" #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp" #include "transformations/common_optimizations/mul_fake_quantize_fusion.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -555,7 +556,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis }); }, ov::pass::KeepConstAndDecompression); - + CPU_REGISTER_PASS_COMMON(manager, ov::pass::FuseVectorizedMOE); CPU_REGISTER_PASS_COMMON(manager, 
ov::pass::AUGRUCellFusion); CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion); ov::pass::ConvertPagedAttnInputs::KVCacheConfig cacheConfig; From 5a346843dc56a7228314ebeff5c454db32e81f4e Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 01:22:43 +0000 Subject: [PATCH 11/19] Revert CPU transformation pipeline change --- .../intel_cpu/src/transformations/transformation_pipeline.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 7812cfbe02da1e..432a934431dbc7 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -69,7 +69,6 @@ #include "transformations/common_optimizations/mark_precision_sensitive_shapeof_subgraphs.hpp" #include "transformations/common_optimizations/mark_rope_input_to_keep_in_mixed_precision.hpp" #include "transformations/common_optimizations/matmul_const_transposes_extraction.hpp" -#include "transformations/common_optimizations/matmul_experts_fusion.hpp" #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp" #include "transformations/common_optimizations/mul_fake_quantize_fusion.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" @@ -556,7 +555,6 @@ void Transformations::PreLpt(const std::vector& defaultPrecis }); }, ov::pass::KeepConstAndDecompression); - CPU_REGISTER_PASS_COMMON(manager, ov::pass::FuseVectorizedMOE); CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion); CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion); ov::pass::ConvertPagedAttnInputs::KVCacheConfig cacheConfig; From b46f9608186509d787432aaaa24d87af7d0b83d8 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 13:44:34 +0000 Subject: [PATCH 12/19] Fix cast warning --- .../common_optimizations/matmul_experts_fusion.cpp | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 9cc74aff6f7dc1..2251ac6c01ee48 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -99,7 +99,7 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { // Extract expert_beta from Clamp max attribute if (auto clamp_op = ov::as_type_ptr(pm.at(clamp).get_node_shared_ptr())) { - config.expert_beta = clamp_op->get_max(); + config.expert_beta = static_cast(clamp_op->get_max()); } // Set expert_type From e343cd8b4419b5750521d6b7346a0a9f4a5e79a3 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 15:07:35 +0000 Subject: [PATCH 13/19] Remove OPENVINO_API macros --- src/core/dev_api/openvino/op/moe.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core/dev_api/openvino/op/moe.hpp b/src/core/dev_api/openvino/op/moe.hpp index 4a9e78b975a66d..0a8a90be7e2aa1 100644 --- a/src/core/dev_api/openvino/op/moe.hpp +++ b/src/core/dev_api/openvino/op/moe.hpp @@ -15,7 +15,6 @@ namespace ov::op::internal { /// /// \brief MOE experts -/// \ingroup ov_ops_cpp_api class OPENVINO_API MOE : public ov::op::Op { public: OPENVINO_OP("MOE"); @@ -63,11 +62,10 @@ class OPENVINO_API MOE : public ov::op::Op { } // namespace ov::op::internal namespace ov { -OPENVINO_API std::ostream& operator<<(std::ostream& s, const ov::op::internal::MOE::Expert_type& type); template <> -class OPENVINO_API AttributeAdapter +class AttributeAdapter : public EnumAttributeAdapterBase { public: AttributeAdapter(ov::op::internal::MOE::Expert_type& value) From 0406105e0b03cc1ddd6d638030fef188d54be6f6 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 15:52:05 +0000 Subject: [PATCH 14/19] 
Update input desc --- src/core/dev_api/openvino/op/moe.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/core/dev_api/openvino/op/moe.hpp b/src/core/dev_api/openvino/op/moe.hpp index 0a8a90be7e2aa1..cf0eb992da83cd 100644 --- a/src/core/dev_api/openvino/op/moe.hpp +++ b/src/core/dev_api/openvino/op/moe.hpp @@ -21,7 +21,7 @@ class OPENVINO_API MOE : public ov::op::Op { MOE() = default; - enum class Expert_type { GEMM3_SWIGLU, GEMM2_BIAS_SWIGLU_CLAMP }; + enum class Expert_type { GEMM2_BIAS_SWIGLU_CLAMP, GEMM3_SWIGLU }; struct Config { Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; @@ -32,8 +32,9 @@ class OPENVINO_API MOE : public ov::op::Op { /// \brief Constructs a MOE operation with config only /// \param args The input tensors, in the following order: /// 0: hidden_states - input tensor with hidden representations - /// 1: router_topk_output_weights - normalized weights for selected experts (input to final multiplication) - /// 2: router_topk_output_indices - indices of selected top-k experts + /// 1: routing_weights - [num_experts, ...] 
normalized weights for selected experts + /// (input to final multiplication) + /// 2: router_topk_output_indices - [..., topk] indices of selected top-k experts /// 3: w0_weight - expert weights for first projection, shape [num_experts, inter_size, hidden_size] or /// [num_experts, hidden_size, 2 * inter_size] if fused /// 4: w0_bias (optional) - expert bias for first projection, From eaede0de925187e6d4b942426ce41b004fcdd511 Mon Sep 17 00:00:00 2001 From: mitruska Date: Wed, 24 Sep 2025 16:12:24 +0000 Subject: [PATCH 15/19] No keep dims in Reduce --- .../common_optimizations/matmul_experts_fusion.cpp | 2 +- .../tests/common_optimizations/fuse_vectorized_moe_test.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 2251ac6c01ee48..685c579e2a7d91 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -67,7 +67,7 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto unsqueeze_routing_weights = pattern::wrap_type({router_reshape, pattern::any_input()}); auto mul3 = pattern::wrap_type({end_reshape, unsqueeze_routing_weights}); - auto reduce_sum = pattern::wrap_type({mul3, pattern::any_input()}); + auto reduce_sum = pattern::wrap_type({mul3, pattern::any_input()}, {{"keep_dims", false}}); auto moe_pattern = reduce_sum; matcher_pass_callback callback = [=](pattern::Matcher& m) { diff --git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp index 105d41efe3a6cb..a3ee9f6de550e6 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp +++ 
b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -159,7 +159,7 @@ inline std::shared_ptr build_moe_pattern_model() { auto reduce_sum = std::make_shared(mul3, op::v0::Constant::create(element::i64, Shape{1}, std::vector{0}), - true); + false); return std::make_shared(ov::OutputVector{reduce_sum}, ov::ParameterVector{input}); } From 9ce1569eb8cc2276352c6598dcdf4be7e4b96afe Mon Sep 17 00:00:00 2001 From: mitruska Date: Tue, 30 Sep 2025 10:15:07 +0000 Subject: [PATCH 16/19] Add transpose attrs to MatMul patterns --- .../common_optimizations/matmul_experts_fusion.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 685c579e2a7d91..0c6862309fb25b 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -33,7 +33,8 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto experts_input = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); auto tile = pattern::wrap_type({experts_input, pattern::any_input()}); auto after_tile_reshape = pattern::wrap_type({tile, pattern::any_input()}); - auto gate_up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}); + auto gate_up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); auto gate_up_add = pattern::wrap_type({gate_up_matmul, pattern::any_input()}); // Branch 1: Slice_1 -> Clamp -> Add_1 @@ -53,7 +54,8 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto multiply2 = pattern::wrap_type({add1, swish}); // Down projection - auto down_proj_matmul = pattern::wrap_type({multiply2, pattern::any_input()}); + auto 
down_proj_matmul = pattern::wrap_type({multiply2, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); auto down_proj_add = pattern::wrap_type({down_proj_matmul, pattern::wrap_const()}); auto end_reshape = pattern::wrap_type({down_proj_add, pattern::any_input()}); From 90f31a2d422cd531a1b5aca225ade1235ecfff74 Mon Sep 17 00:00:00 2001 From: mitruska Date: Tue, 30 Sep 2025 11:34:53 +0000 Subject: [PATCH 17/19] Switch beta with alpha to match the beta for swish naming --- .../common_optimizations/matmul_experts_fusion.cpp | 8 ++++---- src/core/dev_api/openvino/op/moe.hpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 0c6862309fb25b..2f8ccc6df706f9 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -94,14 +94,14 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { ov::op::internal::MOE::Config config; - // Extract expert_alpha from Swish beta attribute + // Extract expert_beta from Swish beta attribute auto swish_beta_const = ov::as_type_ptr(pm.at(swish_beta).get_node_shared_ptr()); auto swish_beta_const_val = swish_beta_const->cast_vector()[0]; - config.expert_alpha = swish_beta_const_val; + config.expert_beta = swish_beta_const_val; - // Extract expert_beta from Clamp max attribute + // Extract expert_alpha from Clamp max attribute if (auto clamp_op = ov::as_type_ptr(pm.at(clamp).get_node_shared_ptr())) { - config.expert_beta = static_cast(clamp_op->get_max()); + config.expert_alpha = static_cast(clamp_op->get_max()); } // Set expert_type diff --git a/src/core/dev_api/openvino/op/moe.hpp b/src/core/dev_api/openvino/op/moe.hpp index cf0eb992da83cd..5147f15fa8b184 100644 
--- a/src/core/dev_api/openvino/op/moe.hpp +++ b/src/core/dev_api/openvino/op/moe.hpp @@ -25,8 +25,8 @@ class OPENVINO_API MOE : public ov::op::Op { struct Config { Expert_type expert_type{Expert_type::GEMM2_BIAS_SWIGLU_CLAMP}; - float expert_alpha{1.0f}; // Expert attribute, e.g. sigmoid alpha - float expert_beta{0.0f}; // Expert attribute, e.g. clamp limit + float expert_alpha{0.0f}; // Expert attribute for clamp bounds + float expert_beta{1.0f}; // Expert attribute for swish beta }; /// \brief Constructs a MOE operation with config only From 3a968558a38e383be25f2b7bedcaa6b206300d18 Mon Sep 17 00:00:00 2001 From: mitruska Date: Thu, 2 Oct 2025 15:10:12 +0000 Subject: [PATCH 18/19] Add fusion transformation for the second expert_type (GEMM3) --- .../matmul_experts_fusion.hpp | 26 ++- .../matmul_experts_fusion.cpp | 81 ++++++- .../fuse_vectorized_moe_test.cpp | 216 +++++++++++++++++- 3 files changed, 308 insertions(+), 15 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp index 27eac10769899b..482695ff3ce9ae 100644 --- a/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/matmul_experts_fusion.hpp @@ -4,19 +4,37 @@ #pragma once +#include "openvino/pass/graph_rewrite.hpp" #include "openvino/pass/matcher_pass.hpp" #include "transformations_visibility.hpp" namespace ov { namespace pass { -class TRANSFORMATIONS_API FuseVectorizedMOE; +class TRANSFORMATIONS_API FuseVectorizedMOE2GEMM; +class TRANSFORMATIONS_API FuseVectorizedMOE3GEMM; +class TRANSFORMATIONS_API VectorizedExpertsFusion; } // namespace pass } // namespace ov -class ov::pass::FuseVectorizedMOE : public ov::pass::MatcherPass { +class ov::pass::FuseVectorizedMOE2GEMM : public ov::pass::MatcherPass { public: - 
OPENVINO_MATCHER_PASS_RTTI("FuseVectorizedMOE"); - FuseVectorizedMOE(); + OPENVINO_MATCHER_PASS_RTTI("FuseVectorizedMOE2GEMM"); + FuseVectorizedMOE2GEMM(); +}; + +class ov::pass::FuseVectorizedMOE3GEMM : public ov::pass::MatcherPass { +public: + OPENVINO_MATCHER_PASS_RTTI("FuseVectorizedMOE3GEMM"); + FuseVectorizedMOE3GEMM(); +}; + +class ov::pass::VectorizedExpertsFusion : public ov::pass::GraphRewrite { +public: + OPENVINO_GRAPH_REWRITE_RTTI("VectorizedExpertsFusion"); + VectorizedExpertsFusion() { + add_matcher(); + add_matcher(); + } }; diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 2f8ccc6df706f9..27baeaea4e6a07 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -27,8 +27,8 @@ #include "transformations/utils/utils.hpp" using namespace ov::pass; -ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { - MATCHER_SCOPE(FuseVectorizedMOE); +ov::pass::FuseVectorizedMOE2GEMM::FuseVectorizedMOE2GEMM() { + MATCHER_SCOPE(FuseVectorizedMOE2GEMM); auto experts_input = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); auto tile = pattern::wrap_type({experts_input, pattern::any_input()}); @@ -119,3 +119,80 @@ ov::pass::FuseVectorizedMOE::FuseVectorizedMOE() { auto matcher = std::make_shared(moe_pattern, matcher_name); this->register_matcher(matcher, callback); } + +ov::pass::FuseVectorizedMOE3GEMM::FuseVectorizedMOE3GEMM() { + MATCHER_SCOPE(FuseVectorizedMOE3GEMM); + + auto experts_input = pattern::wrap_type({pattern::any_input(), pattern::any_input()}); + auto tile = pattern::wrap_type({experts_input, pattern::any_input()}); + auto after_tile_reshape = pattern::wrap_type({tile, pattern::any_input()}); + + // First GEMM (activation gate) + auto 
gate_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); + auto swish = pattern::wrap_type({gate_matmul}); + // Second GEMM (up_projection) + auto up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); + // Join: Multiply (SwiGLU) + auto swiglu = pattern::wrap_type({swish, up_matmul}); + + // Third GEMM (down_projection) + auto down_matmul = pattern::wrap_type({swiglu, pattern::any_input()}, + {{"transpose_a", false}, {"transpose_b", false}}); + auto end_reshape = pattern::wrap_type({down_matmul, pattern::any_input()}); + + // Routing weights/mask + auto router_topk_indices = pattern::any_input(); + auto scatter_elements_update = pattern::wrap_type( + {pattern::any_input(), router_topk_indices, pattern::any_input(), pattern::any_input()}); + auto router_transpose = pattern::wrap_type({scatter_elements_update, pattern::any_input()}); + auto router_reshape = pattern::wrap_type({router_transpose, pattern::any_input()}); + auto unsqueeze_routing_weights = pattern::wrap_type({router_reshape, pattern::any_input()}); + + auto mul3 = pattern::wrap_type({end_reshape, unsqueeze_routing_weights}); + auto reduce_sum = pattern::wrap_type({mul3, pattern::any_input()}, {{"keep_dims", false}}); + auto moe_pattern = reduce_sum; + + matcher_pass_callback callback = [=](pattern::Matcher& m) { + auto& pm = m.get_pattern_value_map(); + auto experts_input_node = pm.at(experts_input).get_node()->input_value(0); + auto routing_weights_node = pm.at(unsqueeze_routing_weights).get_node_shared_ptr(); + auto gate_weight = pm.at(gate_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto up_weight = pm.at(up_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto down_weight = pm.at(down_matmul).get_node()->input_value(1).get_node_shared_ptr(); + auto topk_indices_node = pm.at(scatter_elements_update).get_node()->input_value(1); + + 
ov::OutputVector moe_inputs = { + experts_input_node, + routing_weights_node, + topk_indices_node, + gate_weight, + up_weight, + down_weight, + }; + + ov::op::internal::MOE::Config config; + config.expert_type = ov::op::internal::MOE::Expert_type::GEMM3_SWIGLU; + // Extract expert_beta if Swish has beta input provided + if (auto swish_op = ov::as_type_ptr(pm.at(swish).get_node_shared_ptr())) { + if (swish_op->get_input_size() > 1) { + if (auto swish_beta_const = + ov::as_type_ptr(swish_op->get_input_node_shared_ptr(1))) { + config.expert_beta = swish_beta_const->cast_vector()[0]; + } + } + } + + auto moe = std::make_shared(moe_inputs, config); + moe->set_friendly_name(m.get_match_root()->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), moe); + ov::replace_node(m.get_match_root(), moe); + + register_new_node(moe); + return true; + }; + + auto matcher = std::make_shared(moe_pattern, matcher_name); + this->register_matcher(matcher, callback); +} diff --git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp index a3ee9f6de550e6..037429c61c06ca 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp +++ b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -37,7 +37,7 @@ #include "transformations/common_optimizations/matmul_experts_fusion.hpp" #include "transformations/utils/gen_pattern.hpp" -inline std::shared_ptr build_moe_pattern_model() { +inline std::shared_ptr build_2gemm_moe_pattern_model() { using namespace ov; const size_t batch = 2; @@ -151,7 +151,7 @@ inline std::shared_ptr build_moe_pattern_model() { true); auto unsqueeze_routing_weights = std::make_shared(router_reshape, - op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + op::v0::Constant::create(element::i64, Shape{1}, std::vector{-1})); auto mul3 = std::make_shared(end_reshape, 
unsqueeze_routing_weights); @@ -164,7 +164,7 @@ inline std::shared_ptr build_moe_pattern_model() { return std::make_shared(ov::OutputVector{reduce_sum}, ov::ParameterVector{input}); } -inline std::shared_ptr build_fused_moe_reference_model() { +inline std::shared_ptr build_fused_2gemm_moe_reference_model() { using namespace ov; const size_t batch = 2; @@ -220,8 +220,7 @@ inline std::shared_ptr build_fused_moe_reference_model() { op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), true); auto unsqueeze_routing_weights = - std::make_shared(router_reshape, - op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + std::make_shared(router_reshape, op::v0::Constant::create(element::i64, Shape{1}, {-1})); // End of Router subgraph // Expert MatMuls weights fused into MOE @@ -246,8 +245,207 @@ inline std::shared_ptr build_fused_moe_reference_model() { return std::make_shared(ov::OutputVector{moe}, ov::ParameterVector{input}); } -TEST_F(TransformationTestsF, FuseVectorizedMOE_basic) { - model = build_moe_pattern_model(); - manager.register_pass(); - model_ref = build_fused_moe_reference_model(); +inline std::shared_ptr build_3gemm_moe_pattern_model() { + using namespace ov; + + const size_t batch = 2; + const Dimension in_dim = Dimension::dynamic(); + const size_t hidden_size = 2048; + const size_t intermediate_size = 4096; + const size_t number_of_experts = 3; + const size_t topk = 2; + + auto input_shape = PartialShape{batch, in_dim, hidden_size}; + auto input = std::make_shared(element::f32, input_shape); + auto experts_reshape = std::make_shared( + input, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{-1, hidden_size}), + false); + + auto tile = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{number_of_experts, 1})); + auto after_tile_reshape = std::make_shared( + tile, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, 
batch, hidden_size}), + false); + + // First GEMM (gate) + auto gate_matmul = std::make_shared( + after_tile_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), + false, + false); + + auto swish = std::make_shared(gate_matmul); + + // Second GEMM (up) + auto up_matmul = std::make_shared( + after_tile_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), + false, + false); + + auto swiglu = std::make_shared(swish, up_matmul); + + // Third GEMM (down) + auto down_matmul = std::make_shared( + swiglu, + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}), + false, + false); + + auto experts_out_reshape = std::make_shared( + down_matmul, + op::v0::Constant::create(element::i64, + Shape{4}, + std::vector{number_of_experts, batch, -1, hidden_size}), + false); + + // Router subgraph used to test correctness of routing weights connection + auto router_matmul = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size}, {1.0f}), + false, + true); + + auto router_topk_values_and_indices = + std::make_shared(router_matmul, + op::v0::Constant::create(element::i64, Shape{}, {topk}), + -1, + op::v11::TopK::Mode::MAX, + op::v11::TopK::SortType::SORT_VALUES, + element::i64); + + auto router_topk_values = router_topk_values_and_indices->output(0); + auto router_topk_indices = router_topk_values_and_indices->output(1); + + auto scatter_elements_update = std::make_shared( + router_topk_values, + router_topk_indices, + op::v0::Constant::create(element::f32, Shape{batch, topk}, {0}), + op::v0::Constant::create(element::i64, Shape{1}, std::vector{1})); + auto router_transpose = std::make_shared( + scatter_elements_update, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{1, 0})); + auto router_reshape = std::make_shared( + router_transpose, 
+ op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), + true); + auto unsqueeze_routing_weights = + std::make_shared(router_reshape, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{-1})); + + auto mul3 = std::make_shared(experts_out_reshape, unsqueeze_routing_weights); + + // ReduceSum - final node of the MOE pattern to be fused + auto reduce_sum = + std::make_shared(mul3, + op::v0::Constant::create(element::i64, Shape{1}, std::vector{0}), + false); + + return std::make_shared(ov::OutputVector{reduce_sum}, ov::ParameterVector{input}); } + +inline std::shared_ptr build_fused_3gemm_moe_reference_model() { + using namespace ov; + + const size_t batch = 2; + const Dimension in_dim = Dimension::dynamic(); + const size_t hidden_size = 2048; + const size_t intermediate_size = 4096; + const size_t number_of_experts = 3; + const size_t topk = 2; + + auto input = std::make_shared(element::f32, PartialShape{batch, in_dim, hidden_size}); + + // Begin of Router subgraph (not fused, but valuable for testing) + auto experts_reshape = std::make_shared( + input, + op::v0::Constant::create(element::i64, Shape{2}, std::vector{-1, hidden_size}), + false); + + auto router_matmul = std::make_shared( + experts_reshape, + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size}, {1.0f}), + false, + true); + + auto router_topk = std::make_shared(router_matmul, + op::v0::Constant::create(element::i64, Shape{}, {topk}), + -1, + op::v11::TopK::Mode::MAX, + op::v11::TopK::SortType::SORT_VALUES, + element::i64); + + auto router_topk_values = router_topk->output(0); + auto router_topk_indices = router_topk->output(1); + + auto scatter_elements_update = std::make_shared( + router_topk_values, + router_topk_indices, + op::v0::Constant::create(element::f32, Shape{batch, topk}, {0}), + op::v0::Constant::create(element::i64, Shape{1}, {1})); + + auto router_transpose = + std::make_shared(scatter_elements_update, + 
op::v0::Constant::create(element::i64, Shape{2}, {1, 0})); + auto router_reshape = std::make_shared( + router_transpose, + op::v0::Constant::create(element::i64, Shape{3}, std::vector{number_of_experts, batch, -1}), + true); + + auto unsqueeze_routing_weights = + std::make_shared(router_reshape, op::v0::Constant::create(element::i64, Shape{1}, {-1})); + + // MOE fused op + auto w0_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}); + auto w1_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}); + auto w2_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}); + + ov::OutputVector moe_inputs = + {input, unsqueeze_routing_weights, router_topk_indices, w0_weight, w1_weight, w2_weight}; + + ov::op::internal::MOE::Config config; + config.expert_type = ov::op::internal::MOE::Expert_type::GEMM3_SWIGLU; + + auto moe = std::make_shared(moe_inputs, config); + return std::make_shared(ov::OutputVector{moe}, ov::ParameterVector{input}); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE2GEMM_basic) { + model = build_2gemm_moe_pattern_model(); + manager.register_pass(); + model_ref = build_fused_2gemm_moe_reference_model(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE2GEMM_VectorizedExpertsFusion) { + model = build_2gemm_moe_pattern_model(); + manager.register_pass(); + model_ref = build_fused_2gemm_moe_reference_model(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE2GEMM_no_fusion) { + model = build_3gemm_moe_pattern_model(); + manager.register_pass(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE3GEMM_basic) { + model = build_3gemm_moe_pattern_model(); + manager.register_pass(); + model_ref = build_fused_3gemm_moe_reference_model(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE3GEMM_VectorizedExpertsFusion) { + model = build_3gemm_moe_pattern_model(); + 
manager.register_pass(); + model_ref = build_fused_3gemm_moe_reference_model(); +} + +TEST_F(TransformationTestsF, FuseVectorizedMOE3GEMM_no_fusion) { + model = build_2gemm_moe_pattern_model(); + manager.register_pass(); +} \ No newline at end of file From df97c220e4c280435c983fe7b75a74c6d81364f0 Mon Sep 17 00:00:00 2001 From: mitruska Date: Mon, 13 Oct 2025 10:04:18 +0000 Subject: [PATCH 19/19] Update GEMM3 transpose_b attr to be true --- .../matmul_experts_fusion.cpp | 6 +++--- .../fuse_vectorized_moe_test.cpp | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp index 27baeaea4e6a07..76bbbef9abf8e0 100644 --- a/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_experts_fusion.cpp @@ -129,17 +129,17 @@ ov::pass::FuseVectorizedMOE3GEMM::FuseVectorizedMOE3GEMM() { // First GEMM (activation gate) auto gate_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, - {{"transpose_a", false}, {"transpose_b", false}}); + {{"transpose_a", false}, {"transpose_b", true}}); auto swish = pattern::wrap_type({gate_matmul}); // Second GEMM (up_projection) auto up_matmul = pattern::wrap_type({after_tile_reshape, pattern::any_input()}, - {{"transpose_a", false}, {"transpose_b", false}}); + {{"transpose_a", false}, {"transpose_b", true}}); // Join: Multiply (SwiGLU) auto swiglu = pattern::wrap_type({swish, up_matmul}); // Third GEMM (down_projection) auto down_matmul = pattern::wrap_type({swiglu, pattern::any_input()}, - {{"transpose_a", false}, {"transpose_b", false}}); + {{"transpose_a", false}, {"transpose_b", true}}); auto end_reshape = pattern::wrap_type({down_matmul, pattern::any_input()}); // Routing weights/mask diff 
--git a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp index 037429c61c06ca..90fac722910d04 100644 --- a/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp +++ b/src/common/transformations/tests/common_optimizations/fuse_vectorized_moe_test.cpp @@ -273,27 +273,27 @@ inline std::shared_ptr build_3gemm_moe_pattern_model() { // First GEMM (gate) auto gate_matmul = std::make_shared( after_tile_reshape, - op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}), false, - false); + true); auto swish = std::make_shared(gate_matmul); // Second GEMM (up) auto up_matmul = std::make_shared( after_tile_reshape, - op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}), false, - false); + true); auto swiglu = std::make_shared(swish, up_matmul); // Third GEMM (down) auto down_matmul = std::make_shared( swiglu, - op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}), + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}), false, - false); + true); auto experts_out_reshape = std::make_shared( down_matmul, @@ -400,11 +400,11 @@ inline std::shared_ptr build_fused_3gemm_moe_reference_model() { // MOE fused op auto w0_weight = - op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}); + op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}); auto w1_weight = - op::v0::Constant::create(element::f32, Shape{number_of_experts, 
hidden_size, intermediate_size}, {1.0f}); - auto w2_weight = op::v0::Constant::create(element::f32, Shape{number_of_experts, intermediate_size, hidden_size}, {1.0f}); + auto w2_weight = + op::v0::Constant::create(element::f32, Shape{number_of_experts, hidden_size, intermediate_size}, {1.0f}); ov::OutputVector moe_inputs = {input, unsqueeze_routing_weights, router_topk_indices, w0_weight, w1_weight, w2_weight};