From 93a6e3848fe908f17b684beff8c0b3003caa2453 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Sat, 14 Jun 2025 17:28:23 -0700 Subject: [PATCH] Catch model import failure and report the appropriate error --- .../providers/openvino/backend_manager.cc | 46 +- .../core/providers/openvino/exceptions.h | 82 +++ .../openvino/openvino_execution_provider.cc | 217 +++--- .../core/providers/openvino/ov_interface.cc | 675 +++++++++--------- 4 files changed, 558 insertions(+), 462 deletions(-) create mode 100644 onnxruntime/core/providers/openvino/exceptions.h diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index cadeab4cbd4cc..1f6d409d1302d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -20,6 +20,7 @@ #include "core/providers/openvino/ov_interface.h" #include "core/providers/openvino/ov_versions/capability.h" #include "core/providers/openvino/qdq_transformations/qdq_stripping.h" +#include "core/providers/openvino/exceptions.h" #include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h" namespace onnxruntime { @@ -154,30 +155,31 @@ BackendManager::BackendManager(SessionContext& session_context, subgraph_context_, shared_context_, model_stream); - } catch (const OnnxRuntimeException& ex) { - std::string exception_str = ex.what(); - - if (session_context_.device_type.find("NPU") != std::string::npos && - exception_str.find("intel_npu") != std::string::npos) { - // Handle NPU device related errors -#ifndef NDEBUG - ORT_THROW(exception_str + "\nModel needs to be recompiled\n"); -#else - std::string error_message = "UNKNOWN NPU ERROR"; - std::string error_code = "code 0x0"; - std::regex error_message_pattern(R"(\bZE_\w*\b)"); - std::regex error_code_pattern("code 0x[0-9a-fA-F]+"); - std::smatch matches; - if (std::regex_search(exception_str, matches, error_message_pattern)) { - error_message = matches[0]; - } - if (std::regex_search(exception_str, matches, error_code_pattern)) { - error_code = matches[0]; + } catch (const ovep_exception& ex) { +#ifndef OPENVINO_DISABLE_NPU_FALLBACK + bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos && + !session_context_.so_disable_cpu_ep_fallback && + !subgraph_context_.is_ep_ctx_graph; + if (eligible_for_cpu_fallback) { + std::string exception_str = ex.what(); + LOGS_DEFAULT(VERBOSE) << exception_str; + LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
+                            << "Falling back to OV CPU for execution";
+      session_context_.device_type = "CPU";
+      session_context_.precision = "FP32";
+      try {
+        concrete_backend_ = BackendFactory::MakeBackend(model_proto,
+                                                        session_context_,
+                                                        subgraph_context_,
+                                                        shared_context_,
+                                                        model_stream);
+      } catch (std::string const& msg) {
+        ORT_THROW(msg);
       }
-      throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
+    } else
 #endif
-  } else {
-    ORT_THROW(exception_str);
+    {
+      throw ex;
     }
   }
 }
diff --git a/onnxruntime/core/providers/openvino/exceptions.h b/onnxruntime/core/providers/openvino/exceptions.h
new file mode 100644
index 0000000000000..0f1737ff22cad
--- /dev/null
+++ b/onnxruntime/core/providers/openvino/exceptions.h
@@ -0,0 +1,82 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+
+#include <charconv>
+#include <regex>
+#include <string>
+
+#include "core/common/status.h"
+
+namespace onnxruntime {
+namespace openvino_ep {
+
+struct ovep_exception : public std::exception {
+  enum class type {
+    compile_model,
+    import_model,
+    query_prop,
+    read_model,
+    unknown,
+  };
+
+  ovep_exception(const std::string& message,
+                 enum class type type) : message_{message},
+                                         type_{type},
+                                         error_code_{ze_result_code_from_string(message)},
+                                         error_name_{ze_result_name_from_string(message)} {}
+
+  const char* what() const noexcept override {
+    return message_.data();
+  }
+
+  uint32_t get_code() const { return error_code_; }
+
+  operator common::Status() const {
+    common::StatusCategory category_ort{common::ONNXRUNTIME};
+
+    if (type_ == type::unknown) {
+      return {category_ort, common::FAIL, message_};
+    }
+
+    // Newer drivers
+    if ((type_ == type::import_model) &&
+        (error_code_ == 0x7800000f /* ZE_RESULT_ERROR_INVALID_NATIVE_BINARY */)) {
+      std::string message{error_name_ + ", code 0x" + std::to_string(error_code_) + "\nModel needs to be recompiled\n"};
+      return {category_ort, common::INVALID_GRAPH, message};
+    }
+
+    std::string error_message = "Unhandled exception type: " + std::to_string(static_cast<int>(type_));
+    return {category_ort, common::FAIL, error_message};
+  }
+
+ protected:
+  std::string message_;
+  type type_{type::unknown};
+  uint32_t error_code_{0};
+  std::string error_name_;
+
+ private:
+  uint32_t ze_result_code_from_string(const std::string& ov_exception_string) {
+    uint32_t error_code{0};
+    std::regex error_code_pattern("code 0x([0-9a-fA-F]+)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_code_pattern)) {
+      std::from_chars(&(*matches[1].first), &(*matches[1].second), error_code, 16);
+    }
+    return error_code;
+  }
+  std::string ze_result_name_from_string(const std::string& ov_exception_string) {
+    std::string error_message = "UNKNOWN NPU ERROR";
+    std::regex error_message_pattern(R"(\bZE_\w*\b)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_message_pattern)) {
+      error_message = matches[0];
+    }
+    return error_message;
+  }
+};
+
+} // namespace openvino_ep
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index a0fa885cbfc38..cd8f1a51147be 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -12,6 +12,7 @@
 #include "core/providers/openvino/onnx_ctx_model_helper.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include
"core/providers/openvino/qdq_transformations/qdq_stripping.h" +#include "core/providers/openvino/exceptions.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "openvino/core/version.hpp" #ifdef USE_OVEP_NPU_MEMORY @@ -94,124 +95,128 @@ common::Status OpenVINOExecutionProvider::Compile( auto& logger = *GetLogger(); Status status = Status::OK(); - bool is_epctx_model = false; - if (!fused_nodes.empty()) { - // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext - const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); - session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); - session_context_.onnx_opset_version = - graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); - - // OVIR wrapped in epctx should be treated as source but this code does not - // This corner case is not in use and will be addressed in a future commit - is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0); - } - - // The block below is executed during EP context model inference - auto& metadata = shared_context_->shared_weights.metadata; // Metadata object in memory - if (session_context_.so_share_ep_contexts && - is_epctx_model && - metadata.empty()) { - fs::path context_model_file_path = session_context_.so_context_file_path; - if (context_model_file_path.empty()) { - // If ep.context_file_path is not set the input model path is used - context_model_file_path = session_context_.onnx_model_path_name; + try { + bool is_epctx_model = false; + if (!fused_nodes.empty()) { + // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext + const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); + session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); + session_context_.onnx_opset_version = + graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); + + // OVIR wrapped in epctx should be treated as source but this code does not + // This corner case is not in use and will be addressed in a future commit + is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0); } - // Metadata is always read from model location, this could be a source or epctx model - fs::path metadata_filename = context_model_file_path.stem().string() + "_metadata.bin"; - fs::path metadata_file_path = context_model_file_path.parent_path() / metadata_filename; - std::ifstream file(metadata_file_path, std::ios::binary); - ORT_RETURN_IF_NOT(file, "Metadata file was not found: " + metadata_file_path.string()); - shared_context_->shared_weights.metadata_filepath = std::move(metadata_file_path); - file >> metadata; - } - - struct OpenVINOEPFunctionState { - AllocateFunc allocate_func = nullptr; - DestroyFunc destroy_func = nullptr; - AllocatorHandle allocator_handle = nullptr; - BackendManager& backend_manager; - }; - - for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { - const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; - const Node& fused_node = fused_node_graph.fused_node; - - NodeComputeInfo compute_info; - - // During backend creation, we check if user wants to use precompiled blob onnx model or the original model - // For precompiled blob, directly load the model instead of compiling the model - // For original model, check if the user wants to export a model with pre-compiled blob - - auto& backend_manager = backend_managers_.emplace_back(session_context_, - 
*shared_context_, - fused_node, - graph_body_viewer, - logger, - ep_ctx_handle_); - - compute_info.create_state_func = - [&backend_manager](ComputeContext* context, FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ - .allocate_func = context->allocate_func, - .destroy_func = context->release_func, - .allocator_handle = context->allocator_handle, - .backend_manager = backend_manager}; - *state = static_cast(p); - return 0; - }; - - compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { - auto function_state = static_cast(state); - try { - function_state->backend_manager.Compute(context); - } catch (const std::exception& ex) { - return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); + // The block below is executed during EP context model inference + auto& metadata = shared_context_->shared_weights.metadata; // Metadata object in memory + if (session_context_.so_share_ep_contexts && + is_epctx_model && + metadata.empty()) { + fs::path context_model_file_path = session_context_.so_context_file_path; + if (context_model_file_path.empty()) { + // If ep.context_file_path is not set the input model path is used + context_model_file_path = session_context_.onnx_model_path_name; } - return Status::OK(); + + // Metadata is always read from model location, this could be a source or epctx model + fs::path metadata_filename = context_model_file_path.stem().string() + "_metadata.bin"; + fs::path metadata_file_path = context_model_file_path.parent_path() / metadata_filename; + std::ifstream file(metadata_file_path, std::ios::binary); + ORT_RETURN_IF_NOT(file, "Metadata file was not found: " + metadata_file_path.string()); + shared_context_->shared_weights.metadata_filepath = std::move(metadata_file_path); + file >> metadata; + } + + struct OpenVINOEPFunctionState { + AllocateFunc allocate_func = nullptr; + DestroyFunc destroy_func = nullptr; + AllocatorHandle allocator_handle = nullptr; + BackendManager& backend_manager; }; - compute_info.release_state_func = - [](FunctionState state) { - if (state) { - OpenVINOEPFunctionState* function_state = static_cast(state); - delete function_state; - } - }; + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { + const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; + const Node& fused_node = fused_node_graph.fused_node; + + NodeComputeInfo compute_info; + + // During backend creation, we check if user wants to use precompiled blob onnx model or the original model + // For precompiled blob, directly load the model instead of compiling the model + // For original model, check if the user wants to export a model with pre-compiled blob + + auto& backend_manager = backend_managers_.emplace_back(session_context_, + *shared_context_, + fused_node, + graph_body_viewer, + logger, + ep_ctx_handle_); + + compute_info.create_state_func = + [&backend_manager](ComputeContext* context, FunctionState* state) { + OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ + .allocate_func = context->allocate_func, + .destroy_func = context->release_func, + .allocator_handle = context->allocator_handle, + .backend_manager = backend_manager}; + *state = static_cast(p); + return 0; + }; + + compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { + auto function_state = static_cast(state); + try { + function_state->backend_manager.Compute(context); + } catch (const std::exception& ex) { + return 
common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); + } + return Status::OK(); + }; - node_compute_funcs.push_back(std::move(compute_info)); + compute_info.release_state_func = + [](FunctionState state) { + if (state) { + OpenVINOEPFunctionState* function_state = static_cast(state); + delete function_state; + } + }; - if (!status.IsOK()) { - break; + node_compute_funcs.push_back(std::move(compute_info)); + + if (!status.IsOK()) { + break; + } } - } - // The block below is executed during EP context model generation - if (session_context_.so_context_enable && - session_context_.so_share_ep_contexts && - !metadata.empty()) { - // For models after the first the metadata name comes from the shared context - fs::path metadata_file_path = shared_context_->shared_weights.metadata_filepath; - if (metadata_file_path.empty()) { - metadata_file_path = session_context_.so_context_file_path; - std::string name_append{"_metadata.bin"}; + // The block below is executed during EP context model generation + if (session_context_.so_context_enable && + session_context_.so_share_ep_contexts && + !metadata.empty()) { + // For models after the first the metadata name comes from the shared context + fs::path metadata_file_path = shared_context_->shared_weights.metadata_filepath; if (metadata_file_path.empty()) { - metadata_file_path = session_context_.onnx_model_path_name; - name_append = "_ctx" + name_append; + metadata_file_path = session_context_.so_context_file_path; + std::string name_append{"_metadata.bin"}; + if (metadata_file_path.empty()) { + metadata_file_path = session_context_.onnx_model_path_name; + name_append = "_ctx" + name_append; + } + auto metadata_filename = metadata_file_path.stem().string() + name_append; + metadata_file_path.replace_filename(metadata_filename); + shared_context_->shared_weights.metadata_filepath = metadata_file_path; } - auto metadata_filename = metadata_file_path.stem().string() + name_append; - metadata_file_path.replace_filename(metadata_filename); - shared_context_->shared_weights.metadata_filepath = metadata_file_path; - } - // Metadata is generated only for shared contexts - // If saving metadata then save it to the provided path or use the original model path - // Multiple calls to Compile() will update the metadata and for the last call - // the resulting file will contain the aggregated content - std::ofstream file{metadata_file_path, std::ios::binary}; - ORT_RETURN_IF_NOT(file, "Metadata file could not be written: ", metadata_file_path); - file << metadata; + // Metadata is generated only for shared contexts + // If saving metadata then save it to the provided path or use the original model path + // Multiple calls to Compile() will update the metadata and for the last call + // the resulting file will contain the aggregated content + std::ofstream file{metadata_file_path, std::ios::binary}; + ORT_RETURN_IF_NOT(file, "Metadata file could not be written: ", metadata_file_path); + file << metadata; + } + } catch (const ovep_exception& ex) { + status = ex; } return status; diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 2d29df8eb4197..2af77992da305 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -11,18 +11,25 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/backends/basic_backend.h" #include "core/providers/openvino/ov_stateful_patch_utils.h" +#include 
"core/providers/openvino/exceptions.h" namespace onnxruntime { namespace openvino_ep { -template -inline auto OvExceptionBoundary(Func&& func, std::format_string&& fmt, Args&&... args) { +template +inline auto OvExceptionBoundary(Func&& func, Args&&... args) { try { return func(); } catch (const ov::Exception& e) { - ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what())); + const auto message = log_tag + (args + ...) + ": " + std::string(e.what()); + if constexpr (typed) { + ORT_THROW_EX(ovep_exception, message, ovep_exception::type::import_model); + } else { + ORT_THROW(message); + } } catch (...) { - ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...))); + const auto message = log_tag + (args + ...); + ORT_THROW(message); } } @@ -57,105 +64,105 @@ void printDebugInfo(const ov::CompiledModel& obj) { } #endif -// Function to check if a given OV property is enabled -std::optional queryOVProperty(const std::string& property, const std::string& device_type) { - try { - // Get the property value - auto supported_properties = OVCore::Get()->core.get_property(device_type, ov::supported_properties); - return std::find(supported_properties.begin(), supported_properties.end(), property) != supported_properties.end(); - } catch (const std::exception&) { - return std::nullopt; // Property not found or invalid + // Function to check if a given OV property is enabled + std::optional queryOVProperty(const std::string& property, const std::string& device_type) { + try { + // Get the property value + auto supported_properties = OVCore::Get()->core.get_property(device_type, ov::supported_properties); + return std::find(supported_properties.begin(), supported_properties.end(), property) != supported_properties.end(); + } catch (const std::exception&) { + return std::nullopt; // Property not found or invalid + } } -} -std::shared_ptr OVCore::ReadModel(std::string&& model, const std::string& model_path) { - return OvExceptionBoundary([&]() { - std::istringstream modelStringStream(std::move(model)); - std::istream& modelStream = modelStringStream; - // Try to load with FrontEndManager - ov::frontend::FrontEndManager manager; - ov::frontend::FrontEnd::Ptr FE; - ov::frontend::InputModel::Ptr inputModel; - - ov::AnyVector params{&modelStream, model_path}; - - FE = manager.load_by_model(params); - if (FE) { - inputModel = FE->load(params); - return FE->convert(inputModel); - } else { - ORT_THROW(log_tag + "Unknown exception while Reading network"); - } - }, - "Exception while Reading network"); -} + std::shared_ptr OVCore::ReadModel(std::string && model, const std::string& model_path) { + return OvExceptionBoundary([&]() { + std::istringstream modelStringStream(std::move(model)); + std::istream& modelStream = modelStringStream; + // Try to load with FrontEndManager + ov::frontend::FrontEndManager manager; + ov::frontend::FrontEnd::Ptr FE; + ov::frontend::InputModel::Ptr inputModel; + + ov::AnyVector params{&modelStream, model_path}; + + FE = manager.load_by_model(params); + if (FE) { + inputModel = FE->load(params); + return FE->convert(inputModel); + } else { + ORT_THROW(log_tag + "Unknown exception while Reading network"); + } + }, + "Exception while Reading network"); + } -OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model, - std::string& hw_target, - const ov::AnyMap& device_config) { - ov::CompiledModel compiled_model; - ov::AnyMap config = device_config; + OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr & 
model, + std::string & hw_target, + const ov::AnyMap& device_config) { + ov::CompiledModel compiled_model; + ov::AnyMap config = device_config; - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateless OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); - } + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateless OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } - bool model_status = IsStateful(model); - LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); - if (!model_status) { - LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; - PatchStatefulDecoder(model); - } + bool model_status = IsStateful(model); + LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); + if (!model_status) { + LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; + PatchStatefulDecoder(model); + } - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateful OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); - } + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateful OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } - auto kv_pos = GetKVAxesPos(model); + auto kv_pos = GetKVAxesPos(model); - if (hw_target.find("NPU") != std::string::npos) { - KVDesc kv_desc; - auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { - return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; - }; + if (hw_target.find("NPU") != std::string::npos) { + KVDesc kv_desc; + auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { + return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? 
config.at(key).as() : default_value; + }; - kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); - kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); + kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); + kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); - // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 - if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { - ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); - } + // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 + if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { + ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + } - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; - std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; - std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; - std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; + std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; + std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; + std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + } + + UpdateNPUConfig(config, kv_pos, kv_desc); + } else { + // This patches the OV IR model so that it only produces the logits required for sampling. + // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, + // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). + ApplySliceBeforeMatmulTransformation(model); } - UpdateNPUConfig(config, kv_pos, kv_desc); - } else { - // This patches the OV IR model so that it only produces the logits required for sampling. - // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, - // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
- ApplySliceBeforeMatmulTransformation(model); + LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; + compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); + OVExeNetwork exe(compiled_model, hw_target, true); + return exe; } - LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; - compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); - OVExeNetwork exe(compiled_model, hw_target, true); - return exe; -} - OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_network, std::string& hw_target, ov::AnyMap& device_config, bool enable_causallm, const std::string& name) { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { OVExeNetwork exe; if (enable_causallm) { auto mutable_model = ie_cnn_network->clone(); @@ -166,148 +173,148 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo } #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; + OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; - obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG - printDebugInfo(obj); + printDebugInfo(obj); #endif - OVExeNetwork exe(obj, hw_target); - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + OVExeNetwork exe(obj, hw_target); + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::ImportModel(std::istream& model_stream, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; - obj = core.import_model(model_stream, hw_target, device_config); - OVExeNetwork exe(obj, hw_target); + OVExeNetwork OVCore::ImportModel(std::istream & model_stream, + std::string hw_target, + const ov::AnyMap& device_config, + std::string name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; + obj = core.import_model(model_stream, hw_target, device_config); + OVExeNetwork exe(obj, hw_target); #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} - -OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream, - std::string& hw_target, - const ov::AnyMap& device_config, - bool enable_causallm, - std::filesystem::path model_file_path) { - return OvExceptionBoundary([&]() { - OVExeNetwork exe; - - bool isXML = backend_utils::IsModelStreamXML(model_stream); + return exe; + }, + "Exception while Loading Network for graph {}", name); + } - // Helper function to check if file exists and is readable - const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { - try { - if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { - ORT_THROW(log_tag + "Required file missing or empty: " + 
path.string()); - } - std::ifstream file(path); - if (!file) { - ORT_THROW(log_tag + "Required file not readable: " + path.string()); + OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream & model_stream, + std::string & hw_target, + const ov::AnyMap& device_config, + bool enable_causallm, + std::filesystem::path model_file_path) { + return OvExceptionBoundary([&]() { + OVExeNetwork exe; + + bool isXML = backend_utils::IsModelStreamXML(model_stream); + + // Helper function to check if file exists and is readable + const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { + try { + if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { + ORT_THROW(log_tag + "Required file missing or empty: " + path.string()); + } + std::ifstream file(path); + if (!file) { + ORT_THROW(log_tag + "Required file not readable: " + path.string()); + } + } catch (const std::exception& e) { + ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); } - } catch (const std::exception& e) { - ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); - } - }; + }; - if (isXML) { - // If the model is XML, we need to load it with the XML content in read_model() - // where weights from bin file is directly consumed - auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); + if (isXML) { + // If the model is XML, we need to load it with the XML content in read_model() + // where weights from bin file is directly consumed + auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); - check_file_access(xml_file_path); + check_file_access(xml_file_path); - LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); + LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); - // Load the model explicitly with XML contents - std::shared_ptr model = core.read_model(xml_file_path.string()); + // Load the model explicitly with XML contents + std::shared_ptr model = core.read_model(xml_file_path.string()); - if (enable_causallm) { - exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); - } else { - auto obj = core.compile_model(model, hw_target, device_config); - exe = OVExeNetwork(obj, hw_target); + if (enable_causallm) { + exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); + } else { + auto obj = core.compile_model(model, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } } - } #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); -} + return exe; + }, + "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); + } void OVCore::SetCache(const std::string& cache_dir_path) { core.set_property(ov::cache_dir(cache_dir_path)); } -std::vector OVCore::GetAvailableDevices() const { - std::vector available_devices = core.get_available_devices(); - return available_devices; -} - -std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { - std::vector available_devices; - std::vector devicesIDs; - // Uses logic from OpenVINO to only return available devices of the specified type (e.g. 
CPU, NPU or GPU) - try { - devicesIDs = core.get_property(device_type, ov::available_devices); - } catch (const ov::Exception&) { - // plugin is not created by e.g. invalid env - // Empty device list will be returned - } catch (const std::exception& ex) { - ORT_THROW(log_tag + "An exception occurred while trying to create the ", - device_type, - " device: ", - ex.what()); - } catch (...) { - ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", - device_type, - " device"); + std::vector OVCore::GetAvailableDevices() const { + std::vector available_devices = core.get_available_devices(); + return available_devices; } - if (devicesIDs.size() > 1 || - (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { - for (const auto& deviceID : devicesIDs) { - available_devices.push_back(device_type + '.' + deviceID); + std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { + std::vector available_devices; + std::vector devicesIDs; + // Uses logic from OpenVINO to only return available devices of the specified type (e.g. CPU, NPU or GPU) + try { + devicesIDs = core.get_property(device_type, ov::available_devices); + } catch (const ov::Exception&) { + // plugin is not created by e.g. invalid env + // Empty device list will be returned + } catch (const std::exception& ex) { + ORT_THROW(log_tag + "An exception occurred while trying to create the ", + device_type, + " device: ", + ex.what()); + } catch (...) { + ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", + device_type, + " device"); } - } - if (!devicesIDs.empty()) { - available_devices.push_back(device_type); - } - return available_devices; -} + if (devicesIDs.size() > 1 || + (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { + for (const auto& deviceID : devicesIDs) { + available_devices.push_back(device_type + '.' 
+ deviceID); + } + } + if (!devicesIDs.empty()) { + available_devices.push_back(device_type); + } -void OVCore::SetStreams(const std::string& device_type, int num_streams) { - core.set_property(device_type, {ov::num_streams(num_streams)}); -} + return available_devices; + } + + void OVCore::SetStreams(const std::string& device_type, int num_streams) { + core.set_property(device_type, {ov::num_streams(num_streams)}); + } std::shared_ptr OVExeNetwork::CreateInferRequest() { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { auto infReq = compiled_model_obj.create_infer_request(); std::shared_ptr ovInfReq; if (is_stateful_causallm) { @@ -318,191 +325,191 @@ std::shared_ptr OVExeNetwork::CreateInferRequest() { return ovInfReq; }, - "Exception while creating InferRequest object"); -} + "Exception while creating InferRequest object"); + } -OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { - return OvExceptionBoundary([&]() { - auto tobj = ovInfReq.get_tensor(input_name); - OVTensorPtr blob = std::make_shared(tobj); - return blob; - }, - " Cannot access IE Blob for input: {}", input_name); -} + OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { + return OvExceptionBoundary([&]() { + auto tobj = ovInfReq.get_tensor(input_name); + OVTensorPtr blob = std::make_shared(tobj); + return blob; + }, + " Cannot access IE Blob for input: {}", input_name); + } std::string OVInferRequest::GetInputTensorName(uint32_t index) { - return OvExceptionBoundary([&]() -> const std::string& { + return OvExceptionBoundary([&]() { const auto& model = ovInfReq.get_compiled_model(); return *model.input(index).get_names().begin(); }, " Cannot access IE Blob for input number: {}", index); } -void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { - OvExceptionBoundary([&]() { - ovInfReq.set_tensor(name, *(blob.get())); - }, - " Cannot set Remote Blob for output: {}", name); -} + void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { + OvExceptionBoundary([&]() { + ovInfReq.set_tensor(name, *(blob.get())); + }, + " Cannot set Remote Blob for output: {}", name); + } -uint32_t OVInferRequest::GetNumInputs() { - return static_cast(ovInfReq.get_compiled_model().inputs().size()); -} + uint32_t OVInferRequest::GetNumInputs() { + return static_cast(ovInfReq.get_compiled_model().inputs().size()); + } -void OVInferRequest::Infer() { - OvExceptionBoundary([&]() { - ovInfReq.infer(); - }, - "In Error Couldn't start Inference"); -} + void OVInferRequest::Infer() { + OvExceptionBoundary([&]() { + ovInfReq.infer(); + }, + "In Error Couldn't start Inference"); + } -StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) - : OVInferRequest(std::move(infer_request)), target_device(device) { - bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); - if (gpu_or_npu) { - prefill_use_full_chat_history = true; + StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) + : OVInferRequest(std::move(infer_request)), target_device(device) { + bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); + if (gpu_or_npu) { + prefill_use_full_chat_history = true; + } } -} -void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, - const std::vector& shape, int32_t fill_value) { - ov::Tensor tensor = ov::Tensor(type, 
shape); - std::fill_n(tensor.data(), tensor.get_size(), fill_value); - ovInfReq.set_tensor(tensor_name, tensor); -} + void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, + const std::vector& shape, int32_t fill_value) { + ov::Tensor tensor = ov::Tensor(type, shape); + std::fill_n(tensor.data(), tensor.get_size(), fill_value); + ovInfReq.set_tensor(tensor_name, tensor); + } -void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto* pData = tensor.data(); - for (size_t i = 0; i < tensor.get_size(); i++) { - cache.emplace_back(pData[i]); + void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto* pData = tensor.data(); + for (size_t i = 0; i < tensor.get_size(); i++) { + cache.emplace_back(pData[i]); + } } -} -void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, - const std::vector& cache_data) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto new_shape = tensor.get_shape(); - new_shape[1] = cache_data.size(); + void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, + const std::vector& cache_data) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto new_shape = tensor.get_shape(); + new_shape[1] = cache_data.size(); - auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); - auto* pNewData = new_tensor.data(); - std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); + auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); + auto* pNewData = new_tensor.data(); + std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); - ovInfReq.set_tensor(tensor_name, new_tensor); -} + ovInfReq.set_tensor(tensor_name, new_tensor); + } -std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { - // Check if tensor exists by examining input names in the compiled model - const auto& model = ovInfReq.get_compiled_model(); - bool tensor_exists = false; + std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { + // Check if tensor exists by examining input names in the compiled model + const auto& model = ovInfReq.get_compiled_model(); + bool tensor_exists = false; - for (const auto& input : model.inputs()) { - const auto& names = input.get_names(); - if (names.find(tensor_name) != names.end()) { - tensor_exists = true; - break; + for (const auto& input : model.inputs()) { + const auto& names = input.get_names(); + if (names.find(tensor_name) != names.end()) { + tensor_exists = true; + break; + } } - } - if (tensor_exists) { - return ovInfReq.get_tensor(tensor_name); - } + if (tensor_exists) { + return ovInfReq.get_tensor(tensor_name); + } - return std::nullopt; -} + return std::nullopt; + } -void StatefulOVInferRequest::PreProcessInferRequest() { - // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. - // TODO(ankit): Address this issue and implement the fix at the appropriate layer. - FillTensor("beam_idx", ov::element::i32, {1}, 0); + void StatefulOVInferRequest::PreProcessInferRequest() { + // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. + // TODO(ankit): Address this issue and implement the fix at the appropriate layer. 
+ FillTensor("beam_idx", ov::element::i32, {1}, 0); - // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. - if (prefill_use_full_chat_history) { - auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); - CacheTensor("input_ids", cached_input_ids); + // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. + if (prefill_use_full_chat_history) { + auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); + CacheTensor("input_ids", cached_input_ids); - // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists - auto position_ids_opt = FindTensor("position_ids"); - bool has_position_ids = position_ids_opt.has_value(); + // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists + auto position_ids_opt = FindTensor("position_ids"); + bool has_position_ids = position_ids_opt.has_value(); - if (has_position_ids) { - CacheTensor("position_ids", cached_position_ids); - } + if (has_position_ids) { + CacheTensor("position_ids", cached_position_ids); + } - // If we're about to run the prefill model - if (input_ids_tensor.get_size() > 1) { - // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". - // This indicates that we are running a subsequent prompt (not the initial prefill). - if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + // If we're about to run the prefill model + if (input_ids_tensor.get_size() > 1) { + // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". + // This indicates that we are running a subsequent prompt (not the initial prefill). + if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. + ovInfReq.reset_state(); - // Set tensors using cached values - SetTensorFromCache("input_ids", cached_input_ids); + // Set tensors using cached values + SetTensorFromCache("input_ids", cached_input_ids); - // Only set position_ids if it exists and we have cached values - if (has_position_ids && !cached_position_ids.empty()) { - SetTensorFromCache("position_ids", cached_position_ids); + // Only set position_ids if it exists and we have cached values + if (has_position_ids && !cached_position_ids.empty()) { + SetTensorFromCache("position_ids", cached_position_ids); + } } } } } -} -void StatefulOVInferRequest::Infer() { - PreProcessInferRequest(); - OVInferRequest::Infer(); -} + void StatefulOVInferRequest::Infer() { + PreProcessInferRequest(); + OVInferRequest::Infer(); + } -void StatefulOVInferRequest::RewindKVCache(size_t index) { - LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; + void StatefulOVInferRequest::RewindKVCache(size_t index) { + LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; - if (prefill_use_full_chat_history) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + if (prefill_use_full_chat_history) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. + ovInfReq.reset_state(); - // Resize the cached "input_ids" and "position_ids" to the specified index. 
- if (cached_input_ids.size() > index) { - cached_input_ids.resize(index); - } + // Resize the cached "input_ids" and "position_ids" to the specified index. + if (cached_input_ids.size() > index) { + cached_input_ids.resize(index); + } - if (cached_position_ids.size() > index) { - cached_position_ids.resize(index); - } - } else { - if (index == 0) { - // In this case, since we're resetting the entire KVCache, simply reset the state. - ovInfReq.reset_state(); + if (cached_position_ids.size() > index) { + cached_position_ids.resize(index); + } } else { - // Retrieve KVCache states and trim them to the specified index. - // The following logic is adapted from: - // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 - auto states = ovInfReq.query_state(); - for (auto& state : states) { - ov::Tensor old_tensor = state.get_state(); - // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] - auto shape = old_tensor.get_shape(); - - if (shape[2] > index) { - // Update the sequence length dimension to the specified index. - shape[2] = index; - - ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{shape}; - - // Create a trimmed tensor with the updated shape. - auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); - - // Copy the trimmed tensor into a new tensor and update the state. - ov::Tensor new_tensor(old_tensor.get_element_type(), shape); - trimmed_tensor.copy_to(new_tensor); - - state.set_state(new_tensor); + if (index == 0) { + // In this case, since we're resetting the entire KVCache, simply reset the state. + ovInfReq.reset_state(); + } else { + // Retrieve KVCache states and trim them to the specified index. + // The following logic is adapted from: + // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 + auto states = ovInfReq.query_state(); + for (auto& state : states) { + ov::Tensor old_tensor = state.get_state(); + // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] + auto shape = old_tensor.get_shape(); + + if (shape[2] > index) { + // Update the sequence length dimension to the specified index. + shape[2] = index; + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + // Create a trimmed tensor with the updated shape. + auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); + + // Copy the trimmed tensor into a new tensor and update the state. + ov::Tensor new_tensor(old_tensor.get_element_type(), shape); + trimmed_tensor.copy_to(new_tensor); + + state.set_state(new_tensor); + } } } } } -} } // namespace openvino_ep } // namespace onnxruntime
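
Reviewer note (not part of the patch): the sketch below is a standalone illustration of the Level Zero error-string parsing that the new ovep_exception performs. The sample message is hypothetical and only stands in for whatever the intel_npu driver actually reports; the two regexes and the std::from_chars call are the ones added in exceptions.h above.

#include <charconv>
#include <iostream>
#include <regex>
#include <string>

// Mirrors ovep_exception::ze_result_code_from_string: extract the hex value after "code 0x".
static uint32_t ze_result_code_from_string(const std::string& s) {
  uint32_t error_code{0};
  std::regex error_code_pattern("code 0x([0-9a-fA-F]+)");
  std::smatch matches;
  if (std::regex_search(s, matches, error_code_pattern)) {
    std::from_chars(&(*matches[1].first), &(*matches[1].second), error_code, 16);
  }
  return error_code;
}

// Mirrors ovep_exception::ze_result_name_from_string: pick out the ZE_* result name, if present.
static std::string ze_result_name_from_string(const std::string& s) {
  std::string error_message = "UNKNOWN NPU ERROR";
  std::regex error_message_pattern(R"(\bZE_\w*\b)");
  std::smatch matches;
  if (std::regex_search(s, matches, error_message_pattern)) {
    error_message = matches[0];
  }
  return error_message;
}

int main() {
  // Hypothetical driver message; real messages may be worded differently.
  const std::string sample =
      "Exception from compile: ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, code 0x7800000f reported by driver.";
  std::cout << ze_result_name_from_string(sample) << "\n";              // ZE_RESULT_ERROR_INVALID_NATIVE_BINARY
  std::cout << std::hex << ze_result_code_from_string(sample) << "\n";  // 7800000f
  return 0;
}

With type_ == type::import_model and the 0x7800000f code, the conversion operator in exceptions.h maps the failure to common::INVALID_GRAPH and appends "Model needs to be recompiled", instead of surfacing a generic FAIL status.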