From 93a6e3848fe908f17b684beff8c0b3003caa2453 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Sat, 14 Jun 2025 17:28:23 -0700 Subject: [PATCH] Catch model import failure and report the appropriate error --- .../providers/openvino/backend_manager.cc | 46 +- .../core/providers/openvino/exceptions.h | 82 +++ .../openvino/openvino_execution_provider.cc | 217 +++--- .../core/providers/openvino/ov_interface.cc | 675 +++++++++--------- 4 files changed, 558 insertions(+), 462 deletions(-) create mode 100644 onnxruntime/core/providers/openvino/exceptions.h diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index cadeab4cbd4cc..1f6d409d1302d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -20,6 +20,7 @@ #include "core/providers/openvino/ov_interface.h" #include "core/providers/openvino/ov_versions/capability.h" #include "core/providers/openvino/qdq_transformations/qdq_stripping.h" +#include "core/providers/openvino/exceptions.h" #include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h" namespace onnxruntime { @@ -154,30 +155,31 @@ BackendManager::BackendManager(SessionContext& session_context, subgraph_context_, shared_context_, model_stream); - } catch (const OnnxRuntimeException& ex) { - std::string exception_str = ex.what(); - - if (session_context_.device_type.find("NPU") != std::string::npos && - exception_str.find("intel_npu") != std::string::npos) { - // Handle NPU device related errors -#ifndef NDEBUG - ORT_THROW(exception_str + "\nModel needs to be recompiled\n"); -#else - std::string error_message = "UNKNOWN NPU ERROR"; - std::string error_code = "code 0x0"; - std::regex error_message_pattern(R"(\bZE_\w*\b)"); - std::regex error_code_pattern("code 0x[0-9a-fA-F]+"); - std::smatch matches; - if (std::regex_search(exception_str, matches, error_message_pattern)) { - error_message = matches[0]; - } - if (std::regex_search(exception_str, matches, error_code_pattern)) { - error_code = matches[0]; + } catch (const ovep_exception& ex) { +#ifndef OPENVINO_DISABLE_NPU_FALLBACK + bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos && + !session_context_.so_disable_cpu_ep_fallback && + !subgraph_context_.is_ep_ctx_graph; + if (eligible_for_cpu_fallback) { + std::string exception_str = ex.what(); + LOGS_DEFAULT(VERBOSE) << exception_str; + LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
+                            << "Falling back to OV CPU for execution";
+      session_context_.device_type = "CPU";
+      session_context_.precision = "FP32";
+      try {
+        concrete_backend_ = BackendFactory::MakeBackend(model_proto,
+                                                        session_context_,
+                                                        subgraph_context_,
+                                                        shared_context_,
+                                                        model_stream);
+      } catch (std::string const& msg) {
+        ORT_THROW(msg);
       }
-      throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
+    } else
 #endif
-  } else {
-    ORT_THROW(exception_str);
+    {
+      throw ex;
     }
   }
 }
diff --git a/onnxruntime/core/providers/openvino/exceptions.h b/onnxruntime/core/providers/openvino/exceptions.h
new file mode 100644
index 0000000000000..0f1737ff22cad
--- /dev/null
+++ b/onnxruntime/core/providers/openvino/exceptions.h
@@ -0,0 +1,82 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+
+#include <charconv>
+#include <regex>
+#include <string>
+
+#include "core/common/status.h"
+
+namespace onnxruntime {
+namespace openvino_ep {
+
+struct ovep_exception : public std::exception {
+  enum class type {
+    compile_model,
+    import_model,
+    query_prop,
+    read_model,
+    unknown,
+  };
+
+  ovep_exception(const std::string& message,
+                 enum class type type) : message_{message},
+                                         type_{type},
+                                         error_code_{ze_result_code_from_string(message)},
+                                         error_name_{ze_result_name_from_string(message)} {}
+
+  const char* what() const noexcept override {
+    return message_.data();
+  }
+
+  uint32_t get_code() const { return error_code_; }
+
+  operator common::Status() const {
+    common::StatusCategory category_ort{common::ONNXRUNTIME};
+
+    if (type_ == type::unknown) {
+      return {category_ort, common::FAIL, message_};
+    }
+
+    // Newer drivers
+    if ((type_ == type::import_model) &&
+        (error_code_ == 0x7800000f /* ZE_RESULT_ERROR_INVALID_NATIVE_BINARY */)) {
+      std::string message{error_name_ + ", code 0x" + std::to_string(error_code_) + "\nModel needs to be recompiled\n"};
+      return {category_ort, common::INVALID_GRAPH, message};
+    }
+
+    std::string error_message = "Unhandled exception type: " + std::to_string(static_cast<int>(type_));
+    return {category_ort, common::FAIL, error_message};
+  }
+
+ protected:
+  std::string message_;
+  type type_{type::unknown};
+  uint32_t error_code_{0};
+  std::string error_name_;
+
+ private:
+  uint32_t ze_result_code_from_string(const std::string& ov_exception_string) {
+    uint32_t error_code{0};
+    std::regex error_code_pattern("code 0x([0-9a-fA-F]+)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_code_pattern)) {
+      std::from_chars(&(*matches[1].first), &(*matches[1].second), error_code, 16);
+    }
+    return error_code;
+  }
+  std::string ze_result_name_from_string(const std::string& ov_exception_string) {
+    std::string error_message = "UNKNOWN NPU ERROR";
+    std::regex error_message_pattern(R"(\bZE_\w*\b)");
+    std::smatch matches;
+    if (std::regex_search(ov_exception_string, matches, error_message_pattern)) {
+      error_message = matches[0];
+    }
+    return error_message;
+  }
+};
+
+} // namespace openvino_ep
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
index a0fa885cbfc38..cd8f1a51147be 100644
--- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
+++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -12,6 +12,7 @@
 #include "core/providers/openvino/onnx_ctx_model_helper.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include
"core/providers/openvino/qdq_transformations/qdq_stripping.h" +#include "core/providers/openvino/exceptions.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "openvino/core/version.hpp" #ifdef USE_OVEP_NPU_MEMORY @@ -94,124 +95,128 @@ common::Status OpenVINOExecutionProvider::Compile( auto& logger = *GetLogger(); Status status = Status::OK(); - bool is_epctx_model = false; - if (!fused_nodes.empty()) { - // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext - const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); - session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); - session_context_.onnx_opset_version = - graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); - - // OVIR wrapped in epctx should be treated as source but this code does not - // This corner case is not in use and will be addressed in a future commit - is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0); - } - - // The block below is executed during EP context model inference - auto& metadata = shared_context_->shared_weights.metadata; // Metadata object in memory - if (session_context_.so_share_ep_contexts && - is_epctx_model && - metadata.empty()) { - fs::path context_model_file_path = session_context_.so_context_file_path; - if (context_model_file_path.empty()) { - // If ep.context_file_path is not set the input model path is used - context_model_file_path = session_context_.onnx_model_path_name; + try { + bool is_epctx_model = false; + if (!fused_nodes.empty()) { + // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext + const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); + session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); + session_context_.onnx_opset_version = + graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); + + // OVIR wrapped in epctx should be treated as source but this code does not + // This corner case is not in use and will be addressed in a future commit + is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0); } - // Metadata is always read from model location, this could be a source or epctx model - fs::path metadata_filename = context_model_file_path.stem().string() + "_metadata.bin"; - fs::path metadata_file_path = context_model_file_path.parent_path() / metadata_filename; - std::ifstream file(metadata_file_path, std::ios::binary); - ORT_RETURN_IF_NOT(file, "Metadata file was not found: " + metadata_file_path.string()); - shared_context_->shared_weights.metadata_filepath = std::move(metadata_file_path); - file >> metadata; - } - - struct OpenVINOEPFunctionState { - AllocateFunc allocate_func = nullptr; - DestroyFunc destroy_func = nullptr; - AllocatorHandle allocator_handle = nullptr; - BackendManager& backend_manager; - }; - - for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { - const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; - const Node& fused_node = fused_node_graph.fused_node; - - NodeComputeInfo compute_info; - - // During backend creation, we check if user wants to use precompiled blob onnx model or the original model - // For precompiled blob, directly load the model instead of compiling the model - // For original model, check if the user wants to export a model with pre-compiled blob - - auto& backend_manager = backend_managers_.emplace_back(session_context_, - 
*shared_context_, - fused_node, - graph_body_viewer, - logger, - ep_ctx_handle_); - - compute_info.create_state_func = - [&backend_manager](ComputeContext* context, FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ - .allocate_func = context->allocate_func, - .destroy_func = context->release_func, - .allocator_handle = context->allocator_handle, - .backend_manager = backend_manager}; - *state = static_cast(p); - return 0; - }; - - compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { - auto function_state = static_cast(state); - try { - function_state->backend_manager.Compute(context); - } catch (const std::exception& ex) { - return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); + // The block below is executed during EP context model inference + auto& metadata = shared_context_->shared_weights.metadata; // Metadata object in memory + if (session_context_.so_share_ep_contexts && + is_epctx_model && + metadata.empty()) { + fs::path context_model_file_path = session_context_.so_context_file_path; + if (context_model_file_path.empty()) { + // If ep.context_file_path is not set the input model path is used + context_model_file_path = session_context_.onnx_model_path_name; } - return Status::OK(); + + // Metadata is always read from model location, this could be a source or epctx model + fs::path metadata_filename = context_model_file_path.stem().string() + "_metadata.bin"; + fs::path metadata_file_path = context_model_file_path.parent_path() / metadata_filename; + std::ifstream file(metadata_file_path, std::ios::binary); + ORT_RETURN_IF_NOT(file, "Metadata file was not found: " + metadata_file_path.string()); + shared_context_->shared_weights.metadata_filepath = std::move(metadata_file_path); + file >> metadata; + } + + struct OpenVINOEPFunctionState { + AllocateFunc allocate_func = nullptr; + DestroyFunc destroy_func = nullptr; + AllocatorHandle allocator_handle = nullptr; + BackendManager& backend_manager; }; - compute_info.release_state_func = - [](FunctionState state) { - if (state) { - OpenVINOEPFunctionState* function_state = static_cast(state); - delete function_state; - } - }; + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { + const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; + const Node& fused_node = fused_node_graph.fused_node; + + NodeComputeInfo compute_info; + + // During backend creation, we check if user wants to use precompiled blob onnx model or the original model + // For precompiled blob, directly load the model instead of compiling the model + // For original model, check if the user wants to export a model with pre-compiled blob + + auto& backend_manager = backend_managers_.emplace_back(session_context_, + *shared_context_, + fused_node, + graph_body_viewer, + logger, + ep_ctx_handle_); + + compute_info.create_state_func = + [&backend_manager](ComputeContext* context, FunctionState* state) { + OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState{ + .allocate_func = context->allocate_func, + .destroy_func = context->release_func, + .allocator_handle = context->allocator_handle, + .backend_manager = backend_manager}; + *state = static_cast(p); + return 0; + }; + + compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { + auto function_state = static_cast(state); + try { + function_state->backend_manager.Compute(context); + } catch (const std::exception& ex) { + return 
common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); + } + return Status::OK(); + }; - node_compute_funcs.push_back(std::move(compute_info)); + compute_info.release_state_func = + [](FunctionState state) { + if (state) { + OpenVINOEPFunctionState* function_state = static_cast(state); + delete function_state; + } + }; - if (!status.IsOK()) { - break; + node_compute_funcs.push_back(std::move(compute_info)); + + if (!status.IsOK()) { + break; + } } - } - // The block below is executed during EP context model generation - if (session_context_.so_context_enable && - session_context_.so_share_ep_contexts && - !metadata.empty()) { - // For models after the first the metadata name comes from the shared context - fs::path metadata_file_path = shared_context_->shared_weights.metadata_filepath; - if (metadata_file_path.empty()) { - metadata_file_path = session_context_.so_context_file_path; - std::string name_append{"_metadata.bin"}; + // The block below is executed during EP context model generation + if (session_context_.so_context_enable && + session_context_.so_share_ep_contexts && + !metadata.empty()) { + // For models after the first the metadata name comes from the shared context + fs::path metadata_file_path = shared_context_->shared_weights.metadata_filepath; if (metadata_file_path.empty()) { - metadata_file_path = session_context_.onnx_model_path_name; - name_append = "_ctx" + name_append; + metadata_file_path = session_context_.so_context_file_path; + std::string name_append{"_metadata.bin"}; + if (metadata_file_path.empty()) { + metadata_file_path = session_context_.onnx_model_path_name; + name_append = "_ctx" + name_append; + } + auto metadata_filename = metadata_file_path.stem().string() + name_append; + metadata_file_path.replace_filename(metadata_filename); + shared_context_->shared_weights.metadata_filepath = metadata_file_path; } - auto metadata_filename = metadata_file_path.stem().string() + name_append; - metadata_file_path.replace_filename(metadata_filename); - shared_context_->shared_weights.metadata_filepath = metadata_file_path; - } - // Metadata is generated only for shared contexts - // If saving metadata then save it to the provided path or use the original model path - // Multiple calls to Compile() will update the metadata and for the last call - // the resulting file will contain the aggregated content - std::ofstream file{metadata_file_path, std::ios::binary}; - ORT_RETURN_IF_NOT(file, "Metadata file could not be written: ", metadata_file_path); - file << metadata; + // Metadata is generated only for shared contexts + // If saving metadata then save it to the provided path or use the original model path + // Multiple calls to Compile() will update the metadata and for the last call + // the resulting file will contain the aggregated content + std::ofstream file{metadata_file_path, std::ios::binary}; + ORT_RETURN_IF_NOT(file, "Metadata file could not be written: ", metadata_file_path); + file << metadata; + } + } catch (const ovep_exception& ex) { + status = ex; } return status; diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 2d29df8eb4197..2af77992da305 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -11,18 +11,25 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/backends/basic_backend.h" #include "core/providers/openvino/ov_stateful_patch_utils.h" +#include 
"core/providers/openvino/exceptions.h" namespace onnxruntime { namespace openvino_ep { -template -inline auto OvExceptionBoundary(Func&& func, std::format_string&& fmt, Args&&... args) { +template +inline auto OvExceptionBoundary(Func&& func, Args&&... args) { try { return func(); } catch (const ov::Exception& e) { - ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...)) + ": " + std::string(e.what())); + const auto message = log_tag + (args + ...) + ": " + std::string(e.what()); + if constexpr (typed) { + ORT_THROW_EX(ovep_exception, message, ovep_exception::type::import_model); + } else { + ORT_THROW(message); + } } catch (...) { - ORT_THROW(log_tag + std::vformat(fmt.get(), std::make_format_args(args...))); + const auto message = log_tag + (args + ...); + ORT_THROW(message); } } @@ -57,105 +64,105 @@ void printDebugInfo(const ov::CompiledModel& obj) { } #endif -// Function to check if a given OV property is enabled -std::optional queryOVProperty(const std::string& property, const std::string& device_type) { - try { - // Get the property value - auto supported_properties = OVCore::Get()->core.get_property(device_type, ov::supported_properties); - return std::find(supported_properties.begin(), supported_properties.end(), property) != supported_properties.end(); - } catch (const std::exception&) { - return std::nullopt; // Property not found or invalid + // Function to check if a given OV property is enabled + std::optional queryOVProperty(const std::string& property, const std::string& device_type) { + try { + // Get the property value + auto supported_properties = OVCore::Get()->core.get_property(device_type, ov::supported_properties); + return std::find(supported_properties.begin(), supported_properties.end(), property) != supported_properties.end(); + } catch (const std::exception&) { + return std::nullopt; // Property not found or invalid + } } -} -std::shared_ptr OVCore::ReadModel(std::string&& model, const std::string& model_path) { - return OvExceptionBoundary([&]() { - std::istringstream modelStringStream(std::move(model)); - std::istream& modelStream = modelStringStream; - // Try to load with FrontEndManager - ov::frontend::FrontEndManager manager; - ov::frontend::FrontEnd::Ptr FE; - ov::frontend::InputModel::Ptr inputModel; - - ov::AnyVector params{&modelStream, model_path}; - - FE = manager.load_by_model(params); - if (FE) { - inputModel = FE->load(params); - return FE->convert(inputModel); - } else { - ORT_THROW(log_tag + "Unknown exception while Reading network"); - } - }, - "Exception while Reading network"); -} + std::shared_ptr OVCore::ReadModel(std::string && model, const std::string& model_path) { + return OvExceptionBoundary([&]() { + std::istringstream modelStringStream(std::move(model)); + std::istream& modelStream = modelStringStream; + // Try to load with FrontEndManager + ov::frontend::FrontEndManager manager; + ov::frontend::FrontEnd::Ptr FE; + ov::frontend::InputModel::Ptr inputModel; + + ov::AnyVector params{&modelStream, model_path}; + + FE = manager.load_by_model(params); + if (FE) { + inputModel = FE->load(params); + return FE->convert(inputModel); + } else { + ORT_THROW(log_tag + "Unknown exception while Reading network"); + } + }, + "Exception while Reading network"); + } -OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model, - std::string& hw_target, - const ov::AnyMap& device_config) { - ov::CompiledModel compiled_model; - ov::AnyMap config = device_config; + OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr & 
model, + std::string & hw_target, + const ov::AnyMap& device_config) { + ov::CompiledModel compiled_model; + ov::AnyMap config = device_config; - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateless OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); - } + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateless OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } - bool model_status = IsStateful(model); - LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); - if (!model_status) { - LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; - PatchStatefulDecoder(model); - } + bool model_status = IsStateful(model); + LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); + if (!model_status) { + LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; + PatchStatefulDecoder(model); + } - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Stateful OV Model Statistic:" << std::endl; - LogBasicModelInfo(model); - } + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateful OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } - auto kv_pos = GetKVAxesPos(model); + auto kv_pos = GetKVAxesPos(model); - if (hw_target.find("NPU") != std::string::npos) { - KVDesc kv_desc; - auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { - return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; - }; + if (hw_target.find("NPU") != std::string::npos) { + KVDesc kv_desc; + auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { + return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? 
config.at(key).as() : default_value; + }; - kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); - kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); + kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); + kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); - // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 - if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { - ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); - } + // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 + if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { + ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + } - if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; - std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; - std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; - std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; + std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; + std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; + std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + } + + UpdateNPUConfig(config, kv_pos, kv_desc); + } else { + // This patches the OV IR model so that it only produces the logits required for sampling. + // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, + // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). + ApplySliceBeforeMatmulTransformation(model); } - UpdateNPUConfig(config, kv_pos, kv_desc); - } else { - // This patches the OV IR model so that it only produces the logits required for sampling. - // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, - // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
- ApplySliceBeforeMatmulTransformation(model); + LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; + compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); + OVExeNetwork exe(compiled_model, hw_target, true); + return exe; } - LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; - compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); - OVExeNetwork exe(compiled_model, hw_target, true); - return exe; -} - OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_network, std::string& hw_target, ov::AnyMap& device_config, bool enable_causallm, const std::string& name) { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { OVExeNetwork exe; if (enable_causallm) { auto mutable_model = ie_cnn_network->clone(); @@ -166,148 +173,148 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo } #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; + OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; - obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG - printDebugInfo(obj); + printDebugInfo(obj); #endif - OVExeNetwork exe(obj, hw_target); - return exe; - }, - "Exception while Loading Network for graph {}", name); -} + OVExeNetwork exe(obj, hw_target); + return exe; + }, + "Exception while Loading Network for graph {}", name); + } -OVExeNetwork OVCore::ImportModel(std::istream& model_stream, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name) { - return OvExceptionBoundary([&]() { - ov::CompiledModel obj; - obj = core.import_model(model_stream, hw_target, device_config); - OVExeNetwork exe(obj, hw_target); + OVExeNetwork OVCore::ImportModel(std::istream & model_stream, + std::string hw_target, + const ov::AnyMap& device_config, + std::string name) { + return OvExceptionBoundary([&]() { + ov::CompiledModel obj; + obj = core.import_model(model_stream, hw_target, device_config); + OVExeNetwork exe(obj, hw_target); #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network for graph {}", name); -} - -OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream& model_stream, - std::string& hw_target, - const ov::AnyMap& device_config, - bool enable_causallm, - std::filesystem::path model_file_path) { - return OvExceptionBoundary([&]() { - OVExeNetwork exe; - - bool isXML = backend_utils::IsModelStreamXML(model_stream); + return exe; + }, + "Exception while Loading Network for graph {}", name); + } - // Helper function to check if file exists and is readable - const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { - try { - if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { - ORT_THROW(log_tag + "Required file missing or empty: " + 
path.string()); - } - std::ifstream file(path); - if (!file) { - ORT_THROW(log_tag + "Required file not readable: " + path.string()); + OVExeNetwork OVCore::ImportEPCtxOVIREncapsulation(std::istream & model_stream, + std::string & hw_target, + const ov::AnyMap& device_config, + bool enable_causallm, + std::filesystem::path model_file_path) { + return OvExceptionBoundary([&]() { + OVExeNetwork exe; + + bool isXML = backend_utils::IsModelStreamXML(model_stream); + + // Helper function to check if file exists and is readable + const auto check_file_access = [&model_file_path](const std::filesystem::path& path) { + try { + if (!std::filesystem::exists(path) || std::filesystem::is_empty(path)) { + ORT_THROW(log_tag + "Required file missing or empty: " + path.string()); + } + std::ifstream file(path); + if (!file) { + ORT_THROW(log_tag + "Required file not readable: " + path.string()); + } + } catch (const std::exception& e) { + ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); } - } catch (const std::exception& e) { - ORT_THROW(log_tag + "Exception while checking file access for: " + path.string() + " - " + e.what()); - } - }; + }; - if (isXML) { - // If the model is XML, we need to load it with the XML content in read_model() - // where weights from bin file is directly consumed - auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); + if (isXML) { + // If the model is XML, we need to load it with the XML content in read_model() + // where weights from bin file is directly consumed + auto xml_file_path = model_file_path.parent_path() / (model_file_path.stem().string() + ".xml"); - check_file_access(xml_file_path); + check_file_access(xml_file_path); - LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); + LOGS_DEFAULT(INFO) << log_tag << "Reading OVIR from XML file path: " << xml_file_path.string(); - // Load the model explicitly with XML contents - std::shared_ptr model = core.read_model(xml_file_path.string()); + // Load the model explicitly with XML contents + std::shared_ptr model = core.read_model(xml_file_path.string()); - if (enable_causallm) { - exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); - } else { - auto obj = core.compile_model(model, hw_target, device_config); - exe = OVExeNetwork(obj, hw_target); + if (enable_causallm) { + exe = OVCore::Get()->StatefulCompileModel(model, hw_target, device_config); + } else { + auto obj = core.compile_model(model, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } } - } #ifndef NDEBUG - printDebugInfo(exe.Get()); + printDebugInfo(exe.Get()); #endif - return exe; - }, - "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); -} + return exe; + }, + "Exception while Loading Network from OVIR model file: {}", model_file_path.string()); + } void OVCore::SetCache(const std::string& cache_dir_path) { core.set_property(ov::cache_dir(cache_dir_path)); } -std::vector OVCore::GetAvailableDevices() const { - std::vector available_devices = core.get_available_devices(); - return available_devices; -} - -std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { - std::vector available_devices; - std::vector devicesIDs; - // Uses logic from OpenVINO to only return available devices of the specified type (e.g. 
CPU, NPU or GPU) - try { - devicesIDs = core.get_property(device_type, ov::available_devices); - } catch (const ov::Exception&) { - // plugin is not created by e.g. invalid env - // Empty device list will be returned - } catch (const std::exception& ex) { - ORT_THROW(log_tag + "An exception occurred while trying to create the ", - device_type, - " device: ", - ex.what()); - } catch (...) { - ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", - device_type, - " device"); + std::vector OVCore::GetAvailableDevices() const { + std::vector available_devices = core.get_available_devices(); + return available_devices; } - if (devicesIDs.size() > 1 || - (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { - for (const auto& deviceID : devicesIDs) { - available_devices.push_back(device_type + '.' + deviceID); + std::vector OVCore::GetAvailableDevices(const std::string& device_type) const { + std::vector available_devices; + std::vector devicesIDs; + // Uses logic from OpenVINO to only return available devices of the specified type (e.g. CPU, NPU or GPU) + try { + devicesIDs = core.get_property(device_type, ov::available_devices); + } catch (const ov::Exception&) { + // plugin is not created by e.g. invalid env + // Empty device list will be returned + } catch (const std::exception& ex) { + ORT_THROW(log_tag + "An exception occurred while trying to create the ", + device_type, + " device: ", + ex.what()); + } catch (...) { + ORT_THROW(log_tag + "Unknown exception occurred while trying to create the ", + device_type, + " device"); } - } - if (!devicesIDs.empty()) { - available_devices.push_back(device_type); - } - return available_devices; -} + if (devicesIDs.size() > 1 || + (devicesIDs.size() == 1 && devicesIDs[0] == "0")) { + for (const auto& deviceID : devicesIDs) { + available_devices.push_back(device_type + '.' 
+ deviceID); + } + } + if (!devicesIDs.empty()) { + available_devices.push_back(device_type); + } -void OVCore::SetStreams(const std::string& device_type, int num_streams) { - core.set_property(device_type, {ov::num_streams(num_streams)}); -} + return available_devices; + } + + void OVCore::SetStreams(const std::string& device_type, int num_streams) { + core.set_property(device_type, {ov::num_streams(num_streams)}); + } std::shared_ptr OVExeNetwork::CreateInferRequest() { - return OvExceptionBoundary([&]() { + return OvExceptionBoundary([&]() { auto infReq = compiled_model_obj.create_infer_request(); std::shared_ptr ovInfReq; if (is_stateful_causallm) { @@ -318,191 +325,191 @@ std::shared_ptr OVExeNetwork::CreateInferRequest() { return ovInfReq; }, - "Exception while creating InferRequest object"); -} + "Exception while creating InferRequest object"); + } -OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { - return OvExceptionBoundary([&]() { - auto tobj = ovInfReq.get_tensor(input_name); - OVTensorPtr blob = std::make_shared(tobj); - return blob; - }, - " Cannot access IE Blob for input: {}", input_name); -} + OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { + return OvExceptionBoundary([&]() { + auto tobj = ovInfReq.get_tensor(input_name); + OVTensorPtr blob = std::make_shared(tobj); + return blob; + }, + " Cannot access IE Blob for input: {}", input_name); + } std::string OVInferRequest::GetInputTensorName(uint32_t index) { - return OvExceptionBoundary([&]() -> const std::string& { + return OvExceptionBoundary([&]() { const auto& model = ovInfReq.get_compiled_model(); return *model.input(index).get_names().begin(); }, " Cannot access IE Blob for input number: {}", index); } -void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { - OvExceptionBoundary([&]() { - ovInfReq.set_tensor(name, *(blob.get())); - }, - " Cannot set Remote Blob for output: {}", name); -} + void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { + OvExceptionBoundary([&]() { + ovInfReq.set_tensor(name, *(blob.get())); + }, + " Cannot set Remote Blob for output: {}", name); + } -uint32_t OVInferRequest::GetNumInputs() { - return static_cast(ovInfReq.get_compiled_model().inputs().size()); -} + uint32_t OVInferRequest::GetNumInputs() { + return static_cast(ovInfReq.get_compiled_model().inputs().size()); + } -void OVInferRequest::Infer() { - OvExceptionBoundary([&]() { - ovInfReq.infer(); - }, - "In Error Couldn't start Inference"); -} + void OVInferRequest::Infer() { + OvExceptionBoundary([&]() { + ovInfReq.infer(); + }, + "In Error Couldn't start Inference"); + } -StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) - : OVInferRequest(std::move(infer_request)), target_device(device) { - bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); - if (gpu_or_npu) { - prefill_use_full_chat_history = true; + StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) + : OVInferRequest(std::move(infer_request)), target_device(device) { + bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); + if (gpu_or_npu) { + prefill_use_full_chat_history = true; + } } -} -void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, - const std::vector& shape, int32_t fill_value) { - ov::Tensor tensor = ov::Tensor(type, 
shape); - std::fill_n(tensor.data(), tensor.get_size(), fill_value); - ovInfReq.set_tensor(tensor_name, tensor); -} + void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, + const std::vector& shape, int32_t fill_value) { + ov::Tensor tensor = ov::Tensor(type, shape); + std::fill_n(tensor.data(), tensor.get_size(), fill_value); + ovInfReq.set_tensor(tensor_name, tensor); + } -void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto* pData = tensor.data(); - for (size_t i = 0; i < tensor.get_size(); i++) { - cache.emplace_back(pData[i]); + void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto* pData = tensor.data(); + for (size_t i = 0; i < tensor.get_size(); i++) { + cache.emplace_back(pData[i]); + } } -} -void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, - const std::vector& cache_data) { - auto tensor = ovInfReq.get_tensor(tensor_name); - auto new_shape = tensor.get_shape(); - new_shape[1] = cache_data.size(); + void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, + const std::vector& cache_data) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto new_shape = tensor.get_shape(); + new_shape[1] = cache_data.size(); - auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); - auto* pNewData = new_tensor.data(); - std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); + auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); + auto* pNewData = new_tensor.data(); + std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); - ovInfReq.set_tensor(tensor_name, new_tensor); -} + ovInfReq.set_tensor(tensor_name, new_tensor); + } -std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { - // Check if tensor exists by examining input names in the compiled model - const auto& model = ovInfReq.get_compiled_model(); - bool tensor_exists = false; + std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { + // Check if tensor exists by examining input names in the compiled model + const auto& model = ovInfReq.get_compiled_model(); + bool tensor_exists = false; - for (const auto& input : model.inputs()) { - const auto& names = input.get_names(); - if (names.find(tensor_name) != names.end()) { - tensor_exists = true; - break; + for (const auto& input : model.inputs()) { + const auto& names = input.get_names(); + if (names.find(tensor_name) != names.end()) { + tensor_exists = true; + break; + } } - } - if (tensor_exists) { - return ovInfReq.get_tensor(tensor_name); - } + if (tensor_exists) { + return ovInfReq.get_tensor(tensor_name); + } - return std::nullopt; -} + return std::nullopt; + } -void StatefulOVInferRequest::PreProcessInferRequest() { - // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. - // TODO(ankit): Address this issue and implement the fix at the appropriate layer. - FillTensor("beam_idx", ov::element::i32, {1}, 0); + void StatefulOVInferRequest::PreProcessInferRequest() { + // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. + // TODO(ankit): Address this issue and implement the fix at the appropriate layer. 
+ FillTensor("beam_idx", ov::element::i32, {1}, 0); - // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. - if (prefill_use_full_chat_history) { - auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); - CacheTensor("input_ids", cached_input_ids); + // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. + if (prefill_use_full_chat_history) { + auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); + CacheTensor("input_ids", cached_input_ids); - // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists - auto position_ids_opt = FindTensor("position_ids"); - bool has_position_ids = position_ids_opt.has_value(); + // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists + auto position_ids_opt = FindTensor("position_ids"); + bool has_position_ids = position_ids_opt.has_value(); - if (has_position_ids) { - CacheTensor("position_ids", cached_position_ids); - } + if (has_position_ids) { + CacheTensor("position_ids", cached_position_ids); + } - // If we're about to run the prefill model - if (input_ids_tensor.get_size() > 1) { - // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". - // This indicates that we are running a subsequent prompt (not the initial prefill). - if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + // If we're about to run the prefill model + if (input_ids_tensor.get_size() > 1) { + // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". + // This indicates that we are running a subsequent prompt (not the initial prefill). + if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. + ovInfReq.reset_state(); - // Set tensors using cached values - SetTensorFromCache("input_ids", cached_input_ids); + // Set tensors using cached values + SetTensorFromCache("input_ids", cached_input_ids); - // Only set position_ids if it exists and we have cached values - if (has_position_ids && !cached_position_ids.empty()) { - SetTensorFromCache("position_ids", cached_position_ids); + // Only set position_ids if it exists and we have cached values + if (has_position_ids && !cached_position_ids.empty()) { + SetTensorFromCache("position_ids", cached_position_ids); + } } } } } -} -void StatefulOVInferRequest::Infer() { - PreProcessInferRequest(); - OVInferRequest::Infer(); -} + void StatefulOVInferRequest::Infer() { + PreProcessInferRequest(); + OVInferRequest::Infer(); + } -void StatefulOVInferRequest::RewindKVCache(size_t index) { - LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; + void StatefulOVInferRequest::RewindKVCache(size_t index) { + LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; - if (prefill_use_full_chat_history) { - // Clear the internal KVCache state. For NPU device, this operation is a no-op. - ovInfReq.reset_state(); + if (prefill_use_full_chat_history) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. + ovInfReq.reset_state(); - // Resize the cached "input_ids" and "position_ids" to the specified index. 
- if (cached_input_ids.size() > index) { - cached_input_ids.resize(index); - } + // Resize the cached "input_ids" and "position_ids" to the specified index. + if (cached_input_ids.size() > index) { + cached_input_ids.resize(index); + } - if (cached_position_ids.size() > index) { - cached_position_ids.resize(index); - } - } else { - if (index == 0) { - // In this case, since we're resetting the entire KVCache, simply reset the state. - ovInfReq.reset_state(); + if (cached_position_ids.size() > index) { + cached_position_ids.resize(index); + } } else { - // Retrieve KVCache states and trim them to the specified index. - // The following logic is adapted from: - // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 - auto states = ovInfReq.query_state(); - for (auto& state : states) { - ov::Tensor old_tensor = state.get_state(); - // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] - auto shape = old_tensor.get_shape(); - - if (shape[2] > index) { - // Update the sequence length dimension to the specified index. - shape[2] = index; - - ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{shape}; - - // Create a trimmed tensor with the updated shape. - auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); - - // Copy the trimmed tensor into a new tensor and update the state. - ov::Tensor new_tensor(old_tensor.get_element_type(), shape); - trimmed_tensor.copy_to(new_tensor); - - state.set_state(new_tensor); + if (index == 0) { + // In this case, since we're resetting the entire KVCache, simply reset the state. + ovInfReq.reset_state(); + } else { + // Retrieve KVCache states and trim them to the specified index. + // The following logic is adapted from: + // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 + auto states = ovInfReq.query_state(); + for (auto& state : states) { + ov::Tensor old_tensor = state.get_state(); + // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] + auto shape = old_tensor.get_shape(); + + if (shape[2] > index) { + // Update the sequence length dimension to the specified index. + shape[2] = index; + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + // Create a trimmed tensor with the updated shape. + auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); + + // Copy the trimmed tensor into a new tensor and update the state. + ov::Tensor new_tensor(old_tensor.get_element_type(), shape); + trimmed_tensor.copy_to(new_tensor); + + state.set_state(new_tensor); + } } } } } -} } // namespace openvino_ep } // namespace onnxruntime
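
Reviewer note (not part of the patch): the sketch below is a standalone illustration of the Level Zero error-string parsing that the new ovep_exception performs. The sample message is hypothetical and only stands in for whatever the intel_npu driver actually reports; the two regexes and the std::from_chars call are the ones added in exceptions.h above.

#include <charconv>
#include <iostream>
#include <regex>
#include <string>

// Mirrors ovep_exception::ze_result_code_from_string: extract the hex value after "code 0x".
static uint32_t ze_result_code_from_string(const std::string& s) {
  uint32_t error_code{0};
  std::regex error_code_pattern("code 0x([0-9a-fA-F]+)");
  std::smatch matches;
  if (std::regex_search(s, matches, error_code_pattern)) {
    std::from_chars(&(*matches[1].first), &(*matches[1].second), error_code, 16);
  }
  return error_code;
}

// Mirrors ovep_exception::ze_result_name_from_string: pick out the ZE_* result name, if present.
static std::string ze_result_name_from_string(const std::string& s) {
  std::string error_message = "UNKNOWN NPU ERROR";
  std::regex error_message_pattern(R"(\bZE_\w*\b)");
  std::smatch matches;
  if (std::regex_search(s, matches, error_message_pattern)) {
    error_message = matches[0];
  }
  return error_message;
}

int main() {
  // Hypothetical driver message; real messages may be worded differently.
  const std::string sample =
      "Exception from compile: ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, code 0x7800000f reported by driver.";
  std::cout << ze_result_name_from_string(sample) << "\n";              // ZE_RESULT_ERROR_INVALID_NATIVE_BINARY
  std::cout << std::hex << ze_result_code_from_string(sample) << "\n";  // 7800000f
  return 0;
}

With type_ == type::import_model and the 0x7800000f code, the conversion operator in exceptions.h maps the failure to common::INVALID_GRAPH and appends "Model needs to be recompiled", instead of surfacing a generic FAIL status.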