95 changes: 61 additions & 34 deletions src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
@@ -157,15 +157,48 @@ void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx,
}

ov::SoPtr<ov::ITensor> ov::npuw::IBaseInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
// assert(persistent)
if (m_port_to_tensor.find(port) == m_port_to_tensor.end()) {
// I/O: allocate here on demand (to reduce memory consumption in case some I/O were shared)
// Input
for (std::size_t i = 0; i < m_npuw_model->inputs().size(); ++i) {
if (m_npuw_model->inputs()[i] == port) {
ov::SoPtr<ov::ITensor> allocated = allocOut(port, global_input_mem_device(i));
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{allocated, true, true};
return m_port_to_tensor.at(port).tensor;
}
}

// Output
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
if (m_npuw_model->outputs()[i] == port) {
auto tensor = alloc_global_out(i);
m_port_to_tensor[port] = TensorStorage{tensor, true, true};
return m_port_to_tensor.at(port).tensor;
}
}
}

// Not I/O or I/O set by the user - return as is
NPUW_ASSERT((!m_port_to_tensor.at(port).persistent || m_port_to_tensor.at(port).set_from_outside ||
m_port_to_tensor.at(port).allocated_on_device) &&
"Internal error!");
return m_port_to_tensor.at(port).tensor;
}

void ov::npuw::IBaseInferRequest::set_tensor(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor) {
// Assigning via .at() to ensure it is a known port
// assert(persistent)
m_port_to_tensor.at(port).tensor = tensor;
if (!is_stored(port)) {
// TODO: might be useful to check if the tensor is allocated on the device
m_port_to_tensor[port] = TensorStorage{tensor, false, false, true};
} else {
m_port_to_tensor.at(port).tensor = tensor;
m_port_to_tensor.at(port).set_from_outside = true;
}

if (is_io(port)) {
m_port_to_tensor.at(port).persistent = true;
}

// Check if setting input tensor
if (m_port_to_tensor.at(port).persistent) {
@@ -189,6 +222,24 @@ void ov::npuw::IBaseInferRequest::check_tensors() const {
return;
}

bool ov::npuw::IBaseInferRequest::is_stored(const ov::Output<const ov::Node>& port) const {
return m_port_to_tensor.find(port) != m_port_to_tensor.end();
}

bool ov::npuw::IBaseInferRequest::is_io(const ov::Output<const ov::Node>& port) const {
for (std::size_t i = 0; i < m_npuw_model->inputs().size(); ++i) {
if (m_npuw_model->inputs()[i] == port) {
return true;
}
}
for (std::size_t i = 0; i < m_npuw_model->outputs().size(); ++i) {
if (m_npuw_model->outputs()[i] == port) {
return true;
}
}
return false;
}

void ov::npuw::IBaseInferRequest::handle_set_remote_input(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor) {
for (std::size_t i = 0; i < m_npuw_model->inputs().size(); ++i) {
@@ -291,14 +342,14 @@ std::size_t ov::npuw::IBaseInferRequest::total_subrequests() const {

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocMem(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
const std::string& device) const {
Contributor: alloc is usually not const; why is this needed?

Contributor: Maybe just make m_footprint mutable, since it does not really change the state of the infer request?

Contributor (Author): It's needed for get_tensor() const.

A condensed sketch of this const/mutable pattern follows the allocMem()/allocOut() diff just below.

auto ptr = ov::npuw::util::allocMem(type, shape, device, m_npuw_model->get_plugin());
m_footprint[device] += ptr->get_byte_size();
return ptr;
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocOut(const ov::Output<const ov::Node>& node,
const std::string& device) {
const std::string& device) const {
return allocMem(node.get_element_type(), node.get_shape(), device);
}
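The const qualifiers discussed in the thread above exist so that get_tensor() const can allocate lazily. Below is a minimal sketch of that pattern with hypothetical, simplified types (not the plugin's actual classes): a logically-const getter fills caches that are declared mutable, mirroring how m_port_to_tensor, m_input_allocated, and m_footprint are marked mutable in the header.

#include <cstddef>
#include <map>
#include <memory>
#include <string>

// Hypothetical, simplified stand-ins for the real tensor/port types.
struct Tensor {
    std::size_t bytes = 0;
};
using TensorPtr = std::shared_ptr<Tensor>;

class Request {
public:
    // Logically const: the caller only "gets" a tensor, but the first call
    // may have to allocate it on demand.
    TensorPtr get_tensor(int port) const {
        auto it = m_port_to_tensor.find(port);
        if (it != m_port_to_tensor.end()) {
            return it->second;
        }
        TensorPtr t = alloc(1024u, "NPU");  // lazy allocation on first access
        m_port_to_tensor[port] = t;         // possible only because the map is mutable
        return t;
    }

private:
    TensorPtr alloc(std::size_t bytes, const std::string& device) const {
        auto t = std::make_shared<Tensor>(Tensor{bytes});
        m_footprint[device] += bytes;       // bookkeeping only, hence mutable as well
        return t;
    }

    // mutable: updated from const get_tensor(), not externally observable state.
    mutable std::map<int, TensorPtr> m_port_to_tensor;
    mutable std::map<std::string, std::size_t> m_footprint;
};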

@@ -327,31 +378,7 @@ std::string ov::npuw::IBaseInferRequest::global_output_mem_device(std::size_t idx
return *proto_comp_model_desc.device_it;
}

void ov::npuw::IBaseInferRequest::alloc_io() {
// Preallocate input tensors
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
ov::SoPtr<ov::ITensor> allocated = allocOut(port, global_input_mem_device(i));
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{allocated, true};
} // for(inputs)

// Preallocate output tensors
LOG_INFO("Preallocating output tensors...");
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
LOG_BLOCK();
const auto& port = m_npuw_model->outputs()[i];
LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port);

// FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);

auto tensor = alloc_global_out(i);
m_port_to_tensor[port] = TensorStorage{tensor, true};
}

void ov::npuw::IBaseInferRequest::alloc_quant_gather() {
// Try to allocate intermediate tensors to gather into, when host quant gather is enabled
for (size_t i = 0; i < m_num_submodels; i++) {
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
Expand All @@ -362,7 +389,7 @@ void ov::npuw::IBaseInferRequest::alloc_io() {
}
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) {
ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) const {
const auto& port = m_npuw_model->outputs().at(out_idx);
return allocOut(port, global_output_mem_device(out_idx));
}
@@ -524,7 +551,7 @@ void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr requ
LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl);

const auto& g_port = m_npuw_model->inputs()[param_idx];
const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor;
const auto& g_tnsr = is_stored(g_port) ? m_port_to_tensor.at(g_port).tensor : get_tensor(g_port);
const auto& s_port = request->get_inputs()[sub_in_idx];
LOG_DEBUG("Processing " << g_port << " -> " << s_port << "...");
LOG_BLOCK();
@@ -743,7 +770,7 @@ void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr req
std::tie(result_idx, sub_out_idx) = it;
const auto& g_port = m_npuw_model->outputs()[result_idx];
const auto& s_port = request->get_outputs()[sub_out_idx];
request->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor);
request->set_tensor(s_port, is_stored(g_port) ? m_port_to_tensor.at(g_port).tensor : get_tensor(g_port));
}

LOG_DEBUG("Done");
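Summarizing the base-class changes above: global I/O tensors are no longer preallocated in alloc_io(); get_tensor() now allocates them on first use, set_tensor() records user-provided tensors, and the new TensorStorage flags remember how each tensor appeared. A condensed sketch of that flow, using hypothetical simplified types rather than the plugin's actual API (and assuming every port is I/O):

#include <map>
#include <memory>

// Hypothetical, simplified types: not the plugin's actual classes.
struct Tensor {};
using TensorPtr = std::shared_ptr<Tensor>;

struct TensorStorage {
    TensorPtr tensor;
    bool persistent = false;           // parent I/O tensor
    bool allocated_on_device = false;  // allocated internally (lazily)
    bool set_from_outside = false;     // provided by the user via set_tensor()
};

class Request {
public:
    // Lazy path: an I/O port nobody set gets its tensor allocated on the
    // first get_tensor() call instead of up front in alloc_io().
    TensorPtr get_tensor(int port) const {
        if (!is_stored(port)) {
            m_port_to_tensor[port] = TensorStorage{std::make_shared<Tensor>(), true, true, false};
        }
        return m_port_to_tensor.at(port).tensor;
    }

    // User path: remember the external tensor and mark it so it is not
    // reallocated (or copied needlessly) later.
    void set_tensor(int port, const TensorPtr& tensor) {
        if (!is_stored(port)) {
            m_port_to_tensor[port] = TensorStorage{tensor, true, false, true};
        } else {
            m_port_to_tensor.at(port).tensor = tensor;
            m_port_to_tensor.at(port).set_from_outside = true;
        }
    }

    bool is_stored(int port) const {
        return m_port_to_tensor.find(port) != m_port_to_tensor.end();
    }

private:
    // mutable because get_tensor() is const but may allocate.
    mutable std::map<int, TensorStorage> m_port_to_tensor;
};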
30 changes: 19 additions & 11 deletions src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp
@@ -93,13 +93,21 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

struct TensorStorage {
ov::SoPtr<ov::ITensor> tensor;
bool persistent = false; // true for the parent I/O tensors
std::size_t num_readers = 0u; // fixed during execution
std::size_t num_reads = 0u; // changes during execution (ref-counter-like).
// reset to 0 before every new execution
bool persistent = false; // true for the parent I/O tensors
bool allocated_on_device = false; // mark for internally allocated I/O
bool set_from_outside = false; // outside I/O tensors shouldn't be reallocated
std::size_t num_readers = 0u; // fixed during execution
std::size_t num_reads = 0u; // changes during execution (ref-counter-like).
// reset to 0 before every new execution
};
// FROM(Every subrequests' output port) TO(Its output tensor)
std::map<ov::Output<const ov::Node>, TensorStorage> m_port_to_tensor;
mutable std::map<ov::Output<const ov::Node>, TensorStorage>
m_port_to_tensor; // mutable due to lazy I/O allocation in get_tensor()

// Check that m_port_to_tensor does have a tensor stored at the port
bool is_stored(const ov::Output<const ov::Node>& port) const;
// Check the port is I/O
bool is_io(const ov::Output<const ov::Node>& port) const;

struct QuantGatherTensors {
ov::Tensor w, z, s;
@@ -147,15 +155,15 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
std::vector<GlobalIO> m_subrequests_gio;

// Tracks tensors we allocated on our own - to recognize and avoid copies
std::unordered_set<void*> m_input_allocated;
mutable std::unordered_set<void*> m_input_allocated; // mutable due to lazy I/O allocation in get_tensor()

// Common functionality - shared for subclasses
const std::size_t m_num_submodels;

TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device);
virtual void alloc_io();
virtual TensorPtr alloc_global_out(std::size_t out_idx);
TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device) const;
TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device) const;
virtual void alloc_quant_gather();
virtual TensorPtr alloc_global_out(std::size_t out_idx) const;

std::string global_input_mem_device(std::size_t idx) const;
std::string global_output_mem_device(std::size_t idx) const;
@@ -178,7 +186,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

MS m_ms_unpack;
ov::npuw::perf::Profile<MS> m_profile;
ov::npuw::perf::Profile<B> m_footprint;
mutable ov::npuw::perf::Profile<B> m_footprint; // mutable due to lazy I/O allocation in get_tensor()

std::string profile_tag(std::size_t idx) const;

21 changes: 10 additions & 11 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -238,14 +238,16 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// Note: these buffers are allocated to the entire NWAY (> tail_size)
for (auto&& p : proto_comp_model_desc.spatial->params) {
const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
m_spatial_io[real_idx].input_tails[p.idx] =
allocOut(iport, m_npuw_model->funcall_mem_device(real_idx));
m_spatial_io[real_idx].input_tails[p.idx] = allocOut(
iport,
m_npuw_model->funcall_mem_device(real_idx)); // should it be handled lazy way as well?
}
const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
m_spatial_io[real_idx].output_tails[out_idx] =
allocOut(oport, m_npuw_model->funcall_mem_device(real_idx));
m_spatial_io[real_idx].output_tails[out_idx] = allocOut(
oport,
m_npuw_model->funcall_mem_device(real_idx)); // should it be handled lazy way as well?
}
}
} // if(spatial)
@@ -324,7 +326,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
}
} // if(function_pipelining)

alloc_io();
alloc_quant_gather();
connect_subrequests();
init_gio();

@@ -387,11 +389,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com

void ov::npuw::JustInferRequest::set_tensor(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor) {
// Check that it's I/O
NPUW_ASSERT(m_port_to_tensor.at(port).persistent);

// Assigning via .at() to ensure it is a known port
m_port_to_tensor.at(port).tensor = tensor;
NPUW_ASSERT(is_io(port));
m_port_to_tensor[port] = TensorStorage{tensor, true, false, true};

// Check if setting output tensor
for (std::size_t i = 0; i < m_npuw_model->outputs().size(); ++i) {
@@ -414,7 +413,7 @@ void ov::npuw::JustInferRequest::set_tensor(const ov::Output<const ov::Node>& po
handle_set_remote_input(port, tensor);
}
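A note for readers mapping the positional booleans in the TensorStorage assignment above back to the struct: given the field order declared in base_sync_infer_request.hpp (tensor, persistent, allocated_on_device, set_from_outside), the line is equivalent to the comment-annotated form below (an annotation only, not a proposed change):

m_port_to_tensor[port] = TensorStorage{/*tensor=*/tensor,
                                       /*persistent=*/true,
                                       /*allocated_on_device=*/false,
                                       /*set_from_outside=*/true};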

ov::npuw::TensorPtr ov::npuw::JustInferRequest::alloc_global_out(std::size_t out_idx) {
ov::npuw::TensorPtr ov::npuw::JustInferRequest::alloc_global_out(std::size_t out_idx) const {
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(out_idx);
auto funcall_result_iter = m_funcall_result.find(from_submodel);
if (funcall_result_iter != m_funcall_result.end()) {
@@ -83,7 +83,7 @@ class JustInferRequest final : public IBaseInferRequest {
bool supports_async_pipeline() const override;
void update_subrequest_links(std::size_t idx) override;

TensorPtr alloc_global_out(std::size_t out_idx) override;
TensorPtr alloc_global_out(std::size_t out_idx) const override;

void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;

@@ -43,7 +43,7 @@ ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr<ov::npuw:
LOG_INFO("DONE");
} // for(submodels)

alloc_io();
alloc_quant_gather();

LOG_INFO("Connecting subrequests...");
LOG_BLOCK();