95 changes: 61 additions & 34 deletions src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
@@ -157,15 +157,48 @@ void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx,
}

ov::SoPtr<ov::ITensor> ov::npuw::IBaseInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
// assert(persistent)
if (m_port_to_tensor.find(port) == m_port_to_tensor.end()) {
// I/O: allocate here on demand (to reduce memory consumption in case some I/O were shared)
// Input
for (std::size_t i = 0; i < m_npuw_model->inputs().size(); ++i) {
if (m_npuw_model->inputs()[i] == port) {
ov::SoPtr<ov::ITensor> allocated = allocOut(port, global_input_mem_device(i));
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{allocated, true, true};
return m_port_to_tensor.at(port).tensor;
}
}

// Output
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
if (m_npuw_model->outputs()[i] == port) {
auto tensor = alloc_global_out(i);
m_port_to_tensor[port] = TensorStorage{tensor, true, true};
return m_port_to_tensor.at(port).tensor;
}
}
}

// Not I/O or I/O set by the user - return as is
NPUW_ASSERT((!m_port_to_tensor.at(port).persistent || m_port_to_tensor.at(port).set_from_outside ||
m_port_to_tensor.at(port).allocated_on_device) &&
"Internal error!");
return m_port_to_tensor.at(port).tensor;
}

void ov::npuw::IBaseInferRequest::set_tensor(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor) {
// Assigning via .at() to ensure it is a known port
// assert(persistent)
m_port_to_tensor.at(port).tensor = tensor;
if (!is_stored(port)) {
// TODO: might be useful to check if the tensor is allocated on the device
m_port_to_tensor[port] = TensorStorage{tensor, false, false, true};
} else {
m_port_to_tensor.at(port).tensor = tensor;
m_port_to_tensor.at(port).set_from_outside = true;
}

if (is_io(port)) {
m_port_to_tensor.at(port).persistent = true;
}

// Check if setting input tensor
if (m_port_to_tensor.at(port).persistent) {
@@ -189,6 +222,24 @@ void ov::npuw::IBaseInferRequest::check_tensors() const {
return;
}

bool ov::npuw::IBaseInferRequest::is_stored(const ov::Output<const ov::Node>& port) const {
return m_port_to_tensor.find(port) != m_port_to_tensor.end();
}

bool ov::npuw::IBaseInferRequest::is_io(const ov::Output<const ov::Node>& port) const {
for (std::size_t i = 0; i < m_npuw_model->inputs().size(); ++i) {
if (m_npuw_model->inputs()[i] == port) {
return true;
}
}
for (std::size_t i = 0; i < m_npuw_model->outputs().size(); ++i) {
if (m_npuw_model->outputs()[i] == port) {
return true;
}
}
return false;
}

void ov::npuw::IBaseInferRequest::handle_set_remote_input(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor) {
for (std::size_t i = 0; i < m_npuw_model->inputs().size(); ++i) {
@@ -291,14 +342,14 @@ std::size_t ov::npuw::IBaseInferRequest::total_subrequests() const {

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocMem(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
const std::string& device) const {
Contributor: alloc is usually not const; why is this needed?

Contributor: Maybe just make m_footprint mutable, since it does not really change the state of the infer request?

Contributor (Author): It's needed for get_tensor() const.

A condensed sketch of this const/mutable pattern follows the allocMem()/allocOut() diff just below.

auto ptr = ov::npuw::util::allocMem(type, shape, device, m_npuw_model->get_plugin());
m_footprint[device] += ptr->get_byte_size();
return ptr;
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocOut(const ov::Output<const ov::Node>& node,
const std::string& device) {
const std::string& device) const {
return allocMem(node.get_element_type(), node.get_shape(), device);
}
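The const qualifiers discussed in the thread above exist so that get_tensor() const can allocate lazily. Below is a minimal sketch of that pattern with hypothetical, simplified types (not the plugin's actual classes): a logically-const getter fills caches that are declared mutable, mirroring how m_port_to_tensor, m_input_allocated, and m_footprint are marked mutable in the header.

#include <cstddef>
#include <map>
#include <memory>
#include <string>

// Hypothetical, simplified stand-ins for the real tensor/port types.
struct Tensor {
    std::size_t bytes = 0;
};
using TensorPtr = std::shared_ptr<Tensor>;

class Request {
public:
    // Logically const: the caller only "gets" a tensor, but the first call
    // may have to allocate it on demand.
    TensorPtr get_tensor(int port) const {
        auto it = m_port_to_tensor.find(port);
        if (it != m_port_to_tensor.end()) {
            return it->second;
        }
        TensorPtr t = alloc(1024u, "NPU");  // lazy allocation on first access
        m_port_to_tensor[port] = t;         // possible only because the map is mutable
        return t;
    }

private:
    TensorPtr alloc(std::size_t bytes, const std::string& device) const {
        auto t = std::make_shared<Tensor>(Tensor{bytes});
        m_footprint[device] += bytes;       // bookkeeping only, hence mutable as well
        return t;
    }

    // mutable: updated from const get_tensor(), not externally observable state.
    mutable std::map<int, TensorPtr> m_port_to_tensor;
    mutable std::map<std::string, std::size_t> m_footprint;
};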

@@ -327,31 +378,7 @@ std::string ov::npuw::IBaseInferRequest::global_output_mem_device(std::size_t idx
return *proto_comp_model_desc.device_it;
}

void ov::npuw::IBaseInferRequest::alloc_io() {
// Preallocate input tensors
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
ov::SoPtr<ov::ITensor> allocated = allocOut(port, global_input_mem_device(i));
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{allocated, true};
} // for(inputs)

// Preallocate output tensors
LOG_INFO("Preallocating output tensors...");
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
LOG_BLOCK();
const auto& port = m_npuw_model->outputs()[i];
LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port);

// FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);

auto tensor = alloc_global_out(i);
m_port_to_tensor[port] = TensorStorage{tensor, true};
}

void ov::npuw::IBaseInferRequest::alloc_quant_gather() {
// Try to allocate intermediate tensors to gather into, when host quant gather is enabled
for (size_t i = 0; i < m_num_submodels; i++) {
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[i];
Expand All @@ -362,7 +389,7 @@ void ov::npuw::IBaseInferRequest::alloc_io() {
}
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) {
ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) const {
const auto& port = m_npuw_model->outputs().at(out_idx);
return allocOut(port, global_output_mem_device(out_idx));
}
@@ -524,7 +551,7 @@ void ov::npuw::IBaseInferRequest::bind_global_params(std::size_t idx, RqPtr requ
LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl);

const auto& g_port = m_npuw_model->inputs()[param_idx];
const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor;
const auto& g_tnsr = is_stored(g_port) ? m_port_to_tensor.at(g_port).tensor : get_tensor(g_port);
const auto& s_port = request->get_inputs()[sub_in_idx];
LOG_DEBUG("Processing " << g_port << " -> " << s_port << "...");
LOG_BLOCK();
@@ -743,7 +770,7 @@ void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr req
std::tie(result_idx, sub_out_idx) = it;
const auto& g_port = m_npuw_model->outputs()[result_idx];
const auto& s_port = request->get_outputs()[sub_out_idx];
request->set_tensor(s_port, m_port_to_tensor.at(g_port).tensor);
request->set_tensor(s_port, is_stored(g_port) ? m_port_to_tensor.at(g_port).tensor : get_tensor(g_port));
}

LOG_DEBUG("Done");
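Summarizing the base-class changes above: global I/O tensors are no longer preallocated in alloc_io(); get_tensor() now allocates them on first use, set_tensor() records user-provided tensors, and the new TensorStorage flags remember how each tensor appeared. A condensed sketch of that flow, using hypothetical simplified types rather than the plugin's actual API (and assuming every port is I/O):

#include <map>
#include <memory>

// Hypothetical, simplified types: not the plugin's actual classes.
struct Tensor {};
using TensorPtr = std::shared_ptr<Tensor>;

struct TensorStorage {
    TensorPtr tensor;
    bool persistent = false;           // parent I/O tensor
    bool allocated_on_device = false;  // allocated internally (lazily)
    bool set_from_outside = false;     // provided by the user via set_tensor()
};

class Request {
public:
    // Lazy path: an I/O port nobody set gets its tensor allocated on the
    // first get_tensor() call instead of up front in alloc_io().
    TensorPtr get_tensor(int port) const {
        if (!is_stored(port)) {
            m_port_to_tensor[port] = TensorStorage{std::make_shared<Tensor>(), true, true, false};
        }
        return m_port_to_tensor.at(port).tensor;
    }

    // User path: remember the external tensor and mark it so it is not
    // reallocated (or copied needlessly) later.
    void set_tensor(int port, const TensorPtr& tensor) {
        if (!is_stored(port)) {
            m_port_to_tensor[port] = TensorStorage{tensor, true, false, true};
        } else {
            m_port_to_tensor.at(port).tensor = tensor;
            m_port_to_tensor.at(port).set_from_outside = true;
        }
    }

    bool is_stored(int port) const {
        return m_port_to_tensor.find(port) != m_port_to_tensor.end();
    }

private:
    // mutable because get_tensor() is const but may allocate.
    mutable std::map<int, TensorStorage> m_port_to_tensor;
};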
30 changes: 19 additions & 11 deletions src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp
@@ -93,13 +93,21 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

struct TensorStorage {
ov::SoPtr<ov::ITensor> tensor;
bool persistent = false; // true for the parent I/O tensors
std::size_t num_readers = 0u; // fixed during execution
std::size_t num_reads = 0u; // changes during execution (ref-counter-like).
// reset to 0 before every new execution
bool persistent = false; // true for the parent I/O tensors
bool allocated_on_device = false; // mark for internally allocated I/O
bool set_from_outside = false; // outside I/O tensors shouldn't be reallocated
std::size_t num_readers = 0u; // fixed during execution
std::size_t num_reads = 0u; // changes during execution (ref-counter-like).
// reset to 0 before every new execution
};
// FROM(Every subrequests' output port) TO(Its output tensor)
std::map<ov::Output<const ov::Node>, TensorStorage> m_port_to_tensor;
mutable std::map<ov::Output<const ov::Node>, TensorStorage>
m_port_to_tensor; // mutable due to lazy I/O allocation in get_tensor()

// Check that m_port_to_tensor does have a tensor stored at the port
bool is_stored(const ov::Output<const ov::Node>& port) const;
// Check the port is I/O
bool is_io(const ov::Output<const ov::Node>& port) const;

struct QuantGatherTensors {
ov::Tensor w, z, s;
@@ -147,15 +155,15 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
std::vector<GlobalIO> m_subrequests_gio;

// Tracks tensors we allocated on our own - to recognize and avoid copies
std::unordered_set<void*> m_input_allocated;
mutable std::unordered_set<void*> m_input_allocated; // mutable due to lazy I/O allocation in get_tensor()

// Common functionality - shared for subclasses
const std::size_t m_num_submodels;

TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device);
virtual void alloc_io();
virtual TensorPtr alloc_global_out(std::size_t out_idx);
TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device) const;
TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device) const;
virtual void alloc_quant_gather();
virtual TensorPtr alloc_global_out(std::size_t out_idx) const;

std::string global_input_mem_device(std::size_t idx) const;
std::string global_output_mem_device(std::size_t idx) const;
@@ -178,7 +186,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

MS m_ms_unpack;
ov::npuw::perf::Profile<MS> m_profile;
ov::npuw::perf::Profile<B> m_footprint;
mutable ov::npuw::perf::Profile<B> m_footprint; // mutable due to lazy I/O allocation in get_tensor()

std::string profile_tag(std::size_t idx) const;

21 changes: 10 additions & 11 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -238,14 +238,16 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// Note: these buffers are allocated to the entire NWAY (> tail_size)
for (auto&& p : proto_comp_model_desc.spatial->params) {
const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
m_spatial_io[real_idx].input_tails[p.idx] =
allocOut(iport, m_npuw_model->funcall_mem_device(real_idx));
m_spatial_io[real_idx].input_tails[p.idx] = allocOut(
iport,
m_npuw_model->funcall_mem_device(real_idx)); // should it be handled lazy way as well?
}
const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
m_spatial_io[real_idx].output_tails[out_idx] =
allocOut(oport, m_npuw_model->funcall_mem_device(real_idx));
m_spatial_io[real_idx].output_tails[out_idx] = allocOut(
oport,
m_npuw_model->funcall_mem_device(real_idx)); // should it be handled lazy way as well?
}
}
} // if(spatial)
@@ -324,7 +326,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
}
} // if(function_pipelining)

alloc_io();
alloc_quant_gather();
connect_subrequests();
init_gio();

@@ -387,11 +389,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com

void ov::npuw::JustInferRequest::set_tensor(const ov::Output<const ov::Node>& port,
const ov::SoPtr<ov::ITensor>& tensor) {
// Check that it's I/O
NPUW_ASSERT(m_port_to_tensor.at(port).persistent);

// Assigning via .at() to ensure it is a known port
m_port_to_tensor.at(port).tensor = tensor;
NPUW_ASSERT(is_io(port));
m_port_to_tensor[port] = TensorStorage{tensor, true, false, true};

// Check if setting output tensor
for (std::size_t i = 0; i < m_npuw_model->outputs().size(); ++i) {
@@ -414,7 +413,7 @@ void ov::npuw::JustInferRequest::set_tensor(const ov::Output<const ov::Node>& po
handle_set_remote_input(port, tensor);
}
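A note for readers mapping the positional booleans in the TensorStorage assignment above back to the struct: given the field order declared in base_sync_infer_request.hpp (tensor, persistent, allocated_on_device, set_from_outside), the line is equivalent to the comment-annotated form below (an annotation only, not a proposed change):

m_port_to_tensor[port] = TensorStorage{/*tensor=*/tensor,
                                       /*persistent=*/true,
                                       /*allocated_on_device=*/false,
                                       /*set_from_outside=*/true};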

ov::npuw::TensorPtr ov::npuw::JustInferRequest::alloc_global_out(std::size_t out_idx) {
ov::npuw::TensorPtr ov::npuw::JustInferRequest::alloc_global_out(std::size_t out_idx) const {
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(out_idx);
auto funcall_result_iter = m_funcall_result.find(from_submodel);
if (funcall_result_iter != m_funcall_result.end()) {
@@ -83,7 +83,7 @@ class JustInferRequest final : public IBaseInferRequest {
bool supports_async_pipeline() const override;
void update_subrequest_links(std::size_t idx) override;

TensorPtr alloc_global_out(std::size_t out_idx) override;
TensorPtr alloc_global_out(std::size_t out_idx) const override;

void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;

@@ -43,7 +43,7 @@ ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr<ov::npuw:
LOG_INFO("DONE");
} // for(submodels)

alloc_io();
alloc_quant_gather();

LOG_INFO("Connecting subrequests...");
LOG_BLOCK();