Skip to content

Commit 6ceb8e7

Browse files
committed
add: detect Q/DQ with int16/uint16 initializers
1 parent ed9e425 commit 6ceb8e7

File tree

1 file changed

+43
-1
lines changed

1 file changed

+43
-1
lines changed

onnxruntime/core/providers/openvino/backend_manager.cc

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,44 @@ static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
387387
return false;
388388
}
389389

390+
// Returns true when the given NodeArg is a tensor whose element type is
// INT16 or UINT16. Null node_arg or missing type information yields false.
static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
  if (node_arg == nullptr) {
    return false;
  }
  const auto* type_proto = node_arg->TypeAsProto();
  if (type_proto == nullptr || !type_proto->has_tensor_type()) {
    return false;
  }
  const auto elem_type = type_proto->tensor_type().elem_type();
  return elem_type == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
         elem_type == ONNX_NAMESPACE::TensorProto_DataType_INT16;
}
396+
397+
// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
398+
static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
399+
std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
400+
const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
401+
402+
for (size_t i = 0; i < node_indices.size(); i++) {
403+
gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
404+
405+
if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
406+
const auto& input_defs = node->InputDefs();
407+
408+
if (node->OpType() == "DequantizeLinear") {
409+
// DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
410+
// Check quantized input tensor and optional zero point
411+
if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) ||
412+
(input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
413+
return true;
414+
}
415+
} else if (node->OpType() == "QuantizeLinear") {
416+
// QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
417+
const auto& output_defs = node->OutputDefs();
418+
if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) ||
419+
(input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
420+
return true;
421+
}
422+
}
423+
}
424+
}
425+
return false;
426+
}
427+
390428
static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
391429
[[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
392430
[[maybe_unused]] const onnxruntime::Node& fused_node) {
@@ -445,6 +483,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
445483
}
446484
#endif
447485

486+
// Check if the graph is QDQ and has int16 or uint16 quantization
487+
// If so, we will apply the QDQ scales fix transformation (for GPU device only)
488+
bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);
489+
448490
const auto& onnx_model_path_name = subgraph.ModelPath();
449491
// QDQ stripping enabled only for the NPU and experimentally on the GPU
450492
if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -458,7 +500,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
458500
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
459501
return model_proto;
460502
} else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
461-
enable_ovep_qdq_optimizer) {
503+
is_qdq_graph_uint16_or_int16) {
462504
// Create a copy of the model
463505
std::unique_ptr<onnxruntime::Model> model;
464506
Status status = qdq_scales_fix::Transform(subgraph, logger, model);

0 commit comments

Comments
 (0)