@@ -387,6 +387,44 @@ static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
   return false;
 }
 
+static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
+  const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr;
+  return type_proto && type_proto->has_tensor_type() &&
+         (type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
+          type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
+}
+
+// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
+static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
+  std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
+  const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
+
+  for (size_t i = 0; i < node_indices.size(); i++) {
+    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
+
+    if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
+      const auto& input_defs = node->InputDefs();
+
+      if (node->OpType() == "DequantizeLinear") {
+        // DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
+        // Check quantized input tensor and optional zero point
+        if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      } else if (node->OpType() == "QuantizeLinear") {
+        // QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
+        const auto& output_defs = node->OutputDefs();
+        if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
                                 [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
                                 [[maybe_unused]] const onnxruntime::Node& fused_node) {
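
As a rough standalone illustration of the element-type predicate that Is16BitTensor applies in the hunk above, the sketch below runs the same check on a raw ONNX_NAMESPACE::TypeProto instead of an onnxruntime::NodeArg. This is a minimal sketch and not part of the patch: the Is16BitElemType helper name is invented for illustration, and it assumes the ONNX protobuf headers (onnx/onnx_pb.h) are on the include path with ONNX_NAMESPACE resolving to onnx, as in a default build.

#include <iostream>

#include "onnx/onnx_pb.h"  // assumption: ONNX protobuf headers are available

// Hypothetical standalone mirror of the Is16BitTensor predicate, working on a
// TypeProto directly rather than on an onnxruntime::NodeArg.
static bool Is16BitElemType(const ONNX_NAMESPACE::TypeProto& type_proto) {
  return type_proto.has_tensor_type() &&
         (type_proto.tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
          type_proto.tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
}

int main() {
  ONNX_NAMESPACE::TypeProto tp;
  tp.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_UINT16);
  std::cout << Is16BitElemType(tp) << "\n";  // 1: uint16 Q/DQ tensors are flagged
  tp.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8);
  std::cout << Is16BitElemType(tp) << "\n";  // 0: uint8 QDQ models are not affected
  return 0;
}
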
@@ -445,6 +483,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
   }
 #endif
 
+  // Check if the graph is QDQ and has int16 or uint16 quantization
+  // If so, we will apply the QDQ scales fix transformation (for GPU device only)
+  bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);
+
   const auto& onnx_model_path_name = subgraph.ModelPath();
   // QDQ stripping enabled only for the NPU and experimentally on the GPU
   if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -458,7 +500,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
     ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
     return model_proto;
   } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-             enable_ovep_qdq_optimizer) {
+             is_qdq_graph_uint16_or_int16) {
     // Create a copy of the model
     std::unique_ptr<onnxruntime::Model> model;
     Status status = qdq_scales_fix::Transform(subgraph, logger, model);
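
With the last hunk, the GPU branch that runs qdq_scales_fix::Transform is gated on the detected presence of 16-bit Q/DQ tensors (is_qdq_graph_uint16_or_int16) rather than on the enable_ovep_qdq_optimizer flag, so uint16/int16 QDQ models receive the scales fix on GPU independently of that option. Below is a simplified sketch of the resulting branch structure, with names shortened and the NPU-side condition elided exactly as it is in the hunk above; it is an illustration only, not the code in the patch.

#include <string>

// Simplified sketch of the selection logic after this change; device_type and the two
// booleans stand in for the members used in GetModelProtoFromFusedNode, and
// npu_qdq_stripping_applies is a placeholder for the NPU condition elided in the hunk.
enum class QdqPath { NpuStripping, GpuScalesFix, None };

static QdqPath SelectQdqPath(const std::string& device_type,
                             bool npu_qdq_stripping_applies,
                             bool is_qdq_graph_uint16_or_int16) {
  if (device_type.find("NPU") != std::string::npos && npu_qdq_stripping_applies) {
    return QdqPath::NpuStripping;  // QDQ stripping path
  } else if (device_type.find("GPU") != std::string::npos && is_qdq_graph_uint16_or_int16) {
    return QdqPath::GpuScalesFix;  // qdq_scales_fix::Transform path
  }
  return QdqPath::None;  // neither rewrite applies (fallback path not shown in this diff)
}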