From bbe32a588564c713145c428fc7e9d7a0a0ded688 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 1 Oct 2025 20:33:54 +0900 Subject: [PATCH 01/28] Adapt ConvertQuantizeDequantize for reusage in QDQStripping --- .../convert_quantize_dequantize.hpp | 4 +- .../convert_quantize_dequantize.cpp | 88 ++++++++----------- 2 files changed, 40 insertions(+), 52 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp b/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp index a1d3841010278f..044f8b1b32b985 100644 --- a/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp @@ -7,6 +7,7 @@ #include #include +#include "openvino/core/type/element_type.hpp" #include "openvino/pass/matcher_pass.hpp" #include "transformations_visibility.hpp" @@ -32,5 +33,6 @@ class TRANSFORMATIONS_API ConvertQuantizeDequantize; class ov::pass::ConvertQuantizeDequantize : public ov::pass::MatcherPass { public: OPENVINO_MATCHER_PASS_RTTI("ConvertQuantizeDequantize"); - ConvertQuantizeDequantize(); + ConvertQuantizeDequantize(const ov::element::TypeVector& supported_low_precisions = {ov::element::i8, ov::element::u8, ov::element::i16, ov::element::u16}, + const ov::element::TypeVector& supported_original_precisions = {ov::element::f32}); }; diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index 9f6f18f8f70cc6..3f83b4236e5fa1 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -62,27 +62,32 @@ // v // -ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize() { +ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( + const ov::element::TypeVector& supported_low_precisions, + const ov::element::TypeVector& supported_original_precisions) { MATCHER_SCOPE(ConvertQuantizeDequantize); - auto data_pattern = pass::pattern::any_input(); - auto input_low_pattern = pass::pattern::any_input(); - auto input_high_pattern = pass::pattern::any_input(); - auto output_low_pattern = ov::pass::pattern::wrap_type(); - auto output_high_pattern = ov::pass::pattern::wrap_type(); - auto fq_pattern = ov::pass::pattern::wrap_type( + + using namespace ov::pass::pattern; + using namespace ov::op; + + auto data_pattern = any_input(type_matches_any(supported_original_precisions)); + auto input_low_pattern = any_input(); + auto input_high_pattern = any_input(); + auto output_low_pattern = wrap_type(); + auto output_high_pattern = wrap_type(); + auto fq_pattern = wrap_type( {data_pattern, input_low_pattern, input_high_pattern, output_low_pattern, output_high_pattern}); - auto convert1_pattern = ov::pass::pattern::wrap_type( - {fq_pattern}, - pattern::type_matches_any({element::i8, element::u8, element::i16, element::u16})); + auto convert1_pattern = + wrap_type({fq_pattern}, type_matches_any(supported_low_precisions) && consumers_count(1)); auto convert2_pattern = - ov::pass::pattern::wrap_type({convert1_pattern}, pattern::type_matches(element::f32)); - auto zero_point_pattern = pass::pattern::any_input(); - auto sub_pattern = 
ov::pass::pattern::wrap_type({convert2_pattern, zero_point_pattern}, - pattern::consumers_count(1)); - auto scale_pattern = pass::pattern::any_input(); - auto mul_pattern = ov::pass::pattern::wrap_type({sub_pattern, scale_pattern}); - - ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](pattern::Matcher& m) { + wrap_type({convert1_pattern}, + type_matches_any(supported_original_precisions) && consumers_count(1)); + auto zero_point_pattern = any_input(); + auto sub_pattern = wrap_type({convert2_pattern, zero_point_pattern}, consumers_count(1)); + auto scale_pattern = any_input(); + auto mul_pattern = wrap_type({sub_pattern, scale_pattern}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { auto pattern_map = m.get_pattern_value_map(); if (transformation_callback(m.get_match_root())) { @@ -108,47 +113,27 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize() { auto convert2 = pattern_map[convert2_pattern]; auto mul = pattern_map[mul_pattern].get_node_shared_ptr(); - // convert1 and convert2 should have only one input - if (convert1.get_target_inputs().size() != 1) - return false; - if (convert2.get_target_inputs().size() != 1) + static const std::unordered_set supported_levels{256, 65536}; + const auto levels = fq->get_levels(); + if (!supported_levels.count(levels)) return false; - // we support: - // i8 or u8: 'levels' attribute must be 256 - // i16 or u16: 'levels' attribute must be 65536 - size_t levels = fq->get_levels(); - if (levels != 256 && levels != 65536) - return false; - - // check if (out_low_val, out_high_val) is (-128, 127) or (0, 255) or (-32768, 32767) or (0, 65535) float out_low_val; - if (!op::util::get_single_value(output_low, out_low_val)) + if (!ov::op::util::get_single_value(output_low, out_low_val)) return false; float out_high_val; - if (!op::util::get_single_value(output_high, out_high_val)) + if (!ov::op::util::get_single_value(output_high, out_high_val)) return false; + + static const std::unordered_map> supported_intervals{ + {ov::element::i8, {-128.f, 127.f}}, + {ov::element::u8, {0.f, 255.f}}, + {ov::element::i16, {-32768.f, 32767.f}}, + {ov::element::u16, {0.f, 65535.f}}}; const auto& type = convert1.get_element_type(); - switch (type) { - case element::Type_t::i8: - if (out_low_val != -128 || out_high_val != 127) - return false; - break; - case element::Type_t::u8: - if (out_low_val != 0 || out_high_val != 255) - return false; - break; - case element::Type_t::i16: - if (out_low_val != -32768 || out_high_val != 32767) - return false; - break; - case element::Type_t::u16: - if (out_low_val != 0 || out_high_val != 65535) - return false; - break; - default: + if (supported_intervals.count(type) == 0 || + supported_intervals.at(type) != std::make_pair(out_low_val, out_high_val)) return false; - } std::shared_ptr new_out_low = std::make_shared(std::make_shared(output_low, zero_point), @@ -181,6 +166,7 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize() { copy_runtime_info({fq, convert1.get_node_shared_ptr(), convert2.get_node_shared_ptr()}, new_fq); replace_node(mul, new_fq); + std::cout << "[ INFO ] ConvertQuantizeDequantize is finished for node " << new_fq->get_friendly_name() << std::endl; return true; }; From 1955d555aeafa3b1963bf331a18376706f0cd211 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 1 Oct 2025 20:34:19 +0900 Subject: [PATCH 02/28] QDQStripping initial implementation --- .../include/low_precision/qdq_stripping.hpp | 31 ++++++++ .../src/qdq_stripping.cpp | 77 
+++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 src/common/low_precision_transformations/include/low_precision/qdq_stripping.hpp create mode 100644 src/common/low_precision_transformations/src/qdq_stripping.cpp diff --git a/src/common/low_precision_transformations/include/low_precision/qdq_stripping.hpp b/src/common/low_precision_transformations/include/low_precision/qdq_stripping.hpp new file mode 100644 index 00000000000000..a16285db63bdb7 --- /dev/null +++ b/src/common/low_precision_transformations/include/low_precision/qdq_stripping.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "lpt_visibility.hpp" +#include "openvino/pass/matcher_pass.hpp" +#include "quantization_details.hpp" + +namespace ov { +namespace pass { +namespace low_precision { + +/** + * @ingroup ov_transformation_common_api + * @brief FQStrippingTransformation strips FakeQuantize operations with specified levels + * by replacing them with Clamp operations. + */ +class LP_TRANSFORMATIONS_API FQStrippingTransformation : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("FQStrippingTransformation", "0", MatcherPass); + FQStrippingTransformation(const std::set& levels_to_strip); +}; + +} // namespace low_precision +} // namespace pass +} // namespace ov \ No newline at end of file diff --git a/src/common/low_precision_transformations/src/qdq_stripping.cpp b/src/common/low_precision_transformations/src/qdq_stripping.cpp new file mode 100644 index 00000000000000..25cc2655518827 --- /dev/null +++ b/src/common/low_precision_transformations/src/qdq_stripping.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "low_precision/qdq_stripping.hpp" + +#include + +#include "itt.hpp" +#include "low_precision/common/ie_lpt_exception.hpp" +#include "low_precision/lpt_itt.hpp" +#include "low_precision/network_helper.hpp" +#include "openvino/core/except.hpp" +#include "openvino/core/type.hpp" +#include "openvino/op/clamp.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/fake_quantize.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/util/log.hpp" + +namespace ov { +namespace pass { +namespace low_precision { + +FQStrippingTransformation::FQStrippingTransformation(const std::set& levels_to_strip) { + MATCHER_SCOPE(FQStrippingTransformation); + auto is_scalar = [](const Output& output) -> bool { + return ov::shape_size(output.get_shape()) == 1; + }; + auto input_low_m = pattern::wrap_type(is_scalar); + auto input_high_m = pattern::wrap_type(is_scalar); + auto output_low_m = pattern::wrap_type(is_scalar); + auto output_high_m = pattern::wrap_type(is_scalar); + auto fq_m = pattern::wrap_type( + {pattern::any_input(), input_low_m, input_high_m, output_low_m, output_high_m}); + + ov::graph_rewrite_callback callback = [OV_CAPTURE_CPY_AND_THIS](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto node = ov::as_type_ptr(pattern_map.at(fq_m).get_node_shared_ptr()); + if (!node) { + return false; + } + + const size_t levels = node->get_levels(); + if (!levels_to_strip.count(levels)) { + std::cout << "[QDQStripping] Levels " << levels << " not in strip set, skipping" << std::endl; + return false; + } + + std::cout << "[QDQStripping] Levels " << levels << " found in strip set, proceeding with transformation" + << std::endl; + + auto input = 
node->get_input_node_shared_ptr(0); + auto output_low = ov::as_type_ptr(pattern_map.at(output_low_m).get_node_shared_ptr()); + auto output_high = ov::as_type_ptr(pattern_map.at(output_high_m).get_node_shared_ptr()); + + // TODO: need to check that input and output intervals are equal + if (!output_low || !output_high) { + std::cout << "[QDQStripping] Failed to get constant output_low or output_high nodes" << std::endl; + return false; + } + + auto clamp = std::make_shared(input->output(0), + output_low->cast_vector()[0], + output_high->cast_vector()[0]); + std::cout << "[ INFO ] clamp low = " << clamp->get_min() << ", high = " << clamp->get_max() << std::endl; + + return replace_node_update_name(node, clamp); + }; + + auto m = std::make_shared(fq_m, matcher_name); + this->register_matcher(m, callback); +} + +} // namespace low_precision +} // namespace pass +} // namespace ov \ No newline at end of file From 17d77aba30e14b1524a4b3d36653410d5bbb5267 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 1 Oct 2025 20:51:29 +0900 Subject: [PATCH 03/28] Finalize first QDQ Stripping implementation --- .../include/low_precision/qdq_stripping.hpp | 2 +- .../src/qdq_stripping.cpp | 37 ++++++++++++------- .../convert_quantize_dequantize.cpp | 1 - .../src/plugin/transformations_pipeline.cpp | 12 ++++++ 4 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/common/low_precision_transformations/include/low_precision/qdq_stripping.hpp b/src/common/low_precision_transformations/include/low_precision/qdq_stripping.hpp index a16285db63bdb7..318c0b8c6b5a26 100644 --- a/src/common/low_precision_transformations/include/low_precision/qdq_stripping.hpp +++ b/src/common/low_precision_transformations/include/low_precision/qdq_stripping.hpp @@ -23,7 +23,7 @@ namespace low_precision { class LP_TRANSFORMATIONS_API FQStrippingTransformation : public ov::pass::MatcherPass { public: OPENVINO_RTTI("FQStrippingTransformation", "0", MatcherPass); - FQStrippingTransformation(const std::set& levels_to_strip); + FQStrippingTransformation(const std::set& levels_to_strip, bool replace_with_clamp); }; } // namespace low_precision diff --git a/src/common/low_precision_transformations/src/qdq_stripping.cpp b/src/common/low_precision_transformations/src/qdq_stripping.cpp index 25cc2655518827..48f9f917d16d44 100644 --- a/src/common/low_precision_transformations/src/qdq_stripping.cpp +++ b/src/common/low_precision_transformations/src/qdq_stripping.cpp @@ -14,15 +14,17 @@ #include "openvino/core/type.hpp" #include "openvino/op/clamp.hpp" #include "openvino/op/constant.hpp" +#include "openvino/op/equal.hpp" #include "openvino/op/fake_quantize.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/util/log.hpp" +#include "transformations/utils/utils.hpp" namespace ov { namespace pass { namespace low_precision { -FQStrippingTransformation::FQStrippingTransformation(const std::set& levels_to_strip) { +FQStrippingTransformation::FQStrippingTransformation(const std::set& levels_to_strip, bool replace_with_clamp) { MATCHER_SCOPE(FQStrippingTransformation); auto is_scalar = [](const Output& output) -> bool { return ov::shape_size(output.get_shape()) == 1; @@ -43,29 +45,38 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev const size_t levels = node->get_levels(); if (!levels_to_strip.count(levels)) { - std::cout << "[QDQStripping] Levels " << levels << " not in strip set, skipping" << std::endl; return false; } - std::cout << "[QDQStripping] Levels " << levels << " found in 
strip set, proceeding with transformation" - << std::endl; - auto input = node->get_input_node_shared_ptr(0); + auto input_low = ov::as_type_ptr(pattern_map.at(input_low_m).get_node_shared_ptr()); + auto input_high = ov::as_type_ptr(pattern_map.at(input_high_m).get_node_shared_ptr()); auto output_low = ov::as_type_ptr(pattern_map.at(output_low_m).get_node_shared_ptr()); auto output_high = ov::as_type_ptr(pattern_map.at(output_high_m).get_node_shared_ptr()); // TODO: need to check that input and output intervals are equal - if (!output_low || !output_high) { - std::cout << "[QDQStripping] Failed to get constant output_low or output_high nodes" << std::endl; + if (!input_low || !input_high || !output_low || !output_high) { + return false; + } + auto constants_are_equal = [](const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + auto equal = ov::as_type_ptr(ov::op::util::make_try_fold(lhs, rhs)); + OPENVINO_ASSERT(equal && ov::shape_size(equal->get_shape()) == 1, + "constants_are_equal expects scalar constant as a comparison result"); + return equal->get_vector()[0] == true; + }; + if (!constants_are_equal(input_low, output_low) || !constants_are_equal(input_high, output_high)) { return false; } - auto clamp = std::make_shared(input->output(0), - output_low->cast_vector()[0], - output_high->cast_vector()[0]); - std::cout << "[ INFO ] clamp low = " << clamp->get_min() << ", high = " << clamp->get_max() << std::endl; - - return replace_node_update_name(node, clamp); + if (replace_with_clamp) { + auto clamp = std::make_shared(input->output(0), + output_low->cast_vector()[0], + output_high->cast_vector()[0]); + return replace_node_update_name(node, clamp); + } else { + return replace_output_update_name(node->output(0), node->input_value(0)); + } }; auto m = std::make_shared(fq_m, matcher_name); diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index 3f83b4236e5fa1..38b979ceb7cec8 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -166,7 +166,6 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( copy_runtime_info({fq, convert1.get_node_shared_ptr(), convert2.get_node_shared_ptr()}, new_fq); replace_node(mul, new_fq); - std::cout << "[ INFO ] ConvertQuantizeDequantize is finished for node " << new_fq->get_friendly_name() << std::endl; return true; }; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 6beeaa3a5081ea..7d28c2f10321db 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -24,6 +24,7 @@ #include "low_precision/fold_convert.hpp" #include "low_precision/fuse_convert.hpp" #include "low_precision/group_convolution.hpp" +#include "low_precision/qdq_stripping.hpp" #include "low_precision/low_precision.hpp" #include "low_precision/mat_mul.hpp" #include "low_precision/multiply_to_group_convolution.hpp" @@ -389,6 +390,17 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto is_model_quantized = ov::pass::low_precision::LowPrecision::isFunctionQuantized(func); enableInt8 = config.get_enable_lp_transformations() && is_model_quantized; + { + using 
namespace ov::pass::low_precision; + // QDQ stripping pipeline + // 1. Transform DQ part to canonicalized form: Multiply->Add => Subtract->Multiply + manager.register_pass(); + // 2. Fuse FQ->Convert->DQ to a single FQ + manager.register_pass(ov::element::TypeVector{ov::element::i16, ov::element::u16}); + // 3. Strip FQ layers with unsupported levels + bool replace_with_clamp = false; + manager.register_pass(std::set{levels::int16}, replace_with_clamp); + } manager.register_pass( std::vector{ ov::element::i8, ov::element::u8, ov::element::i4, ov::element::u4 }, From e4f6d6a89be8e0dba945d84de6417d221bfbaaad Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 1 Oct 2025 16:43:58 +0200 Subject: [PATCH 04/28] Clang format --- .../src/qdq_stripping.cpp | 12 ++++++------ .../convert_quantize_dequantize.hpp | 5 ++++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/common/low_precision_transformations/src/qdq_stripping.cpp b/src/common/low_precision_transformations/src/qdq_stripping.cpp index 48f9f917d16d44..fe695f201dfb7a 100644 --- a/src/common/low_precision_transformations/src/qdq_stripping.cpp +++ b/src/common/low_precision_transformations/src/qdq_stripping.cpp @@ -53,14 +53,14 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev auto input_high = ov::as_type_ptr(pattern_map.at(input_high_m).get_node_shared_ptr()); auto output_low = ov::as_type_ptr(pattern_map.at(output_low_m).get_node_shared_ptr()); auto output_high = ov::as_type_ptr(pattern_map.at(output_high_m).get_node_shared_ptr()); - - // TODO: need to check that input and output intervals are equal + if (!input_low || !input_high || !output_low || !output_high) { return false; } auto constants_are_equal = [](const std::shared_ptr& lhs, const std::shared_ptr& rhs) { - auto equal = ov::as_type_ptr(ov::op::util::make_try_fold(lhs, rhs)); + auto equal = + ov::as_type_ptr(ov::op::util::make_try_fold(lhs, rhs)); OPENVINO_ASSERT(equal && ov::shape_size(equal->get_shape()) == 1, "constants_are_equal expects scalar constant as a comparison result"); return equal->get_vector()[0] == true; @@ -83,6 +83,6 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev this->register_matcher(m, callback); } -} // namespace low_precision -} // namespace pass -} // namespace ov \ No newline at end of file +} // namespace low_precision +} // namespace pass +} // namespace ov \ No newline at end of file diff --git a/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp b/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp index 044f8b1b32b985..dda3948612d4b3 100644 --- a/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp @@ -33,6 +33,9 @@ class TRANSFORMATIONS_API ConvertQuantizeDequantize; class ov::pass::ConvertQuantizeDequantize : public ov::pass::MatcherPass { public: OPENVINO_MATCHER_PASS_RTTI("ConvertQuantizeDequantize"); - ConvertQuantizeDequantize(const ov::element::TypeVector& supported_low_precisions = {ov::element::i8, ov::element::u8, ov::element::i16, ov::element::u16}, + ConvertQuantizeDequantize(const ov::element::TypeVector& supported_low_precisions = {ov::element::i8, + ov::element::u8, + ov::element::i16, + ov::element::u16}, const ov::element::TypeVector& supported_original_precisions = {ov::element::f32}); }; 
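Note on how the pieces above fit together: patches 01-03 extend ConvertQuantizeDequantize so it can re-fuse a 16-bit Quantize/Dequantize chain back into a single FakeQuantize, and add FQStrippingTransformation to replace such FakeQuantize ops with Clamp. A minimal standalone sketch of that flow follows. It is illustrative only: the helper name strip_u16_qdq is invented here, the std::set<size_t> template argument is an assumption (angle brackets are stripped in this copy of the patch), and the DQ canonicalization pass registered as step 1 in patch 03 is omitted because its name is likewise not legible in this copy.

#include <memory>
#include <set>

#include "low_precision/qdq_stripping.hpp"
#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/common_optimizations/convert_quantize_dequantize.hpp"

// Hypothetical helper; mirrors the registration order added to
// transformations_pipeline.cpp in patch 03, under the assumptions stated above.
void strip_u16_qdq(const std::shared_ptr<ov::Model>& model) {
    ov::pass::Manager manager;
    // Fuse FakeQuantize -> Convert -> (Subtract) -> Multiply back into a single 16-bit FakeQuantize.
    manager.register_pass<ov::pass::ConvertQuantizeDequantize>(
        ov::element::TypeVector{ov::element::i16, ov::element::u16},
        ov::element::TypeVector{ov::element::f32});
    // Strip the remaining 65536-level FakeQuantize ops, replacing each with a Clamp
    // (65536 corresponds to the levels::int16 value used by the GPU plugin).
    manager.register_pass<ov::pass::low_precision::FQStrippingTransformation>(
        std::set<size_t>{65536}, /*replace_with_clamp=*/true);
    manager.run_passes(model);
}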
From 1640fdb9ae50144e317347ea0a5996602b9b0f5d Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 2 Oct 2025 00:56:30 +0900 Subject: [PATCH 05/28] set replace_with_clamp to true --- src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 7d28c2f10321db..63c642908566b4 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -398,7 +398,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // 2. Fuse FQ->Convert->DQ to a single FQ manager.register_pass(ov::element::TypeVector{ov::element::i16, ov::element::u16}); // 3. Strip FQ layers with unsupported levels - bool replace_with_clamp = false; + bool replace_with_clamp = true; manager.register_pass(std::set{levels::int16}, replace_with_clamp); } From d6130ae6b264bf729451210ff3447b1b9aff88c0 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 3 Oct 2025 01:36:59 +0900 Subject: [PATCH 06/28] Avoid main LPT pipeline in case of non-u8 activations quantization --- .../low_precision_transformations/src/low_precision.cpp | 2 +- src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/low_precision_transformations/src/low_precision.cpp b/src/common/low_precision_transformations/src/low_precision.cpp index 5996f0332c968c..840bc30404e34a 100644 --- a/src/common/low_precision_transformations/src/low_precision.cpp +++ b/src/common/low_precision_transformations/src/low_precision.cpp @@ -323,7 +323,7 @@ bool LowPrecision::isFunctionQuantized(const std::shared_ptr& m } else if (const auto multiSubGraph = ov::as_type_ptr(node)) { // Look inside subraph operations, such as TensorIterator, Loop, If, etc for (size_t i = 0; i < multiSubGraph->get_internal_subgraphs_size(); i++) { - if (isFunctionQuantized(multiSubGraph->get_function(i))) { + if (isFunctionQuantized(multiSubGraph->get_function(i), supported_levels, check_fake_convert)) { return true; } } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 63c642908566b4..48232f11104af3 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -388,10 +388,10 @@ void TransformationsPipeline::apply(std::shared_ptr func) { ov::disable_keep_const_precision(node); } - auto is_model_quantized = ov::pass::low_precision::LowPrecision::isFunctionQuantized(func); + using namespace ov::pass::low_precision; + auto is_model_quantized = LowPrecision::isFunctionQuantized(func, std::set{levels::int8, levels::int8_narrow_range}); enableInt8 = config.get_enable_lp_transformations() && is_model_quantized; { - using namespace ov::pass::low_precision; // QDQ stripping pipeline // 1. 
Transform DQ part to canonicalized form: Multiply->Add => Subtract->Multiply manager.register_pass(); From 5824c91182543a43c568ae3c8a41bee534a987f4 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 28 Oct 2025 02:24:45 +0900 Subject: [PATCH 07/28] WIP: some fixes + debug info --- .../convert_quantize_dequantize.cpp | 87 +++++++++++++++---- .../src/plugin/transformations_pipeline.cpp | 4 +- .../tests/functional/subgraph_tests/test.cpp | 24 +++++ 3 files changed, 95 insertions(+), 20 deletions(-) create mode 100644 src/plugins/intel_gpu/tests/functional/subgraph_tests/test.cpp diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index 38b979ceb7cec8..6b191293615242 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -16,6 +16,7 @@ #include "openvino/op/fake_quantize.hpp" #include "openvino/op/multiply.hpp" #include "openvino/op/subtract.hpp" +#include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" @@ -62,6 +63,13 @@ // v // +#define LOG_INFO(...) \ + do { \ + if (std::getenv("QDQ_STRIPPING_LOG")) { \ + std::cout << __VA_ARGS__; \ + } \ + } while (0) + ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( const ov::element::TypeVector& supported_low_precisions, const ov::element::TypeVector& supported_original_precisions) { @@ -83,35 +91,36 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( wrap_type({convert1_pattern}, type_matches_any(supported_original_precisions) && consumers_count(1)); auto zero_point_pattern = any_input(); - auto sub_pattern = wrap_type({convert2_pattern, zero_point_pattern}, consumers_count(1)); + auto sub_pattern = optional({convert2_pattern, zero_point_pattern}, consumers_count(1)); auto scale_pattern = any_input(); auto mul_pattern = wrap_type({sub_pattern, scale_pattern}); ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { auto pattern_map = m.get_pattern_value_map(); + LOG_INFO("[ INFO ] ConvertQuantizeDequantize matched\n"); if (transformation_callback(m.get_match_root())) { return false; } - auto data = pattern_map[data_pattern]; - auto input_low = pattern_map[input_low_pattern]; - auto input_high = pattern_map[input_high_pattern]; - auto output_low = ov::as_type_ptr(pattern_map[output_low_pattern].get_node_shared_ptr()); + auto data = pattern_map.at(data_pattern); + auto input_low = pattern_map.at(input_low_pattern); + auto input_high = pattern_map.at(input_high_pattern); + auto output_low = + ov::as_type_ptr(pattern_map.at(output_low_pattern).get_node_shared_ptr()); if (!output_low) return false; auto output_high = - ov::as_type_ptr(pattern_map[output_high_pattern].get_node_shared_ptr()); + ov::as_type_ptr(pattern_map.at(output_high_pattern).get_node_shared_ptr()); if (!output_high) return false; - auto fq = ov::as_type_ptr(pattern_map[fq_pattern].get_node_shared_ptr()); + auto fq = ov::as_type_ptr(pattern_map.at(fq_pattern).get_node_shared_ptr()); if (!fq) return false; - auto zero_point = pattern_map[zero_point_pattern]; - auto scale = pattern_map[scale_pattern]; - auto convert1 = pattern_map[convert1_pattern]; - auto convert2 = pattern_map[convert2_pattern]; - auto mul = 
pattern_map[mul_pattern].get_node_shared_ptr(); + auto scale = pattern_map.at(scale_pattern); + auto convert1 = pattern_map.at(convert1_pattern); + auto convert2 = pattern_map.at(convert2_pattern); + auto mul = pattern_map.at(mul_pattern).get_node_shared_ptr(); static const std::unordered_set supported_levels{256, 65536}; const auto levels = fq->get_levels(); @@ -131,16 +140,55 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( {ov::element::i16, {-32768.f, 32767.f}}, {ov::element::u16, {0.f, 65535.f}}}; const auto& type = convert1.get_element_type(); + LOG_INFO("[ INFO ] Matched nodes info\n"); + LOG_INFO("\t data = " << data << std::endl); + LOG_INFO("\t fq = " << fq << std::endl); +#define LOG_CONSTANT_VALUE(name, value) \ + do { \ + if (auto const_node = ov::as_type_ptr(value.get_node_shared_ptr())) { \ + if (ov::shape_size(const_node->get_shape()) == 1) { \ + LOG_INFO("\t " << name << " = " << value \ + << " (constant value: " << const_node->cast_vector()[0] << ")" << std::endl); \ + } else { \ + LOG_INFO("\t " << name << " = " << value << std::endl); \ + } \ + } else { \ + LOG_INFO("\t " << name << " = " << value << std::endl); \ + } \ + } while (0); + + LOG_CONSTANT_VALUE("input_low", input_low); + LOG_CONSTANT_VALUE("input_high", input_high); + LOG_CONSTANT_VALUE("output_low", pattern_map.at(output_low_pattern)); + LOG_CONSTANT_VALUE("output_high", pattern_map.at(output_high_pattern)); + LOG_INFO("\t convert1 = " << convert1 << std::endl); + LOG_INFO("\t convert2 = " << convert2 << std::endl); + if (pattern_map.count(zero_point_pattern)) { + const auto& shift = pattern_map.at(zero_point_pattern); + const auto& subtract = pattern_map.at(sub_pattern); + LOG_CONSTANT_VALUE("shift", shift); + LOG_INFO("\t subtract = " << subtract << std::endl); + } else { + LOG_INFO("\t zero_point not present\n"); + } + LOG_CONSTANT_VALUE("scale", scale); + LOG_INFO("\t mul = " << mul << std::endl); + if (supported_intervals.count(type) == 0 || - supported_intervals.at(type) != std::make_pair(out_low_val, out_high_val)) + supported_intervals.at(type) != std::make_pair(out_low_val, out_high_val)) { + LOG_INFO("[ INFO ] ConvertQuantizeDequantize: unsupported intervals\n"); return false; + } - std::shared_ptr new_out_low = - std::make_shared(std::make_shared(output_low, zero_point), - scale); - std::shared_ptr new_out_high = - std::make_shared(std::make_shared(output_high, zero_point), - scale); + const bool has_zero_point = pattern_map.count(zero_point_pattern); + std::shared_ptr new_out_low = output_low, new_out_high = output_high; + if (has_zero_point) { + const auto& zero_point = pattern_map.at(zero_point_pattern); + new_out_low = std::make_shared(new_out_low, zero_point); + new_out_high = std::make_shared(new_out_high, zero_point); + } + new_out_low = std::make_shared(new_out_low, scale); + new_out_high = std::make_shared(new_out_high, scale); // check if new_out_low/high shapes are broadcastable to FQ's input auto data_shape = data.get_partial_shape(); @@ -166,6 +214,7 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( copy_runtime_info({fq, convert1.get_node_shared_ptr(), convert2.get_node_shared_ptr()}, new_fq); replace_node(mul, new_fq); + LOG_INFO("[ INFO ] ConvertQuantizeDequantize: transformation finished\n"); return true; }; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 48232f11104af3..b6201dca0e64d6 100644 --- 
a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -392,11 +392,13 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto is_model_quantized = LowPrecision::isFunctionQuantized(func, std::set{levels::int8, levels::int8_narrow_range}); enableInt8 = config.get_enable_lp_transformations() && is_model_quantized; { + using namespace ov::element; // QDQ stripping pipeline // 1. Transform DQ part to canonicalized form: Multiply->Add => Subtract->Multiply manager.register_pass(); // 2. Fuse FQ->Convert->DQ to a single FQ - manager.register_pass(ov::element::TypeVector{ov::element::i16, ov::element::u16}); + manager.register_pass(TypeVector{i16, u16, i32}, + TypeVector{f16, f32}); // 3. Strip FQ layers with unsupported levels bool replace_with_clamp = true; manager.register_pass(std::set{levels::int16}, replace_with_clamp); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/test.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/test.cpp new file mode 100644 index 00000000000000..2778043489ff90 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/test.cpp @@ -0,0 +1,24 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "openvino/core/core.hpp" + +namespace { + +class QDQStrippingTest : virtual public ov::test::SubgraphBaseStaticTest { +public: + +protected: + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_GPU; + function = core->read_model("/home/guest/golubevv/openvino/bin/intel64/RelWithDebInfo/1_Conv.onnx"); + } +}; + +TEST_F(QDQStrippingTest, Inference) { + run(); +} + +} // namespace From b4c320ab7e7ec17f063c68da82e932eefddf5407 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 29 Oct 2025 22:51:32 +0900 Subject: [PATCH 08/28] Logging extended --- .../src/qdq_stripping.cpp | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/common/low_precision_transformations/src/qdq_stripping.cpp b/src/common/low_precision_transformations/src/qdq_stripping.cpp index fe695f201dfb7a..15c75730e839a0 100644 --- a/src/common/low_precision_transformations/src/qdq_stripping.cpp +++ b/src/common/low_precision_transformations/src/qdq_stripping.cpp @@ -24,6 +24,13 @@ namespace ov { namespace pass { namespace low_precision { +#define LOG_INFO(...) 
\ + do { \ + if (std::getenv("QDQ_STRIPPING_LOG")) { \ + std::cout << __VA_ARGS__; \ + } \ + } while (0) + FQStrippingTransformation::FQStrippingTransformation(const std::set& levels_to_strip, bool replace_with_clamp) { MATCHER_SCOPE(FQStrippingTransformation); auto is_scalar = [](const Output& output) -> bool { @@ -39,12 +46,15 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev ov::graph_rewrite_callback callback = [OV_CAPTURE_CPY_AND_THIS](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); auto node = ov::as_type_ptr(pattern_map.at(fq_m).get_node_shared_ptr()); + LOG_INFO("[ INFO ] FQStrippingTransformation matched\n"); if (!node) { return false; } + LOG_INFO("[ INFO ] fq = " << node << std::endl); const size_t levels = node->get_levels(); if (!levels_to_strip.count(levels)) { + LOG_INFO("\t check failed: levels are not matched\n"); return false; } @@ -55,6 +65,7 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev auto output_high = ov::as_type_ptr(pattern_map.at(output_high_m).get_node_shared_ptr()); if (!input_low || !input_high || !output_low || !output_high) { + LOG_INFO("\t check failed: il_ih/ol_oh are not constant\n"); return false; } auto constants_are_equal = [](const std::shared_ptr& lhs, @@ -65,18 +76,40 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev "constants_are_equal expects scalar constant as a comparison result"); return equal->get_vector()[0] == true; }; +#define LOG_CONSTANT_VALUE(name, value) \ + do { \ + if (auto const_node = ov::as_type_ptr(value)) { \ + if (ov::shape_size(const_node->get_shape()) == 1) { \ + LOG_INFO("\t " << name << " = " << value \ + << " (constant value: " << const_node->cast_vector()[0] << ")" << std::endl); \ + } else { \ + LOG_INFO("\t " << name << " = " << value << std::endl); \ + } \ + } else { \ + LOG_INFO("\t " << name << " = " << value << std::endl); \ + } \ + } while (0); + LOG_CONSTANT_VALUE("\t input_low", input_low); + LOG_CONSTANT_VALUE("\t output_low", output_low); + LOG_CONSTANT_VALUE("\t input_high", input_high); + LOG_CONSTANT_VALUE("\t output_high", output_high); if (!constants_are_equal(input_low, output_low) || !constants_are_equal(input_high, output_high)) { + LOG_INFO("\t check failed: constants_are_not equal\n"); return false; } + bool res = false; if (replace_with_clamp) { auto clamp = std::make_shared(input->output(0), output_low->cast_vector()[0], output_high->cast_vector()[0]); - return replace_node_update_name(node, clamp); + res = replace_node_update_name(node, clamp); } else { - return replace_output_update_name(node->output(0), node->input_value(0)); + res = replace_output_update_name(node->output(0), node->input_value(0)); } + OPENVINO_ASSERT(res, "FQ stripping failed"); + LOG_INFO("\t transformation finished\n"); + return res; }; auto m = std::make_shared(fq_m, matcher_name); From e771761786ad49322905f98cdd4fd57ef05be782 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 29 Oct 2025 23:24:22 +0900 Subject: [PATCH 09/28] added graphs serialization --- .../intel_gpu/src/plugin/transformations_pipeline.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index b6201dca0e64d6..777453e8990c9e 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -391,17 +391,27 @@ void 
TransformationsPipeline::apply(std::shared_ptr func) { using namespace ov::pass::low_precision; auto is_model_quantized = LowPrecision::isFunctionQuantized(func, std::set{levels::int8, levels::int8_narrow_range}); enableInt8 = config.get_enable_lp_transformations() && is_model_quantized; +#define SERIALIZE_GRAPHS(name) \ + do { \ + if (std::getenv("QDQ_STRIPPING_SERIALIZE")) { \ + manager.register_pass(name, ""); \ + } \ + } while (0) { using namespace ov::element; // QDQ stripping pipeline // 1. Transform DQ part to canonicalized form: Multiply->Add => Subtract->Multiply + SERIALIZE_GRAPHS("before.xml"); manager.register_pass(); + SERIALIZE_GRAPHS("add_transformation.xml"); // 2. Fuse FQ->Convert->DQ to a single FQ manager.register_pass(TypeVector{i16, u16, i32}, TypeVector{f16, f32}); + SERIALIZE_GRAPHS("convert_qdq.xml"); // 3. Strip FQ layers with unsupported levels bool replace_with_clamp = true; manager.register_pass(std::set{levels::int16}, replace_with_clamp); + SERIALIZE_GRAPHS("fq_stripping.xml"); } manager.register_pass( From 8d46d495b721db04f9f39d55cf69e7963d772a91 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 30 Oct 2025 01:22:23 +0900 Subject: [PATCH 10/28] Further debug logging extending --- .../convert_quantize_dequantize.cpp | 6 +++++- .../src/plugin/transformations_pipeline.cpp | 19 ++++++++++--------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index 6b191293615242..a89dd3a81635c8 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -145,7 +145,11 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( LOG_INFO("\t fq = " << fq << std::endl); #define LOG_CONSTANT_VALUE(name, value) \ do { \ - if (auto const_node = ov::as_type_ptr(value.get_node_shared_ptr())) { \ + auto node = value.get_node_shared_ptr(); \ + if (ov::is_type(node)) { \ + node = node->get_input_node_shared_ptr(0); \ + } \ + if (auto const_node = ov::as_type_ptr(node)) { \ if (ov::shape_size(const_node->get_shape()) == 1) { \ LOG_INFO("\t " << name << " = " << value \ << " (constant value: " << const_node->cast_vector()[0] << ")" << std::endl); \ diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 777453e8990c9e..29319498c81050 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -391,27 +391,28 @@ void TransformationsPipeline::apply(std::shared_ptr func) { using namespace ov::pass::low_precision; auto is_model_quantized = LowPrecision::isFunctionQuantized(func, std::set{levels::int8, levels::int8_narrow_range}); enableInt8 = config.get_enable_lp_transformations() && is_model_quantized; -#define SERIALIZE_GRAPHS(name) \ - do { \ - if (std::getenv("QDQ_STRIPPING_SERIALIZE")) { \ - manager.register_pass(name, ""); \ - } \ +#define SERIALIZE_GRAPHS(name) \ + do { \ + if (std::getenv("QDQ_STRIPPING_SERIALIZE")) { \ + manager.register_pass(std::string("qdq_stripping_dumps/") + name + std::string(".xml"), ""); \ + manager.register_pass(std::string("qdq_stripping_dumps/") + name + std::string(".svg")); \ + } \ } while (0) 
{ using namespace ov::element; // QDQ stripping pipeline // 1. Transform DQ part to canonicalized form: Multiply->Add => Subtract->Multiply - SERIALIZE_GRAPHS("before.xml"); + SERIALIZE_GRAPHS("before"); manager.register_pass(); - SERIALIZE_GRAPHS("add_transformation.xml"); + SERIALIZE_GRAPHS("add_transformation"); // 2. Fuse FQ->Convert->DQ to a single FQ manager.register_pass(TypeVector{i16, u16, i32}, TypeVector{f16, f32}); - SERIALIZE_GRAPHS("convert_qdq.xml"); + SERIALIZE_GRAPHS("convert_qdq"); // 3. Strip FQ layers with unsupported levels bool replace_with_clamp = true; manager.register_pass(std::set{levels::int16}, replace_with_clamp); - SERIALIZE_GRAPHS("fq_stripping.xml"); + SERIALIZE_GRAPHS("fq_stripping"); } manager.register_pass( From dbba54b766c1d2bb07667ce3906ec88c65a43cbd Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 31 Oct 2025 03:19:20 +0900 Subject: [PATCH 11/28] ConvertQuantizeDequantize: ignore consumers_count check --- .../convert_quantize_dequantize.hpp | 3 ++- .../convert_quantize_dequantize.cpp | 20 ++++++++++++------- .../src/plugin/transformations_pipeline.cpp | 3 +-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp b/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp index dda3948612d4b3..6a09c035305f78 100644 --- a/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/convert_quantize_dequantize.hpp @@ -37,5 +37,6 @@ class ov::pass::ConvertQuantizeDequantize : public ov::pass::MatcherPass { ov::element::u8, ov::element::i16, ov::element::u16}, - const ov::element::TypeVector& supported_original_precisions = {ov::element::f32}); + const ov::element::TypeVector& supported_original_precisions = {ov::element::f32}, + const bool ignore_consumers_count_check = false); }; diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index a89dd3a81635c8..76a8073b42a4ff 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -72,7 +72,8 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( const ov::element::TypeVector& supported_low_precisions, - const ov::element::TypeVector& supported_original_precisions) { + const ov::element::TypeVector& supported_original_precisions, + const bool ignore_consumers_count_check) { MATCHER_SCOPE(ConvertQuantizeDequantize); using namespace ov::pass::pattern; @@ -85,13 +86,18 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( auto output_high_pattern = wrap_type(); auto fq_pattern = wrap_type( {data_pattern, input_low_pattern, input_high_pattern, output_low_pattern, output_high_pattern}); - auto convert1_pattern = - wrap_type({fq_pattern}, type_matches_any(supported_low_precisions) && consumers_count(1)); - auto convert2_pattern = - wrap_type({convert1_pattern}, - type_matches_any(supported_original_precisions) && consumers_count(1)); + op::Predicate convert1_predicate = ignore_consumers_count_check + ? 
type_matches_any(supported_low_precisions) + : type_matches_any(supported_low_precisions) && consumers_count(1); + auto convert1_pattern = wrap_type({fq_pattern}, convert1_predicate); + op::Predicate convert2_predicate = ignore_consumers_count_check + ? type_matches_any(supported_original_precisions) + : type_matches_any(supported_original_precisions) && consumers_count(1); + auto convert2_pattern = wrap_type({convert1_pattern}, convert2_predicate); + auto zero_point_pattern = any_input(); - auto sub_pattern = optional({convert2_pattern, zero_point_pattern}, consumers_count(1)); + op::Predicate sub_predicate = ignore_consumers_count_check ? op::Predicate() : consumers_count(1); + auto sub_pattern = optional({convert2_pattern, zero_point_pattern}, sub_predicate); auto scale_pattern = any_input(); auto mul_pattern = wrap_type({sub_pattern, scale_pattern}); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 29319498c81050..694e0730dd4202 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -406,8 +406,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); SERIALIZE_GRAPHS("add_transformation"); // 2. Fuse FQ->Convert->DQ to a single FQ - manager.register_pass(TypeVector{i16, u16, i32}, - TypeVector{f16, f32}); + manager.register_pass(TypeVector{i16, u16, i32}, TypeVector{f16, f32}, true); SERIALIZE_GRAPHS("convert_qdq"); // 3. Strip FQ layers with unsupported levels bool replace_with_clamp = true; From 95de48171e3369a2dce4db30cc9df806f655df8d Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 31 Oct 2025 18:43:57 +0900 Subject: [PATCH 12/28] Cleanup --- .../src/qdq_stripping.cpp | 31 +------------ .../convert_quantize_dequantize.cpp | 43 +------------------ .../src/plugin/transformations_pipeline.cpp | 11 ----- 3 files changed, 2 insertions(+), 83 deletions(-) diff --git a/src/common/low_precision_transformations/src/qdq_stripping.cpp b/src/common/low_precision_transformations/src/qdq_stripping.cpp index 15c75730e839a0..b0422577ad1c31 100644 --- a/src/common/low_precision_transformations/src/qdq_stripping.cpp +++ b/src/common/low_precision_transformations/src/qdq_stripping.cpp @@ -24,13 +24,6 @@ namespace ov { namespace pass { namespace low_precision { -#define LOG_INFO(...) 
\ - do { \ - if (std::getenv("QDQ_STRIPPING_LOG")) { \ - std::cout << __VA_ARGS__; \ - } \ - } while (0) - FQStrippingTransformation::FQStrippingTransformation(const std::set& levels_to_strip, bool replace_with_clamp) { MATCHER_SCOPE(FQStrippingTransformation); auto is_scalar = [](const Output& output) -> bool { @@ -46,15 +39,12 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev ov::graph_rewrite_callback callback = [OV_CAPTURE_CPY_AND_THIS](pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); auto node = ov::as_type_ptr(pattern_map.at(fq_m).get_node_shared_ptr()); - LOG_INFO("[ INFO ] FQStrippingTransformation matched\n"); if (!node) { return false; } - LOG_INFO("[ INFO ] fq = " << node << std::endl); const size_t levels = node->get_levels(); if (!levels_to_strip.count(levels)) { - LOG_INFO("\t check failed: levels are not matched\n"); return false; } @@ -65,7 +55,6 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev auto output_high = ov::as_type_ptr(pattern_map.at(output_high_m).get_node_shared_ptr()); if (!input_low || !input_high || !output_low || !output_high) { - LOG_INFO("\t check failed: il_ih/ol_oh are not constant\n"); return false; } auto constants_are_equal = [](const std::shared_ptr& lhs, @@ -76,25 +65,8 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev "constants_are_equal expects scalar constant as a comparison result"); return equal->get_vector()[0] == true; }; -#define LOG_CONSTANT_VALUE(name, value) \ - do { \ - if (auto const_node = ov::as_type_ptr(value)) { \ - if (ov::shape_size(const_node->get_shape()) == 1) { \ - LOG_INFO("\t " << name << " = " << value \ - << " (constant value: " << const_node->cast_vector()[0] << ")" << std::endl); \ - } else { \ - LOG_INFO("\t " << name << " = " << value << std::endl); \ - } \ - } else { \ - LOG_INFO("\t " << name << " = " << value << std::endl); \ - } \ - } while (0); - LOG_CONSTANT_VALUE("\t input_low", input_low); - LOG_CONSTANT_VALUE("\t output_low", output_low); - LOG_CONSTANT_VALUE("\t input_high", input_high); - LOG_CONSTANT_VALUE("\t output_high", output_high); + if (!constants_are_equal(input_low, output_low) || !constants_are_equal(input_high, output_high)) { - LOG_INFO("\t check failed: constants_are_not equal\n"); return false; } @@ -108,7 +80,6 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev res = replace_output_update_name(node->output(0), node->input_value(0)); } OPENVINO_ASSERT(res, "FQ stripping failed"); - LOG_INFO("\t transformation finished\n"); return res; }; diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index 76a8073b42a4ff..36beef4a769ae6 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -63,13 +63,6 @@ // v // -#define LOG_INFO(...) 
\ - do { \ - if (std::getenv("QDQ_STRIPPING_LOG")) { \ - std::cout << __VA_ARGS__; \ - } \ - } while (0) - ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( const ov::element::TypeVector& supported_low_precisions, const ov::element::TypeVector& supported_original_precisions, @@ -103,7 +96,6 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { auto pattern_map = m.get_pattern_value_map(); - LOG_INFO("[ INFO ] ConvertQuantizeDequantize matched\n"); if (transformation_callback(m.get_match_root())) { return false; @@ -146,47 +138,15 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( {ov::element::i16, {-32768.f, 32767.f}}, {ov::element::u16, {0.f, 65535.f}}}; const auto& type = convert1.get_element_type(); - LOG_INFO("[ INFO ] Matched nodes info\n"); - LOG_INFO("\t data = " << data << std::endl); - LOG_INFO("\t fq = " << fq << std::endl); -#define LOG_CONSTANT_VALUE(name, value) \ - do { \ - auto node = value.get_node_shared_ptr(); \ - if (ov::is_type(node)) { \ - node = node->get_input_node_shared_ptr(0); \ - } \ - if (auto const_node = ov::as_type_ptr(node)) { \ - if (ov::shape_size(const_node->get_shape()) == 1) { \ - LOG_INFO("\t " << name << " = " << value \ - << " (constant value: " << const_node->cast_vector()[0] << ")" << std::endl); \ - } else { \ - LOG_INFO("\t " << name << " = " << value << std::endl); \ - } \ - } else { \ - LOG_INFO("\t " << name << " = " << value << std::endl); \ - } \ - } while (0); - - LOG_CONSTANT_VALUE("input_low", input_low); - LOG_CONSTANT_VALUE("input_high", input_high); - LOG_CONSTANT_VALUE("output_low", pattern_map.at(output_low_pattern)); - LOG_CONSTANT_VALUE("output_high", pattern_map.at(output_high_pattern)); - LOG_INFO("\t convert1 = " << convert1 << std::endl); - LOG_INFO("\t convert2 = " << convert2 << std::endl); + if (pattern_map.count(zero_point_pattern)) { const auto& shift = pattern_map.at(zero_point_pattern); const auto& subtract = pattern_map.at(sub_pattern); - LOG_CONSTANT_VALUE("shift", shift); - LOG_INFO("\t subtract = " << subtract << std::endl); } else { - LOG_INFO("\t zero_point not present\n"); } - LOG_CONSTANT_VALUE("scale", scale); - LOG_INFO("\t mul = " << mul << std::endl); if (supported_intervals.count(type) == 0 || supported_intervals.at(type) != std::make_pair(out_low_val, out_high_val)) { - LOG_INFO("[ INFO ] ConvertQuantizeDequantize: unsupported intervals\n"); return false; } @@ -224,7 +184,6 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( copy_runtime_info({fq, convert1.get_node_shared_ptr(), convert2.get_node_shared_ptr()}, new_fq); replace_node(mul, new_fq); - LOG_INFO("[ INFO ] ConvertQuantizeDequantize: transformation finished\n"); return true; }; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 694e0730dd4202..408ec3502015b4 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -391,27 +391,16 @@ void TransformationsPipeline::apply(std::shared_ptr func) { using namespace ov::pass::low_precision; auto is_model_quantized = LowPrecision::isFunctionQuantized(func, std::set{levels::int8, levels::int8_narrow_range}); enableInt8 = config.get_enable_lp_transformations() && is_model_quantized; -#define SERIALIZE_GRAPHS(name) \ - do { \ - if (std::getenv("QDQ_STRIPPING_SERIALIZE")) { \ - 
manager.register_pass(std::string("qdq_stripping_dumps/") + name + std::string(".xml"), ""); \ - manager.register_pass(std::string("qdq_stripping_dumps/") + name + std::string(".svg")); \ - } \ - } while (0) { using namespace ov::element; // QDQ stripping pipeline // 1. Transform DQ part to canonicalized form: Multiply->Add => Subtract->Multiply - SERIALIZE_GRAPHS("before"); manager.register_pass(); - SERIALIZE_GRAPHS("add_transformation"); // 2. Fuse FQ->Convert->DQ to a single FQ manager.register_pass(TypeVector{i16, u16, i32}, TypeVector{f16, f32}, true); - SERIALIZE_GRAPHS("convert_qdq"); // 3. Strip FQ layers with unsupported levels bool replace_with_clamp = true; manager.register_pass(std::set{levels::int16}, replace_with_clamp); - SERIALIZE_GRAPHS("fq_stripping"); } manager.register_pass( From 8faa050c5ebcb534cd54e6ae695c188f49b8b675 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 31 Oct 2025 19:03:09 +0900 Subject: [PATCH 13/28] Introduced REPLACE_QDQ_WITH_CLAMP env variable --- src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 408ec3502015b4..d6c6e4bdde51ff 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -399,7 +399,8 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // 2. Fuse FQ->Convert->DQ to a single FQ manager.register_pass(TypeVector{i16, u16, i32}, TypeVector{f16, f32}, true); // 3. Strip FQ layers with unsupported levels - bool replace_with_clamp = true; + bool replace_with_clamp = ov::util::getenv_bool("REPLACE_QDQ_WITH_CLAMP", true); + std::cout << "[ QDQ STRIPPING INFO ] replace_with_clamp = " << replace_with_clamp << std::endl; manager.register_pass(std::set{levels::int16}, replace_with_clamp); } From b18130686894f16448aa75868d5887da02dbf6a4 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 31 Oct 2025 21:35:02 +0900 Subject: [PATCH 14/28] Added QDQStrippingTest --- .../subgraph_tests/dynamic/qdq_stripping.cpp | 132 ++++++++++++++++++ .../tests/functional/subgraph_tests/test.cpp | 24 ---- 2 files changed, 132 insertions(+), 24 deletions(-) create mode 100644 src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp delete mode 100644 src/plugins/intel_gpu/tests/functional/subgraph_tests/test.cpp diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp new file mode 100644 index 00000000000000..8b32dc45108f70 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/node_builders/constant.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/convolution.hpp" +#include "openvino/op/fake_quantize.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/runtime/exec_model_info.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" + +namespace { +using namespace ov::test; +using ov::test::InputShape; + +using QDQStrippingParams = std::tuple; + +class 
QDQStrippingTest : public testing::WithParamInterface, virtual public ov::test::SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + const auto& [input_shape, input_precision] = obj.param; + std::ostringstream result; + result << "input_shape=" << input_shape << "_input_precision=" << input_precision; + return result.str(); + } + +protected: + std::shared_ptr init_subgraph(const ov::PartialShape& input_shape) { + ov::ParameterVector params{std::make_shared(ov::element::f32, input_shape)}; + + const float i_l = 0.f, i_h = 10.f, o_l = 0.f, o_h = 65535.f; + auto input_low = ov::op::v0::Constant::create(ov::element::f32, {}, {i_l}); + auto input_high = ov::op::v0::Constant::create(ov::element::f32, {}, {i_h}); + auto output_low = ov::op::v0::Constant::create(ov::element::f32, {}, {o_l}); + auto output_high = ov::op::v0::Constant::create(ov::element::f32, {}, {o_h}); + + auto input_fq = std::make_shared(params[0], input_low, input_high, output_low, output_high, 65536); + + auto input_convert1 = std::make_shared(input_fq, ov::element::u16); + auto input_convert2 = std::make_shared(input_convert1, ov::element::f32); + + size_t seed = 1; + auto create_qdq_branch = [&](float weight_scale_value) { + auto input_scale = ov::op::v0::Constant::create(ov::element::f32, {}, {(i_h - i_l) / (o_h - o_l)}); + auto input_dequantized = std::make_shared(input_convert2, input_scale); + + ov::test::utils::InputGenerateData gen_data; + gen_data.seed = seed++; + auto weight_quantized = ov::test::utils::make_constant(ov::element::u8, ov::Shape{32, 3, 3, 3}, gen_data); + auto weight_convert = std::make_shared(weight_quantized, ov::element::f32); + auto weight_scale = ov::test::utils::make_constant(ov::element::f32, {}, gen_data); + auto weight_dequantized = std::make_shared(weight_convert, weight_scale); + + auto conv = std::make_shared(input_dequantized, + weight_dequantized, + ov::Strides{1, 1}, + ov::CoordinateDiff{1, 1}, + ov::CoordinateDiff{1, 1}, + ov::Strides{1, 1}); + + auto bias_const = ov::test::utils::make_constant(ov::element::f32, ov::Shape{1, 32, 1, 1}, gen_data); + auto conv_biased = std::make_shared(conv, bias_const); + + const float conv_i_l = -6.244578838348389f, conv_i_h = 6.347373962402344f, conv_o_l = 0.f, conv_o_h = 65535.f; + auto conv_input_low = ov::op::v0::Constant::create(ov::element::f32, {}, {conv_i_l}); + auto conv_input_high = ov::op::v0::Constant::create(ov::element::f32, {}, {conv_i_h}); + auto conv_output_low = ov::op::v0::Constant::create(ov::element::f32, {}, {conv_o_l}); + auto conv_output_high = ov::op::v0::Constant::create(ov::element::f32, {}, {conv_o_h}); + auto fake_quantize = + std::make_shared(conv_biased, conv_input_low, conv_input_high, conv_output_low, conv_output_high, 65536); + + auto act_quantized = std::make_shared(fake_quantize, ov::element::u16); + auto act_convert = std::make_shared(act_quantized, ov::element::f32); + + auto act_zero_point = ov::op::v0::Constant::create(ov::element::u16, {}, {32500}); + auto act_zp_convert = std::make_shared(act_zero_point, ov::element::f32); + + auto act_subtract = std::make_shared(act_convert, act_zp_convert); + auto act_scale = ov::op::v0::Constant::create(ov::element::f32, {}, {(conv_i_h - conv_i_l) / (conv_o_h - conv_o_l)}); + + return std::make_shared(act_subtract, act_scale); + }; + + auto left_branch = create_qdq_branch(0.01f); + auto right_branch = create_qdq_branch(0.001f); + auto add_branches = std::make_shared(left_branch, right_branch); + + auto model = 
std::make_shared(ov::OutputVector{add_branches}, params, "QDQStripping"); + return model; + } + + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_GPU; + const auto& [input_shape, input_precision] = GetParam(); + init_input_shapes({input_shape}); + inType = outType = input_precision; + + if (input_precision == ov::element::f16) { + abs_threshold = 1.0f; + } else { + abs_threshold = 1e-4f; + } + function = init_subgraph(input_shape.first); + } + + void validate() override { + ov::test::SubgraphBaseTest::validate(); + auto runtime_model = compiledModel.get_runtime_model(); + ASSERT_TRUE(runtime_model != nullptr) << "Runtime model should not be null"; + for (const auto& op : runtime_model->get_ordered_ops()) { + auto layer_type = op->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); + ASSERT_NE(layer_type, "Quantize") << "FakeQuantize node is not expected in the runtime model after QDQ stripping."; + } + } +}; + +TEST_P(QDQStrippingTest, Inference) { + run(); +} + +const std::vector input_shapes = {{{-1, -1, -1, -1}, {{1, 3, 128, 128}}}}; +const std::vector input_precisions = {ov::element::f32}; +INSTANTIATE_TEST_SUITE_P(smoke_QDQStripping, + QDQStrippingTest, + ::testing::Combine(::testing::ValuesIn(input_shapes), ::testing::ValuesIn(input_precisions)), + QDQStrippingTest::getTestCaseName); +} // namespace \ No newline at end of file diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/test.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/test.cpp deleted file mode 100644 index 2778043489ff90..00000000000000 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/test.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (C) 2018-2025 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "shared_test_classes/base/ov_subgraph.hpp" -#include "openvino/core/core.hpp" - -namespace { - -class QDQStrippingTest : virtual public ov::test::SubgraphBaseStaticTest { -public: - -protected: - void SetUp() override { - targetDevice = ov::test::utils::DEVICE_GPU; - function = core->read_model("/home/guest/golubevv/openvino/bin/intel64/RelWithDebInfo/1_Conv.onnx"); - } -}; - -TEST_F(QDQStrippingTest, Inference) { - run(); -} - -} // namespace From 0f7aa0a1280afbf40cb3617d63fb37417ab20e59 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Mon, 3 Nov 2025 18:41:16 +0900 Subject: [PATCH 15/28] Warning fixed --- .../common_optimizations/convert_quantize_dequantize.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index 36beef4a769ae6..9b8218bd57e08b 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -138,13 +138,6 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( {ov::element::i16, {-32768.f, 32767.f}}, {ov::element::u16, {0.f, 65535.f}}}; const auto& type = convert1.get_element_type(); - - if (pattern_map.count(zero_point_pattern)) { - const auto& shift = pattern_map.at(zero_point_pattern); - const auto& subtract = pattern_map.at(sub_pattern); - } else { - } - if (supported_intervals.count(type) == 0 || supported_intervals.at(type) != std::make_pair(out_low_val, out_high_val)) { return false; From cbc6318f927ae5175274529cf3003afabf7d5a5d Mon Sep 17 
00:00:00 2001
From: Vladislav Golubev 
Date: Mon, 3 Nov 2025 18:50:21 +0900
Subject: [PATCH 16/28] Compilation error fix

---
 .../convert_quantize_dequantize.cpp           | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp
index 9b8218bd57e08b..1d9dc9f540d125 100644
--- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp
+++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp
@@ -79,17 +79,18 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize(
     auto output_high_pattern = wrap_type<ov::op::v0::Constant>();
     auto fq_pattern = wrap_type<ov::op::v0::FakeQuantize>(
         {data_pattern, input_low_pattern, input_high_pattern, output_low_pattern, output_high_pattern});
-    op::Predicate convert1_predicate = ignore_consumers_count_check
-                                           ? type_matches_any(supported_low_precisions)
-                                           : type_matches_any(supported_low_precisions) && consumers_count(1);
+    ov::pass::pattern::op::Predicate convert1_predicate =
+        ignore_consumers_count_check ? type_matches_any(supported_low_precisions)
+                                     : type_matches_any(supported_low_precisions) && consumers_count(1);
     auto convert1_pattern = wrap_type<ov::op::v0::Convert>({fq_pattern}, convert1_predicate);
-    op::Predicate convert2_predicate = ignore_consumers_count_check
-                                           ? type_matches_any(supported_original_precisions)
-                                           : type_matches_any(supported_original_precisions) && consumers_count(1);
+    ov::pass::pattern::op::Predicate convert2_predicate =
+        ignore_consumers_count_check ? type_matches_any(supported_original_precisions)
+                                     : type_matches_any(supported_original_precisions) && consumers_count(1);
     auto convert2_pattern = wrap_type<ov::op::v0::Convert>({convert1_pattern}, convert2_predicate);
     auto zero_point_pattern = any_input();
-    op::Predicate sub_predicate = ignore_consumers_count_check ? op::Predicate() : consumers_count(1);
+    ov::pass::pattern::op::Predicate sub_predicate =
+        ignore_consumers_count_check ? ov::pass::pattern::op::Predicate() : consumers_count(1);
     auto sub_pattern = optional<ov::op::v1::Subtract>({convert2_pattern, zero_point_pattern}, sub_predicate);
     auto scale_pattern = any_input();
     auto mul_pattern = wrap_type<ov::op::v1::Multiply>({sub_pattern, scale_pattern});

From 0e403c23a4eb9f1697ded0f654120a330b9ddf61 Mon Sep 17 00:00:00 2001
From: Vladislav Golubev 
Date: Mon, 3 Nov 2025 22:14:30 +0900
Subject: [PATCH 17/28] Transformation pipeline minor corrections

---
 src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index d6c6e4bdde51ff..e6740e8478656c 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -397,11 +397,12 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
             // 1. Transform DQ part to canonicalized form: Multiply->Add => Subtract->Multiply
             manager.register_pass();
             // 2. Fuse FQ->Convert->DQ to a single FQ
-            manager.register_pass<ov::pass::ConvertQuantizeDequantize>(TypeVector{i16, u16, i32}, TypeVector{f16, f32}, true);
+            manager.register_pass<ov::pass::ConvertQuantizeDequantize>(TypeVector{i16, u16}, TypeVector{f32}, true);
            // 3. 
Strip FQ layers with unsupported levels bool replace_with_clamp = ov::util::getenv_bool("REPLACE_QDQ_WITH_CLAMP", true); std::cout << "[ QDQ STRIPPING INFO ] replace_with_clamp = " << replace_with_clamp << std::endl; manager.register_pass(std::set{levels::int16}, replace_with_clamp); + manager.register_pass(); } manager.register_pass( From 521174043559999f849c60f8b6c4030c9a965fd5 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Mon, 3 Nov 2025 23:08:55 +0900 Subject: [PATCH 18/28] QDQStrippingTest extending --- .../subgraph_tests/dynamic/qdq_stripping.cpp | 104 +++++++++++------- 1 file changed, 62 insertions(+), 42 deletions(-) diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp index 8b32dc45108f70..382dcc7cbdd8bf 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp @@ -18,40 +18,67 @@ namespace { using namespace ov::test; using ov::test::InputShape; -using QDQStrippingParams = std::tuple; +using QDQStrippingParams = std::tuple; + +class QuantizationParams { +public: + ov::Output build_fq(const ov::Output& input) const { + auto input_low = ov::op::v0::Constant::create(ov::element::f32, {}, {i_l}); + auto input_high = ov::op::v0::Constant::create(ov::element::f32, {}, {i_h}); + auto output_low = ov::op::v0::Constant::create(ov::element::f32, {}, {o_l}); + auto output_high = ov::op::v0::Constant::create(ov::element::f32, {}, {o_h}); + return std::make_shared(input, input_low, input_high, output_low, output_high, 65536); + } + + ov::Output build_dq(const ov::Output& input, const ov::element::Type& quantization_precision) const { + auto act_zero_point = ov::op::v0::Constant::create(quantization_precision, {}, {zero_point}); + auto act_zp_convert = std::make_shared(act_zero_point, ov::element::f32); + + auto act_subtract = std::make_shared(input, act_zp_convert); + auto act_scale = ov::op::v0::Constant::create(ov::element::f32, {}, {(i_h - i_l) / (o_h - o_l)}); + + return std::make_shared(act_subtract, act_scale); + } + + float i_l; + float i_h; + float o_l; + float o_h; + int zero_point; +}; class QDQStrippingTest : public testing::WithParamInterface, virtual public ov::test::SubgraphBaseTest { public: static std::string getTestCaseName(const testing::TestParamInfo& obj) { - const auto& [input_shape, input_precision] = obj.param; + const auto& [input_shape, input_precision, quantization_precision] = obj.param; std::ostringstream result; - result << "input_shape=" << input_shape << "_input_precision=" << input_precision; + result << "input_shape=" << input_shape << "_input_precision=" << input_precision << "_quantization_precision=" << quantization_precision; return result.str(); } protected: - std::shared_ptr init_subgraph(const ov::PartialShape& input_shape) { + std::shared_ptr init_subgraph(const ov::PartialShape& input_shape, const ov::element::Type& quantization_precision) { ov::ParameterVector params{std::make_shared(ov::element::f32, input_shape)}; + // Note: these params are taken from the real cases + const static std::unordered_map> quantization_params{ + {ov::element::Type_t::u16, {{0.f, 10.f, 0.f, 65535.f, 0}, {-6.244578838348389f, 6.347373962402344f, 0.f, 65535.f, 32500}}}, + {ov::element::Type_t::i16, + {{-5.000076293945312f, 4.999923706054688f, -32768.f, 32767.f, 0}, {-6.296072483062744f, 6.295880317687988f, -32768.f, 32767.f, 
0}}}, + }; - const float i_l = 0.f, i_h = 10.f, o_l = 0.f, o_h = 65535.f; - auto input_low = ov::op::v0::Constant::create(ov::element::f32, {}, {i_l}); - auto input_high = ov::op::v0::Constant::create(ov::element::f32, {}, {i_h}); - auto output_low = ov::op::v0::Constant::create(ov::element::f32, {}, {o_l}); - auto output_high = ov::op::v0::Constant::create(ov::element::f32, {}, {o_h}); - - auto input_fq = std::make_shared(params[0], input_low, input_high, output_low, output_high, 65536); + const auto& q_params = quantization_params.at(quantization_precision); + const auto& qp_1 = q_params.first; + auto input_fq = qp_1.build_fq(params[0]); - auto input_convert1 = std::make_shared(input_fq, ov::element::u16); + auto input_convert1 = std::make_shared(input_fq, quantization_precision); auto input_convert2 = std::make_shared(input_convert1, ov::element::f32); size_t seed = 1; auto create_qdq_branch = [&](float weight_scale_value) { - auto input_scale = ov::op::v0::Constant::create(ov::element::f32, {}, {(i_h - i_l) / (o_h - o_l)}); - auto input_dequantized = std::make_shared(input_convert2, input_scale); - + auto input_dequantized = qp_1.build_dq(input_convert2, quantization_precision); ov::test::utils::InputGenerateData gen_data; gen_data.seed = seed++; - auto weight_quantized = ov::test::utils::make_constant(ov::element::u8, ov::Shape{32, 3, 3, 3}, gen_data); + auto weight_quantized = ov::test::utils::make_constant(ov::element::i8, ov::Shape{32, 3, 3, 3}, gen_data); auto weight_convert = std::make_shared(weight_quantized, ov::element::f32); auto weight_scale = ov::test::utils::make_constant(ov::element::f32, {}, gen_data); auto weight_dequantized = std::make_shared(weight_convert, weight_scale); @@ -66,24 +93,11 @@ class QDQStrippingTest : public testing::WithParamInterface, auto bias_const = ov::test::utils::make_constant(ov::element::f32, ov::Shape{1, 32, 1, 1}, gen_data); auto conv_biased = std::make_shared(conv, bias_const); - const float conv_i_l = -6.244578838348389f, conv_i_h = 6.347373962402344f, conv_o_l = 0.f, conv_o_h = 65535.f; - auto conv_input_low = ov::op::v0::Constant::create(ov::element::f32, {}, {conv_i_l}); - auto conv_input_high = ov::op::v0::Constant::create(ov::element::f32, {}, {conv_i_h}); - auto conv_output_low = ov::op::v0::Constant::create(ov::element::f32, {}, {conv_o_l}); - auto conv_output_high = ov::op::v0::Constant::create(ov::element::f32, {}, {conv_o_h}); - auto fake_quantize = - std::make_shared(conv_biased, conv_input_low, conv_input_high, conv_output_low, conv_output_high, 65536); - - auto act_quantized = std::make_shared(fake_quantize, ov::element::u16); + const auto& qp_2 = q_params.second; + auto fake_quantize = qp_2.build_fq(conv_biased); + auto act_quantized = std::make_shared(fake_quantize, quantization_precision); auto act_convert = std::make_shared(act_quantized, ov::element::f32); - - auto act_zero_point = ov::op::v0::Constant::create(ov::element::u16, {}, {32500}); - auto act_zp_convert = std::make_shared(act_zero_point, ov::element::f32); - - auto act_subtract = std::make_shared(act_convert, act_zp_convert); - auto act_scale = ov::op::v0::Constant::create(ov::element::f32, {}, {(conv_i_h - conv_i_l) / (conv_o_h - conv_o_l)}); - - return std::make_shared(act_subtract, act_scale); + return qp_2.build_dq(act_convert, quantization_precision); }; auto left_branch = create_qdq_branch(0.01f); @@ -96,26 +110,28 @@ class QDQStrippingTest : public testing::WithParamInterface, void SetUp() override { targetDevice = ov::test::utils::DEVICE_GPU; - const 
auto& [input_shape, input_precision] = GetParam(); + const auto& [input_shape, input_precision, quantization_precision] = GetParam(); init_input_shapes({input_shape}); inType = outType = input_precision; - if (input_precision == ov::element::f16) { - abs_threshold = 1.0f; - } else { - abs_threshold = 1e-4f; - } - function = init_subgraph(input_shape.first); + // Since the FQ are not executed in a strictly 'fair' manner, and just replaced with clamp ops, a small deviation in accuracy is expected. + abs_threshold = 1.f; + function = init_subgraph(input_shape.first, quantization_precision); } void validate() override { ov::test::SubgraphBaseTest::validate(); auto runtime_model = compiledModel.get_runtime_model(); ASSERT_TRUE(runtime_model != nullptr) << "Runtime model should not be null"; + size_t quantize_count = 0; for (const auto& op : runtime_model->get_ordered_ops()) { auto layer_type = op->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as(); - ASSERT_NE(layer_type, "Quantize") << "FakeQuantize node is not expected in the runtime model after QDQ stripping."; + if (layer_type == std::string("Quantize")) { + quantize_count++; + } } + const size_t expected_quantize_count = 0; + ASSERT_EQ(quantize_count, expected_quantize_count) << "Unexpected Quantize node count."; } }; @@ -125,8 +141,12 @@ TEST_P(QDQStrippingTest, Inference) { const std::vector input_shapes = {{{-1, -1, -1, -1}, {{1, 3, 128, 128}}}}; const std::vector input_precisions = {ov::element::f32}; +const std::vector quantization_precisions = {ov::element::u16, ov::element::i16}; + INSTANTIATE_TEST_SUITE_P(smoke_QDQStripping, QDQStrippingTest, - ::testing::Combine(::testing::ValuesIn(input_shapes), ::testing::ValuesIn(input_precisions)), + ::testing::Combine(::testing::ValuesIn(input_shapes), + ::testing::ValuesIn(input_precisions), + ::testing::ValuesIn(quantization_precisions)), QDQStrippingTest::getTestCaseName); } // namespace \ No newline at end of file From 0bb987099ed7f71b5de4b51121706ac2fa782472 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 4 Nov 2025 00:41:39 +0900 Subject: [PATCH 19/28] Added clarification comment to ConvertQuantizeDequantize --- .../common_optimizations/convert_quantize_dequantize.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index 1d9dc9f540d125..cc00e6ed1907c9 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -139,6 +139,7 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( {ov::element::i16, {-32768.f, 32767.f}}, {ov::element::u16, {0.f, 65535.f}}}; const auto& type = convert1.get_element_type(); + // check if (out_low_val, out_high_val) pair is mapped on the expected precision ranges if (supported_intervals.count(type) == 0 || supported_intervals.at(type) != std::make_pair(out_low_val, out_high_val)) { return false; From 6a6d9b8af7de2b50c40b4d23643d01f1b1ea7bbd Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 4 Nov 2025 01:37:31 +0900 Subject: [PATCH 20/28] Corrected LPT tests instances --- .../low_precision_transformations/fq_transformation.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp index a7ee8525d0eadc..0535751f88f7a8 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp @@ -44,13 +44,14 @@ const std::vector fakeQuantizeOnDataValues = { { 256ul, {}, { -127.5f }, { 0.f }, { -127.5f }, { 0.f } }, "Pooling", "u8" }, + // Not expected FQ levels { { 16ul, {}, { 0.f }, { 1.5f }, { 0.f }, { 1.5f } }, - "Pooling", "u8" + "Pooling", "f32" }, { { 16ul, {}, { -8.f }, { 7.f }, { -0.8f }, { 0.7f } }, - "Pooling", "i8" + "Pooling", "f32" }, // nGraph: I8->FP32 Convert is not supported // { 256ul, {}, { -1.28f} , { 1.27f }, { -1.28f} , { 1.27f } }, From 5032aeab87db97a075b81bf3201a9faf10c59866 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 4 Nov 2025 19:39:51 +0900 Subject: [PATCH 21/28] codestyle --- .../tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp index 382dcc7cbdd8bf..d1064e664ddf94 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp @@ -60,7 +60,7 @@ class QDQStrippingTest : public testing::WithParamInterface, std::shared_ptr init_subgraph(const ov::PartialShape& input_shape, const ov::element::Type& quantization_precision) { ov::ParameterVector params{std::make_shared(ov::element::f32, input_shape)}; // Note: these params are taken from the real cases - const static std::unordered_map> quantization_params{ + static const std::unordered_map> quantization_params{ {ov::element::Type_t::u16, {{0.f, 10.f, 0.f, 65535.f, 0}, {-6.244578838348389f, 6.347373962402344f, 0.f, 65535.f, 32500}}}, {ov::element::Type_t::i16, {{-5.000076293945312f, 4.999923706054688f, -32768.f, 32767.f, 0}, {-6.296072483062744f, 6.295880317687988f, -32768.f, 32767.f, 0}}}, From 68a9e3bb80dcab76877c8be1c198b9e83169ed9f Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 5 Nov 2025 01:18:12 +0900 Subject: [PATCH 22/28] [GPU] Keep old behavior for non-i16 models --- .../intel_gpu/src/plugin/transformations_pipeline.cpp | 2 +- .../low_precision_transformations/fq_transformation.cpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index e6740e8478656c..644bd699c67bbf 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -389,7 +389,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { } using namespace ov::pass::low_precision; - auto is_model_quantized = LowPrecision::isFunctionQuantized(func, std::set{levels::int8, levels::int8_narrow_range}); + auto is_model_quantized = LowPrecision::isFunctionQuantized(func); enableInt8 = config.get_enable_lp_transformations() && is_model_quantized; { using namespace ov::element; diff --git 
a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp index 0535751f88f7a8..a7ee8525d0eadc 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/low_precision_transformations/fq_transformation.cpp @@ -44,14 +44,13 @@ const std::vector fakeQuantizeOnDataValues = { { 256ul, {}, { -127.5f }, { 0.f }, { -127.5f }, { 0.f } }, "Pooling", "u8" }, - // Not expected FQ levels { { 16ul, {}, { 0.f }, { 1.5f }, { 0.f }, { 1.5f } }, - "Pooling", "f32" + "Pooling", "u8" }, { { 16ul, {}, { -8.f }, { 7.f }, { -0.8f }, { 0.7f } }, - "Pooling", "f32" + "Pooling", "i8" }, // nGraph: I8->FP32 Convert is not supported // { 256ul, {}, { -1.28f} , { 1.27f }, { -1.28f} , { 1.27f } }, From 060e17dd0d3604bcd8795d2d24bbcd79b72b5a39 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 5 Nov 2025 01:58:53 +0900 Subject: [PATCH 23/28] Review comments applied --- .../src/qdq_stripping.cpp | 4 ++-- .../convert_quantize_dequantize.cpp | 21 +++++++++++++------ .../subgraph_tests/dynamic/qdq_stripping.cpp | 2 ++ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/common/low_precision_transformations/src/qdq_stripping.cpp b/src/common/low_precision_transformations/src/qdq_stripping.cpp index b0422577ad1c31..72434decce0473 100644 --- a/src/common/low_precision_transformations/src/qdq_stripping.cpp +++ b/src/common/low_precision_transformations/src/qdq_stripping.cpp @@ -58,12 +58,12 @@ FQStrippingTransformation::FQStrippingTransformation(const std::set& lev return false; } auto constants_are_equal = [](const std::shared_ptr& lhs, - const std::shared_ptr& rhs) { + const std::shared_ptr& rhs) -> bool { auto equal = ov::as_type_ptr(ov::op::util::make_try_fold(lhs, rhs)); OPENVINO_ASSERT(equal && ov::shape_size(equal->get_shape()) == 1, "constants_are_equal expects scalar constant as a comparison result"); - return equal->get_vector()[0] == true; + return equal->get_vector()[0]; }; if (!constants_are_equal(input_low, output_low) || !constants_are_equal(input_high, output_high)) { diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index cc00e6ed1907c9..42e2d4d1f22553 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -133,15 +133,24 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( if (!ov::op::util::get_single_value(output_high, out_high_val)) return false; +#define PRECISION_LIMITS_FOR(type) \ + {ov::element::type}, { \ + static_cast(std::numeric_limits>::min()), \ + static_cast(std::numeric_limits>::max()) \ + } + static const std::unordered_map> supported_intervals{ - {ov::element::i8, {-128.f, 127.f}}, - {ov::element::u8, {0.f, 255.f}}, - {ov::element::i16, {-32768.f, 32767.f}}, - {ov::element::u16, {0.f, 65535.f}}}; + {PRECISION_LIMITS_FOR(i8)}, + {PRECISION_LIMITS_FOR(u8)}, + {PRECISION_LIMITS_FOR(i16)}, + {PRECISION_LIMITS_FOR(u16)}}; +#undef TYPE_INTERVAL_PAIR + const auto& type = convert1.get_element_type(); // check if (out_low_val, 
out_high_val) pair is mapped on the expected precision ranges - if (supported_intervals.count(type) == 0 || - supported_intervals.at(type) != std::make_pair(out_low_val, out_high_val)) { + auto interval_it = supported_intervals.find(type); + if (interval_it == supported_intervals.end() || + interval_it->second != std::make_pair(out_low_val, out_high_val)) { return false; } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp index d1064e664ddf94..784cc29d312616 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp @@ -58,6 +58,8 @@ class QDQStrippingTest : public testing::WithParamInterface, protected: std::shared_ptr init_subgraph(const ov::PartialShape& input_shape, const ov::element::Type& quantization_precision) { + OPENVINO_ASSERT(quantization_precision == ov::element::i16 || quantization_precision == ov::element::u16, + "Only i16 and u16 quantization precisions are supported in the test"); ov::ParameterVector params{std::make_shared(ov::element::f32, input_shape)}; // Note: these params are taken from the real cases static const std::unordered_map> quantization_params{ From 4736a76d03b65d23e5a173e2257e8dd154af8028 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 5 Nov 2025 03:03:21 +0900 Subject: [PATCH 24/28] Test data generation is adapted to the case when clamp is not inserted --- .../subgraph_tests/dynamic/qdq_stripping.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp index 784cc29d312616..5e204edd08303d 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/qdq_stripping.cpp @@ -78,11 +78,11 @@ class QDQStrippingTest : public testing::WithParamInterface, size_t seed = 1; auto create_qdq_branch = [&](float weight_scale_value) { auto input_dequantized = qp_1.build_dq(input_convert2, quantization_precision); - ov::test::utils::InputGenerateData gen_data; - gen_data.seed = seed++; - auto weight_quantized = ov::test::utils::make_constant(ov::element::i8, ov::Shape{32, 3, 3, 3}, gen_data); + ov::test::utils::InputGenerateData weights_gen_data; + weights_gen_data.seed = seed; + auto weight_quantized = ov::test::utils::make_constant(ov::element::i8, ov::Shape{32, 3, 3, 3}, weights_gen_data); auto weight_convert = std::make_shared(weight_quantized, ov::element::f32); - auto weight_scale = ov::test::utils::make_constant(ov::element::f32, {}, gen_data); + auto weight_scale = ov::test::utils::make_constant(ov::element::f32, {}, std::vector{weight_scale_value}); auto weight_dequantized = std::make_shared(weight_convert, weight_scale); auto conv = std::make_shared(input_dequantized, @@ -92,7 +92,8 @@ class QDQStrippingTest : public testing::WithParamInterface, ov::CoordinateDiff{1, 1}, ov::Strides{1, 1}); - auto bias_const = ov::test::utils::make_constant(ov::element::f32, ov::Shape{1, 32, 1, 1}, gen_data); + ov::test::utils::InputGenerateData bias_gen_data(-2.0, 4, 100, seed++); + auto bias_const = ov::test::utils::make_constant(ov::element::f32, ov::Shape{1, 32, 1, 1}, bias_gen_data); auto conv_biased = std::make_shared(conv, 
bias_const); const auto& qp_2 = q_params.second; @@ -102,8 +103,8 @@ class QDQStrippingTest : public testing::WithParamInterface, return qp_2.build_dq(act_convert, quantization_precision); }; - auto left_branch = create_qdq_branch(0.01f); - auto right_branch = create_qdq_branch(0.001f); + auto left_branch = create_qdq_branch(1e-3f); + auto right_branch = create_qdq_branch(1e-4f); auto add_branches = std::make_shared(left_branch, right_branch); auto model = std::make_shared(ov::OutputVector{add_branches}, params, "QDQStripping"); @@ -116,8 +117,8 @@ class QDQStrippingTest : public testing::WithParamInterface, init_input_shapes({input_shape}); inType = outType = input_precision; - // Since the FQ are not executed in a strictly 'fair' manner, and just replaced with clamp ops, a small deviation in accuracy is expected. - abs_threshold = 1.f; + // Since the FQ are not executed in a strictly 'fair' manner, and just replaced with clamp ops, a small accuracy deviation is expected. + abs_threshold = 1e-3f; function = init_subgraph(input_shape.first, quantization_precision); } From 5b25b47326e5d37bf52afe6133a904813d7aea54 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 5 Nov 2025 03:04:22 +0900 Subject: [PATCH 25/28] Fixed undef --- .../common_optimizations/convert_quantize_dequantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp index 42e2d4d1f22553..e6ea1e477a71b6 100644 --- a/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/convert_quantize_dequantize.cpp @@ -144,7 +144,7 @@ ov::pass::ConvertQuantizeDequantize::ConvertQuantizeDequantize( {PRECISION_LIMITS_FOR(u8)}, {PRECISION_LIMITS_FOR(i16)}, {PRECISION_LIMITS_FOR(u16)}}; -#undef TYPE_INTERVAL_PAIR +#undef PRECISION_LIMITS_FOR const auto& type = convert1.get_element_type(); // check if (out_low_val, out_high_val) pair is mapped on the expected precision ranges From fd48bf4d0d9b5acc35d2ab88d36a49cb79584602 Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Wed, 5 Nov 2025 11:09:57 +0100 Subject: [PATCH 26/28] Remove unnecessary log message --- src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 644bd699c67bbf..ab36b09252d5d9 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -400,7 +400,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(TypeVector{i16, u16}, TypeVector{f32}, true); // 3. 
Strip FQ layers with unsupported levels
             bool replace_with_clamp = ov::util::getenv_bool("REPLACE_QDQ_WITH_CLAMP", true);
-            std::cout << "[ QDQ STRIPPING INFO ] replace_with_clamp = " << replace_with_clamp << std::endl;
             manager.register_pass<FQStrippingTransformation>(std::set{levels::int16}, replace_with_clamp);
             manager.register_pass();
         }

From 247122748a940c5c42aa44df19503aa22aa30b8b Mon Sep 17 00:00:00 2001
From: Vladislav Golubev 
Date: Wed, 5 Nov 2025 21:11:41 +0900
Subject: [PATCH 27/28] Isolate QDQ stripping changes to make sure that the
 feature doesn't affect non-i16 models

---
 .../src/plugin/transformations_pipeline.cpp   | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index ab36b09252d5d9..74fb7e61831a21 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -391,16 +391,14 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         using namespace ov::pass::low_precision;
         auto is_model_quantized = LowPrecision::isFunctionQuantized(func);
         enableInt8 = config.get_enable_lp_transformations() && is_model_quantized;
-        {
+        const auto enableQDQStripping = LowPrecision::isFunctionQuantized(func, std::set{levels::int16});
+        if (enableQDQStripping) {
             using namespace ov::element;
             // QDQ stripping pipeline
-            // 1. Transform DQ part to canonicalized form: Multiply->Add => Subtract->Multiply
-            manager.register_pass();
-            // 2. Fuse FQ->Convert->DQ to a single FQ
+            // 1. Fuse FQ->Convert->DQ to a single FQ
             manager.register_pass<ov::pass::ConvertQuantizeDequantize>(TypeVector{i16, u16}, TypeVector{f32}, true);
-            // 3. Strip FQ layers with unsupported levels
-            bool replace_with_clamp = ov::util::getenv_bool("REPLACE_QDQ_WITH_CLAMP", true);
-            manager.register_pass<FQStrippingTransformation>(std::set{levels::int16}, replace_with_clamp);
+            // 2. Strip FQ layers with unsupported levels
+            manager.register_pass<FQStrippingTransformation>(std::set{levels::int16}, false);
             manager.register_pass();
         }
 

From fe97fc45bcf3a676a6d3642b4e28d9f88e57fc83 Mon Sep 17 00:00:00 2001
From: Vladislav Golubev 
Date: Tue, 11 Nov 2025 09:28:26 +0900
Subject: [PATCH 28/28] Try to avoid unnecessary LPT passes running if QDQ
 stripping has been finished successfully

---
 .../src/plugin/transformations_pipeline.cpp   | 32 +++++++++++--------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 74fb7e61831a21..d672714820756b 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -348,6 +348,24 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
     bool enableInt8;
     ov::element::Type infer_precision = ov::element::dynamic;
     bool unroll_loop = config.get_enable_loop_unrolling();
+    auto is_model_quantized = ov::pass::low_precision::LowPrecision::isFunctionQuantized(func);
+    {
+        using namespace ov::pass::low_precision;
+        const auto enableQDQStripping = LowPrecision::isFunctionQuantized(func, std::set{levels::int16});
+        if (enableQDQStripping) {
+            ov::pass::Manager qdq_stripping_manager("Plugin:GPU:QDQ_Stripping");
+            using namespace ov::element;
+            // QDQ stripping pipeline
+            // 1. Fuse FQ->Convert->DQ to a single FQ
+            qdq_stripping_manager.register_pass<ov::pass::ConvertQuantizeDequantize>(TypeVector{i16, u16}, TypeVector{f32}, true);
+            // 2. 
Strip FQ layers with unsupported levels
+            qdq_stripping_manager.register_pass<FQStrippingTransformation>(std::set{levels::int16}, false);
+            qdq_stripping_manager.run_passes(func);
+            is_model_quantized = LowPrecision::isFunctionQuantized(func);
+        }
+    }
+    enableInt8 = config.get_enable_lp_transformations() && is_model_quantized;
+
     {
         ov::pass::Manager manager("Plugin:GPU");
         auto pass_config = manager.get_pass_config();
@@ -388,20 +406,6 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
             ov::disable_keep_const_precision(node);
         }
 
-        using namespace ov::pass::low_precision;
-        auto is_model_quantized = LowPrecision::isFunctionQuantized(func);
-        enableInt8 = config.get_enable_lp_transformations() && is_model_quantized;
-        const auto enableQDQStripping = LowPrecision::isFunctionQuantized(func, std::set{levels::int16});
-        if (enableQDQStripping) {
-            using namespace ov::element;
-            // QDQ stripping pipeline
-            // 1. Fuse FQ->Convert->DQ to a single FQ
-            manager.register_pass<ov::pass::ConvertQuantizeDequantize>(TypeVector{i16, u16}, TypeVector{f32}, true);
-            // 2. Strip FQ layers with unsupported levels
-            manager.register_pass<FQStrippingTransformation>(std::set{levels::int16}, false);
-            manager.register_pass();
-        }
-
         manager.register_pass(
             std::vector{ ov::element::i8, ov::element::u8, ov::element::i4, ov::element::u4 },
             !device_info.supports_immad);
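
After patch 28, the 16-bit QDQ stripping runs in its own pass manager before the main "Plugin:GPU" pipeline is built, so the LPT quantization check afterwards already sees the stripped model. The sketch below restates that wiring as a standalone helper, which can be handy when reproducing the behaviour outside the plugin. It is only a sketch: the helper name, the low_precision header paths, and the element type of the std::set arguments are assumptions, not taken from the patches; the registered passes and their arguments mirror the hunks above.

// Illustrative sketch (assumed helper name and include locations).
#include <memory>
#include <set>

#include "low_precision/low_precision.hpp"   // assumed location of LowPrecision and the levels enum
#include "low_precision/qdq_stripping.hpp"   // assumed location of FQStrippingTransformation
#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/common_optimizations/convert_quantize_dequantize.hpp"

// Strips 65536-level (16-bit) QDQ chains from a model, mirroring the GPU plugin's
// pre-pass: fuse FQ->Convert->(Subtract)->Multiply back into a single FakeQuantize,
// then strip the resulting 16-bit FakeQuantize ops.
inline bool strip_16bit_qdq(const std::shared_ptr<ov::Model>& model) {
    using namespace ov::pass::low_precision;

    // Only models that actually carry 16-bit FakeQuantize ops are touched.
    if (!LowPrecision::isFunctionQuantized(model, std::set<levels>{levels::int16}))
        return false;

    ov::pass::Manager manager("QDQStrippingSketch");
    // Fuse the QDQ chain into one FQ; the third argument relaxes the
    // single-consumer checks, as in the plugin pipeline.
    manager.register_pass<ov::pass::ConvertQuantizeDequantize>(
        ov::element::TypeVector{ov::element::i16, ov::element::u16},
        ov::element::TypeVector{ov::element::f32},
        true);
    // Strip the fused 16-bit FQs; 'false' strips them instead of replacing
    // them with Clamp, matching the final pipeline state.
    manager.register_pass<FQStrippingTransformation>(std::set<levels>{levels::int16}, false);
    manager.run_passes(model);
    return true;
}

Running the stripping in a dedicated manager before the main pipeline is what lets patch 28 recompute is_model_quantized on the already-stripped model and skip the LPT passes when nothing quantized remains.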