
Commit c5b1dd2

add the additional config to the doc
Signed-off-by: ganyi <[email protected]>
1 parent e7d8a01 commit c5b1dd2

File tree

5 files changed: +79 −32 lines changed

docs/source/user_guide/configuration/additional_config.md

Lines changed: 12 additions & 0 deletions
@@ -27,6 +27,7 @@ The following table lists the additional configuration options available in vLLM
 | Name | Type | Default | Description |
 |-------------------------------| ---- |------|-----------------------------------------------------------------------------------------------|
 | `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode |
+| `ascend_compilation_config` | dict | `{}` | The config options for torch.compile |
 | `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
 | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case. |
 | `expert_map_path` | str | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
@@ -49,6 +50,13 @@ The details of each config option are as follows:
 | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
 | `enable_kv_nz`| bool | `False` | Whether to enable kvcache NZ layout. This option only takes effects on models using MLA (e.g., DeepSeek). |
 
+**ascend_compilation_config**
+
+| Name | Type | Default | Description |
+| ---- | ---- | ------- | ----------- |
+| `enable_graph_rewriter` | bool | `True` | Whether to enable the graph rewriter to rewrite the fx graph generated by torch.compile |
+| `enable_quantization_fusion` | bool | `True` | Whether to enable the fusion pass on op + quantize; this should stay enabled by default so all users benefit from the performance boost |
+
 **ascend_scheduler_config**
 
 | Name | Type | Default | Description |
@@ -71,6 +79,10 @@ An example of additional configuration is as follows:
         "enable_multistream_moe": False,
         "enable_kv_nz": False
     },
+    "ascend_compilation_config": {
+        "enable_graph_rewriter": True,
+        "enable_quantization_fusion": True
+    },
     "ascend_scheduler_config": {
         "enabled": True,
         "enable_chunked_prefill": True,

tests/e2e/singlecard/test_graph_rewriter.py

Lines changed: 30 additions & 26 deletions
@@ -21,22 +21,24 @@
 import torch_npu
 import random
 import copy
+from vllm.config import VllmConfig
+from vllm_ascend.compilation.quant_fusion_pass import AscendQuantFusionPass
+from vllm_ascend.compilation.graph_rewrite_pass_manager import GraphRewritePassManager
 from vllm_ascend.quantization.w8a8 import quant_per_tensor
+
 
 class ModelWithRMSNormQuant(nn.Module):
     def __init__(self, hidden_size, eps=1e-6, quant_config=None, prefix=""):
         super().__init__()
         self.hidden_size = hidden_size
         self.eps = eps
         self.quant_config = quant_config
         self.prefix = prefix
-        self.former_linear = nn.Linear(hidden_size, hidden_size)  # float
-        self.post_linear = nn.Linear(hidden_size, hidden_size, dtype=torch.int8)  # quantized
-        self.deq_scale = 0.7
+        self.former_linear = nn.Linear(hidden_size, hidden_size)
         self.weight = nn.Parameter(torch.Tensor(hidden_size))
         self.bias = nn.Parameter(torch.Tensor(hidden_size))
-        self.quant_scale = 0.83
-        self.quant_offset = 3
+        self.quant_scale = nn.Parameter(torch.Tensor(hidden_size))
+        self.quant_offset = nn.Parameter(torch.Tensor(hidden_size))
 
     def forward(self, x):
         hidden_states = self.former_linear(x)
@@ -45,35 +47,37 @@ def forward(self, x):
         return quantized_output, residual
 
 
-def custom_graph_rewriter_backend(gm: torch.fx.GraphModule, example_inputs):
-    from torch.fx.subgraph_rewriter import replace_pattern
-    print("before fusion graph:", gm.graph)
-    def pattern(npu_quant_matmul, output_parallel, rms_norm_weight, scale, offset):
-        output = torch.ops.npu_add_rms_norm(npu_quant_matmul, output_parallel, rms_norm_weight, 1e-6)
-        out0 = output[0]
-        out1 = output[2]
-        new_out = torch.ops.npu.npu_quantize(out0, scale, offset, torch.qint8, -1, False)
-        return new_out, out1
-
-    def replace(npu_quant_matmul, output_parallel, rms_norm_weight, scale, offset):
-        output = torch.ops.npu.npu_add_rms_norm_quantize(npu_quant_matmul, output_parallel, rms_norm_weight, scale, offset, epsilon=1e-6)
-        return output[0], output[2]
 
-    replace_pattern(gm, pattern, replace)
-    gm.recompile()
-    print("after fusion graph:", gm.graph)
-    return gm
+class CustomizeCompilationInterface:
+    def __init__(self, vllm_config):
+        self.vllm_config = vllm_config
+        self.graph_rewriter_manager = GraphRewritePassManager()
+        self.graph_rewriter_manager.configure(vllm_config)
+
+    def __call__(self, gm: torch.fx.GraphModule, example_inputs):
+        gm = self.graph_rewriter_manager(gm)
+        gm.recompile()
+        return gm
 
 
-def test_graph_rewriter():
+def test_fusion_pass(
+    num_tokens: int = 20,
+    hidden_size: int = 4096,
+):
     # Create a random input tensor
-    num_tokens = 20
-    hidden_size = 4096
     input_tensor = torch.randn(num_tokens, hidden_size)
+    vllm_config = VllmConfig()
+    # Enable the graph rewriter and the quantization fusion pass
+    vllm_config.additional_config.ascend_compilation_config.enable_graph_rewriter = True
+    vllm_config.additional_config.ascend_compilation_config.enable_quantization_fusion = True
+    compilation_interface = CustomizeCompilationInterface(vllm_config)
+    for pass_ in compilation_interface.graph_rewriter_manager.passes:
+        assert isinstance(pass_, AscendQuantFusionPass)
 
     # Initialize the model with RMSNorm quantization
     model = ModelWithRMSNormQuant(hidden_size=hidden_size)
     new_model = copy.deepcopy(model)
-    compiled_model = torch.compile(model, backend=custom_graph_rewriter_backend)
+    compiled_model = torch.compile(model, backend=CustomizeCompilationInterface(vllm_config))
     for i in range(3):
         output = compiled_model(input_tensor)
         # Check if the output is as expected
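
The test above leans on torch.compile's custom-backend hook: any callable that takes a GraphModule plus example inputs and returns a callable can serve as a backend, which is why a CustomizeCompilationInterface instance is passed directly. A self-contained sketch of that pattern with no Ascend dependencies (all names below are illustrative, not part of vllm-ascend):

import torch
import torch.fx


class LoggingBackend:
    """Toy stand-in for CustomizeCompilationInterface: receives the captured
    fx graph, could rewrite it, and returns something callable."""

    def __call__(self, gm: torch.fx.GraphModule, example_inputs):
        print(f"captured {len(list(gm.graph.nodes))} fx nodes")
        # A real pass manager (e.g. GraphRewritePassManager) would mutate
        # gm.graph here and then call gm.recompile().
        return gm  # a GraphModule is itself callable


@torch.compile(backend=LoggingBackend())
def f(x):
    return torch.relu(x) + 1


print(f(torch.randn(4)))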

tests/ut/test_ascend_config.py

Lines changed: 12 additions & 0 deletions
@@ -54,6 +54,10 @@ def test_init_ascend_config_without_additional_config(self):
         self.assertTrue(torchair_graph_config.enable_view_optimize)
         self.assertFalse(torchair_graph_config.enable_kv_nz)
 
+        ascend_compilation_config = ascend_config.ascend_compilation_config
+        self.assertTrue(ascend_compilation_config.enable_graph_rewriter)
+        self.assertTrue(ascend_compilation_config.enable_quantization_fusion)
+
         ascend_scheduler_config = ascend_config.ascend_scheduler_config
         self.assertFalse(ascend_scheduler_config.enabled)
 
@@ -71,6 +75,10 @@ def test_init_ascend_config_with_additional_config(self):
                 "enable_view_optimize": True,
                 "enable_kv_nz": True
             },
+            "ascend_compilation_config": {
+                "enable_graph_rewriter": False,
+                "enable_quantization_fusion": False,
+            },
             "ascend_scheduler_config": {
                 "enabled": True
             },
@@ -89,6 +97,10 @@
         self.assertTrue(torchair_graph_config.enable_multistream_moe)
         self.assertTrue(torchair_graph_config.enable_view_optimize)
         self.assertTrue(torchair_graph_config.enable_kv_nz)
+        ascend_compilation_config = ascend_config.ascend_compilation_config
+        self.assertFalse(ascend_compilation_config.enable_graph_rewriter)
+        self.assertFalse(ascend_compilation_config.enable_quantization_fusion)
+
         ascend_scheduler_config = ascend_config.ascend_scheduler_config
         self.assertTrue(ascend_scheduler_config.enabled)

vllm_ascend/ascend_config.py

Lines changed: 22 additions & 0 deletions
@@ -39,6 +39,9 @@ def __init__(self, vllm_config):
             {})
         self.torchair_graph_config = TorchairGraphConfig(torchair_graph_config)
 
+        ascend_compilation_config = additional_config.get("ascend_compilation_config", {})
+        self.ascend_compilation_config = AscendCompilationConfig(ascend_compilation_config)
+
         ascend_scheduler_config = additional_config.get(
             "ascend_scheduler_config", {})
         self.ascend_scheduler_config = AscendSchedulerConfig(
@@ -105,6 +108,19 @@ def __init__(self, torchair_graph_config):
                 "enable_kv_nz is valid only when Torchair graph mode is enabled"
             )
 
+class AscendCompilationConfig:
+    """
+    Configuration object for ascend_compilation_config from additional_config
+    """
+
+    def __init__(self, ascend_compilation_config: dict):
+        self.enable_graph_rewriter = ascend_compilation_config.get(
+            "enable_graph_rewriter", True)
+        self.enable_quantization_fusion = ascend_compilation_config.get(
+            "enable_quantization_fusion", True)
+        # Add more compilation related configs here as needed
+
+
 class AscendSchedulerConfig:
     """
@@ -175,6 +191,12 @@ def check_ascend_config(vllm_config, enforce_eager):
                 "it has been disabled automatically.")
         # aclgraph case
         else:
+            # This graph fusion also works in eager mode.
+            if ascend_config.ascend_compilation_config.enable_graph_rewriter:
+                logger.info("Graph rewriter enabled! Automatic kernel fusion is expected.")
+
+            if ascend_config.ascend_compilation_config.enable_quantization_fusion:
+                logger.info("Quantization fusion enabled! Op fusion on quantization is expected.")
             # aclgraph doesn't work with deepseek model and only qwen model is well tested.
             if vllm_config.model_config:
                 model_type = vllm_config.model_config.hf_config.model_type
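
The new AscendCompilationConfig follows the same dict-with-defaults pattern as the other config objects: missing keys fall back to the documented defaults, so both features stay on unless explicitly disabled. A standalone illustration of that behavior (the class name below is a stand-in, not the vllm_ascend API):

class CompilationOptions:
    """Mirror of the dict-with-defaults pattern used by AscendCompilationConfig."""

    def __init__(self, cfg: dict):
        self.enable_graph_rewriter = cfg.get("enable_graph_rewriter", True)
        self.enable_quantization_fusion = cfg.get("enable_quantization_fusion", True)


# An empty additional_config leaves both features enabled...
opts = CompilationOptions({})
assert opts.enable_graph_rewriter and opts.enable_quantization_fusion

# ...while each one can be switched off independently.
opts = CompilationOptions({"enable_quantization_fusion": False})
assert opts.enable_graph_rewriter and not opts.enable_quantization_fusion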

vllm_ascend/compilation/graph_rewrite_pass_manager.py

Lines changed: 3 additions & 6 deletions
@@ -19,11 +19,8 @@
 from torch import fx as fx
 
 from vllm.config import VllmConfig
-from vllm.platforms import current_platform
-from vllm.logger import init_logger
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
-from vllm.compilation.inductor_pass import get_pass_context, InductorPass
-from quant_fusion_pass import AscendQuantFusionPass
+from vllm.compilation.inductor_pass import get_pass_context
 
 
 class GraphRewritePassManager:
@@ -51,8 +48,8 @@ def add(self, pass_: VllmInductorPass):
         self.passes.append(pass_)
 
     def configure(self, config: VllmConfig):
-        self.pass_config = config.additional_config.ascend_pass_config
-        if self.pass_config.enable_addrms_norm_quant_fusion:
+        self.ascend_compilation_config = config.additional_config.ascend_compilation_config
+        if self.ascend_compilation_config.enable_quantization_fusion:
             from .quant_fusion_pass import AscendQuantFusionPass
             self.passes.append(AscendQuantFusionPass(config))
         # Add more passes here as needed
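
The manager keeps its passes in a plain list and applies them in order to the captured graph. A minimal runnable sketch of that shape using only public torch.fx utilities (TinyPassManager and strip_dead_code are illustrative names, not the vllm-ascend API):

import torch
import torch.fx


class TinyPassManager:
    """List of passes applied sequentially to an fx GraphModule."""

    def __init__(self):
        self.passes = []

    def add(self, pass_):
        self.passes.append(pass_)

    def __call__(self, gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
        for pass_ in self.passes:
            pass_(gm.graph)  # each pass mutates the graph in place
        gm.recompile()       # regenerate gm's forward from the edited graph
        return gm


def strip_dead_code(graph: torch.fx.Graph):
    graph.eliminate_dead_code()  # built-in fx utility used as a demo pass


def f(x):
    unused = x * 2  # dead value; the pass removes it
    return x + 1


manager = TinyPassManager()
manager.add(strip_dead_code)
gm = manager(torch.fx.symbolic_trace(f))
print(gm.code)  # the `x * 2` node is gone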
