Skip to content

Commit e94435e

Browse files
authored
[ML][Pipelines] Validate pipeline node IO name on reserved word (Azure#28770)
* validate keyword in IO of node(s) in pipeline
* add test
* move io name validation to builder and log warning
* fix warning error
* update warning message
* update warning message
1 parent c2264a1 commit e94435e

File tree

7 files changed

+139
-1
lines changed

7 files changed

+139
-1
lines changed

sdk/ml/azure-ai-ml/azure/ai/ml/constants/_job/pipeline.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,44 @@ class PipelineConstants:
2020

2121
class ValidationErrorCode:
2222
PARAMETER_TYPE_UNKNOWN = "ParameterTypeUnknown"
23+
24+
25+
# Methods in Python dictionary, when used as IO name, will actually get function rather than IO object,
# resulting in validation error.
# So print warning message on this and suggest user to access with syntax "d[key]" instead of "d.key".
# Reference: builtins.py::dict
# NOTE: frozenset so this module-level constant cannot be mutated accidentally;
# every read operation callers rely on (membership test, `&` intersection) behaves identically.
COMPONENT_IO_KEYWORDS = frozenset({
    # public dict methods
    "clear",
    "copy",
    "fromkeys",
    "get",
    "items",
    "keys",
    "pop",
    "popitem",
    "setdefault",
    "update",
    "values",
    # dunder methods present on dict instances
    "__class_getitem__",
    "__contains__",
    "__delitem__",
    "__eq__",
    "__getattribute__",
    "__getitem__",
    "__ge__",
    "__init__",
    "__ior__",
    "__iter__",
    "__len__",
    "__le__",
    "__lt__",
    "__new__",
    "__ne__",
    "__or__",
    "__repr__",
    "__reversed__",
    "__ror__",
    "__setitem__",
    "__sizeof__",
    "__hash__",
})

sdk/ml/azure-ai-ml/azure/ai/ml/dsl/_pipeline_component_builder.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# pylint: disable=protected-access
66
import copy
77
import inspect
8+
import logging
89
import typing
910
from collections import OrderedDict
1011
from inspect import Parameter, signature
@@ -18,6 +19,7 @@
1819
)
1920
from azure.ai.ml.constants import AssetTypes
2021
from azure.ai.ml.constants._component import ComponentSource, IOConstants
22+
from azure.ai.ml.constants._job.pipeline import COMPONENT_IO_KEYWORDS
2123
from azure.ai.ml.dsl._utils import _sanitize_python_variable_name
2224
from azure.ai.ml.entities import PipelineJob
2325
from azure.ai.ml.entities._builders import BaseNode
@@ -26,6 +28,7 @@
2628
from azure.ai.ml.entities._inputs_outputs import GroupInput, Input, Output, _get_param_with_standard_annotation
2729
from azure.ai.ml.entities._inputs_outputs.utils import _get_annotation_by_value, is_group
2830
from azure.ai.ml.entities._job.automl.automl_job import AutoMLJob
31+
from azure.ai.ml.entities._job.pipeline._attr_dict import has_attr_safe
2932
from azure.ai.ml.entities._job.pipeline._io import NodeOutput, PipelineInput, PipelineOutput, _GroupAttrDict
3033

3134
# We need to limit the depth of pipeline to avoid the built graph goes too deep and prevent potential
@@ -34,6 +37,8 @@
3437

3538
_BUILDER_STACK_MAX_DEPTH = 100
3639

40+
module_logger = logging.getLogger(__name__)
41+
3742

3843
class _PipelineComponentBuilderStack:
3944
def __init__(self):
@@ -390,6 +395,9 @@ def _get_name_or_component_name(node: Union[BaseNode, AutoMLJob]):
390395
final_name = id_name_dict[_id]
391396
node.name = final_name
392397
result[final_name] = node
398+
399+
# Validate IO name of node with correct node name, and log warning if there is keyword.
400+
self._validate_keyword_in_node_io(node)
393401
return result
394402

395403
def _update_inputs(self, pipeline_inputs):
@@ -468,6 +476,23 @@ def _validate_inferred_outputs(self, output_meta_dict: dict, output_dict: dict):
468476
if unmatched_outputs:
469477
raise UserErrorException(f"{error_prefix}: {unmatched_outputs}")
470478

479+
@staticmethod
def _validate_keyword_in_node_io(node: Union[BaseNode, AutoMLJob]):
    """Log a warning for every IO name of ``node`` that collides with a dict method name.

    Such names (e.g. "keys", "items", "__contains__") cannot be read with attribute
    syntax — ``node.inputs.keys`` returns the bound dict method instead of the IO
    object — so the warning points the user at subscription syntax. This is advisory
    only; it never raises.

    :param node: The pipeline node whose input/output names are checked.
    """
    # Inputs and outputs share identical handling; loop over both to avoid
    # duplicating the warning logic.
    for io_kind, attr_name in (("input", "inputs"), ("output", "outputs")):
        if not has_attr_safe(node, attr_name):
            continue
        # sorted() makes the warning order deterministic; plain set-intersection
        # iteration order can vary between runs.
        for io_name in sorted(set(getattr(node, attr_name)) & COMPONENT_IO_KEYWORDS):
            # Lazy %-args: the message is only rendered when the record is emitted.
            # Rendered text is identical to the previous per-branch messages.
            module_logger.warning(
                "Reserved word \"%s\" is used as %s name in node \"%s\", "
                "can only be accessed with '%s.%ss[\"%s\"]'",
                io_name, io_kind, node.name, node.name, io_kind, io_name
            )
495+
471496

472497
def _build_pipeline_parameter(func, *, user_provided_kwargs, group_default_kwargs=None, non_pipeline_inputs=None):
473498
# Pass group defaults into kwargs to support group.item can be used even if no default on function.

sdk/ml/azure-ai-ml/tests/dsl/unittests/test_dsl_pipeline.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
import os
23
from io import StringIO
34
from pathlib import Path
@@ -2803,4 +2804,26 @@ def register_node_output():
28032804
pipeline.settings.default_compute = "azureml:cpu-cluster"
28042805
with pytest.raises(UserErrorException) as e:
28052806
assert_job_cancel(pipeline, client)
2806-
assert 'The output name @ can only contain alphanumeric characters, dashes and underscores, with a limit of 255 characters.' in str(e.value)
2807+
assert 'The output name @ can only contain alphanumeric characters, dashes and underscores, with a limit of 255 characters.' in str(e.value)
2808+
2809+
def test_validate_pipeline_node_io_name_has_keyword(self, caplog):
    """Pipeline whose node IO names are dict-method names passes validation but logs warnings."""
    # Refresh logger for pytest to capture log, otherwise the result is empty.
    from azure.ai.ml.dsl import _pipeline_component_builder

    _pipeline_component_builder.module_logger = logging.getLogger(__file__)
    with caplog.at_level(logging.WARNING):
        # Importing the config builds the pipeline job at module import time,
        # which is when the reserved-word warnings are emitted — so the import
        # must happen inside the capture context.
        from test_configs.dsl_pipeline.pipeline_with_keyword_in_node_io.pipeline import pipeline_job

        # validation should pass
        assert pipeline_job._customized_validate().passed

    # Must match the warning template in _pipeline_component_builder:
    # the suggested access uses subscription syntax, not attribute access.
    warning_template = (
        "Reserved word \"{io_name}\" is used as {io} name in node \"{node_name}\", "
        "can only be accessed with '{node_name}.{io}s[\"{io_name}\"]'"
    )
    # One warning per offending IO name, in node-processing order.
    assert caplog.messages == [
        warning_template.format(io_name="__contains__", io="output", node_name="node"),
        warning_template.format(io_name="items", io="output", node_name="upstream_node"),
        warning_template.format(io_name="keys", io="input", node_name="downstream_node"),
        warning_template.format(io_name="__hash__", io="output", node_name="pipeline_component_func"),
    ]
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Test component whose input name "keys" collides with dict.keys —
# exercises the reserved-word warning for node inputs.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command
name: component_with_keys_in_inputs
command: echo ${{inputs.keys}}
inputs:
  keys:
    type: uri_folder
environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:1
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Test component whose output name "__contains__" collides with dict.__contains__ —
# exercises the reserved-word warning for node outputs (dunder case).
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command
name: component_with_keyword_in_outputs
command: echo ${{outputs.__contains__}}
outputs:
  __contains__:
    type: uri_folder
environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:1
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Test configuration: a pipeline whose node IO names collide with Python dict
# method names ("items", "keys", "__contains__", "__hash__"). Used to verify
# that building the pipeline logs the reserved-word warnings while validation
# still passes. NOTE: local variable names below become node names in the dsl
# and are asserted by the unit test — do not rename them.
from pathlib import Path

from azure.ai.ml import load_component
from azure.ai.ml.dsl import pipeline

# Components live next to this file; each deliberately declares an IO whose
# name is a dict method name.
upstream_component = load_component(Path(__file__).parent / "upstream_node.yml")
downstream_component = load_component(Path(__file__).parent / "downstream_node.yml")
inner_component = load_component(Path(__file__).parent / "inner_node.yml")


@pipeline
def pipeline_component_func():
    # The "__contains__" output of the inner node is exposed as pipeline-level
    # output "__hash__" — both reserved words, accessed by subscription.
    node = inner_component()
    return {"__hash__": node.outputs["__contains__"]}


@pipeline
def pipeline_func():
    # "items" output feeds the "keys" input — both reserved words; attribute
    # access would return dict methods, hence subscription syntax.
    upstream_node = upstream_component()
    downstream_node = downstream_component(keys=upstream_node.outputs["items"])  # noqa: F841
    pipeline_component_func()


# Built at import time so merely importing this module triggers the warnings.
pipeline_job = pipeline_func()
pipeline_job.settings.default_compute = "cpu-cluster"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
2+
type: command
3+
name: component_with_items_in_outputs
4+
command: echo ${{outputs.items}}
5+
outputs:
6+
items:
7+
type: uri_folder
8+
environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu:1

0 commit comments

Comments
 (0)