From 2606715fa9942d593590cb8394bd9f554476bcc7 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 10:53:35 +0100
Subject: [PATCH 01/10] Mark class-variable as such.

This ensures that mypy understands it's intended to work this way.
---
 src/databricks/labs/blueprint/paths.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index 1cf29f2..54ab0b6 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -15,7 +15,7 @@
 from collections.abc import Generator, Iterable, Sequence
 from io import BytesIO, StringIO
 from pathlib import Path, PurePath
-from typing import BinaryIO, Literal, NoReturn, TextIO, TypeVar
+from typing import BinaryIO, ClassVar, Literal, NoReturn, TextIO, TypeVar
 from urllib.parse import quote_from_bytes as urlquote_from_bytes
 
 from databricks.sdk import WorkspaceClient
@@ -127,7 +127,7 @@ class _DatabricksPath(Path, abc.ABC):  # pylint: disable=too-many-public-methods
     _str: str
     _hash: int
 
-    parser = _posixpath
+    parser: ClassVar = _posixpath
 
     # Compatibility attribute, for when superclass implementations get invoked on python <= 3.11.
     _flavour = object()

From be3be3aafb55dad5ec008c1df149250ec92b83e0 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 10:55:06 +0100
Subject: [PATCH 02/10] Update method signatures to match Python 3.13+

---
 src/databricks/labs/blueprint/paths.py | 30 +++++++++++++++++++-------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index 54ab0b6..d220cfd 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -246,10 +246,10 @@ def open(
     ): ...
 
     @abstractmethod
-    def is_dir(self) -> bool: ...
+    def is_dir(self, *, follow_symlinks: bool = True) -> bool: ...
 
     @abstractmethod
-    def is_file(self) -> bool: ...
+    def is_file(self, *, follow_symlinks: bool = True) -> bool: ...
 
     @abstractmethod
     def rename(self: P, target: str | bytes | os.PathLike) -> P: ...
@@ -587,7 +587,10 @@ def glob(
         pattern: str | bytes | os.PathLike,
         *,
         case_sensitive: bool | None = None,
+        recurse_symlinks: bool = False,
     ) -> Generator[P, None, None]:
+        if recurse_symlinks:
+            raise NotImplementedError("recurse_symlinks is not supported for Databricks paths")
         pattern_parts = self._prepare_pattern(pattern)
         if case_sensitive is None:
             case_sensitive = True
@@ -599,7 +602,10 @@ def rglob(
         pattern: str | bytes | os.PathLike,
         *,
         case_sensitive: bool | None = None,
+        recurse_symlinks: bool = False,
     ) -> Generator[P, None, None]:
+        if recurse_symlinks:
+            raise NotImplementedError("recurse_symlinks is not supported for Databricks paths")
         pattern_parts = ("**", *self._prepare_pattern(pattern))
         if case_sensitive is None:
             case_sensitive = True
@@ -731,15 +737,19 @@ def stat(self, *, follow_symlinks=True) -> os.stat_result:
         )  # 8
         return os.stat_result(seq)
 
-    def is_dir(self) -> bool:
+    def is_dir(self, *, follow_symlinks: bool = True) -> bool:
         """Return True if the path points to a DBFS directory."""
+        if not follow_symlinks:
+            raise NotImplementedError("follow_symlinks is not supported for DBFS paths")
         try:
             return bool(self._file_info.is_dir)
         except DatabricksError:
             return False
 
-    def is_file(self) -> bool:
+    def is_file(self, *, follow_symlinks: bool = True) -> bool:
         """Return True if the path points to a file in Databricks Workspace."""
+        if not follow_symlinks:
+            raise NotImplementedError("follow_symlinks is not supported for DBFS paths")
         return not self.is_dir()
 
     def iterdir(self) -> Generator[DBFSPath, None, None]:
@@ -842,8 +852,8 @@ def open(
             return _TextUploadIO(self._ws, self.as_posix())
         raise ValueError(f"invalid mode: {mode}")
 
-    def read_text(self, encoding=None, errors=None):
-        with self.open(mode="r", encoding=encoding, errors=errors) as f:
+    def read_text(self, encoding=None, errors=None, newline=None) -> str:
+        with self.open(mode="r", encoding=encoding, errors=errors, newline=newline) as f:
             return f.read()
 
     @property
@@ -881,15 +891,19 @@ def stat(self, *, follow_symlinks=True) -> os.stat_result:
         seq[stat.ST_CTIME] = float(self._object_info.created_at) / 1000.0 if self._object_info.created_at else -1.0  # 9
         return os.stat_result(seq)
 
-    def is_dir(self) -> bool:
+    def is_dir(self, *, follow_symlinks: bool = True) -> bool:
         """Return True if the path points to a directory in Databricks Workspace."""
+        if not follow_symlinks:
+            raise NotImplementedError("follow_symlinks is not supported for Workspace paths")
         try:
             return self._object_info.object_type == ObjectType.DIRECTORY
         except DatabricksError:
             return False
 
-    def is_file(self) -> bool:
+    def is_file(self, *, follow_symlinks: bool = True) -> bool:
         """Return True if the path points to a file in Databricks Workspace."""
+        if not follow_symlinks:
+            raise NotImplementedError("follow_symlinks is not supported for Workspace paths")
         try:
             return self._object_info.object_type == ObjectType.FILE
         except DatabricksError:

From 6131c9567f321850892efa189f783526afefd380 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 10:57:59 +0100
Subject: [PATCH 03/10] Drop unnecessary (default) type parameters.

---
 src/databricks/labs/blueprint/paths.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index d220cfd..e350827 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -258,7 +258,7 @@ def rename(self: P, target: str | bytes | os.PathLike) -> P: ...
     def replace(self: P, target: str | bytes | os.PathLike) -> P: ...
 
     @abstractmethod
-    def iterdir(self: P) -> Generator[P, None, None]: ...
+    def iterdir(self: P) -> Generator[P]: ...
 
     def __reduce__(self) -> NoReturn:
         # Cannot support pickling because we can't pickle the workspace client.
@@ -588,7 +588,7 @@ def glob(
         *,
         case_sensitive: bool | None = None,
         recurse_symlinks: bool = False,
-    ) -> Generator[P, None, None]:
+    ) -> Generator[P]:
         if recurse_symlinks:
             raise NotImplementedError("recurse_symlinks is not supported for Databricks paths")
         pattern_parts = self._prepare_pattern(pattern)
@@ -603,7 +603,7 @@ def rglob(
         *,
         case_sensitive: bool | None = None,
         recurse_symlinks: bool = False,
-    ) -> Generator[P, None, None]:
+    ) -> Generator[P]:
         if recurse_symlinks:
             raise NotImplementedError("recurse_symlinks is not supported for Databricks paths")
         pattern_parts = ("**", *self._prepare_pattern(pattern))
@@ -752,7 +752,7 @@ def is_file(self, *, follow_symlinks: bool = True) -> bool:
             raise NotImplementedError("follow_symlinks is not supported for DBFS paths")
         return not self.is_dir()
 
-    def iterdir(self) -> Generator[DBFSPath, None, None]:
+    def iterdir(self) -> Generator[DBFSPath]:
         for child in self._ws.dbfs.list(self.as_posix()):
             yield self._from_file_info(self._ws, child)
 
@@ -916,7 +916,7 @@ def is_notebook(self) -> bool:
         except DatabricksError:
             return False
 
-    def iterdir(self) -> Generator[WorkspacePath, None, None]:
+    def iterdir(self) -> Generator[WorkspacePath]:
         for child in self._ws.workspace.list(self.as_posix()):
             yield self._from_object_info(self._ws, child)
 

From f5dcea4834c757d3cdb232a521e1e34fdc7f26eb Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 10:58:11 +0100
Subject: [PATCH 04/10] Fix copypasta in docstring.

---
 src/databricks/labs/blueprint/paths.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index e350827..f039da1 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -747,7 +747,7 @@ def is_dir(self, *, follow_symlinks: bool = True) -> bool:
             return False
 
     def is_file(self, *, follow_symlinks: bool = True) -> bool:
-        """Return True if the path points to a file in Databricks Workspace."""
+        """Return True if the path points to a DBFS file."""
         if not follow_symlinks:
             raise NotImplementedError("follow_symlinks is not supported for DBFS paths")
         return not self.is_dir()

From a582f601acc9f4dfd935ea76501fb45d5d8e7828 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 10:58:36 +0100
Subject: [PATCH 05/10] Update to current versions of linting tools.

---
 pyproject.toml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 368d8de..8aa74c9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,9 +41,9 @@ path = "src/databricks/labs/blueprint/__about__.py"
 dependencies = [
     "databricks-labs-blueprint[yaml]",
     "coverage[toml]~=7.4.4",
-    "mypy~=1.9.0",
-    "pylint~=3.1.0",
-    "pylint-pytest==2.0.0a0",
+    "mypy~=1.18.0",
+    "pylint~=4.0.0",
+    # "pylint-pytest==2.0.0a0"  # Incorrect dependency constraint (pylint<4), installed separately below.
     "databricks-labs-pylint~=0.3.0",
     "pytest~=8.1.0",
     "pytest-cov~=4.1.0",
@@ -55,6 +55,10 @@ dependencies = [
     "types-requests~=2.31.0",
 ]
 
+post-install-commands = [
+    "pip install --no-deps pylint-pytest==2.0.0a0",  # See above; installed here to avoid dependency conflict.
+]
+
 # store virtual env as the child of this folder. Helps VSCode (and PyCharm) to run better
 path = ".venv"
 
@@ -207,10 +211,6 @@ py-version = "3.10"
 # source root.
 # source-roots =
 
-# When enabled, pylint would attempt to guess common misconfiguration and emit
-# user-friendly hints instead of false-positive error messages.
-suggestion-mode = true
-
 # Allow loading of arbitrary C extensions. Extensions are imported into the
 # active Python interpreter and may run arbitrary code.
 # unsafe-load-any-extension =

From e664b3ec2927325d1a562a19602e4020a66b146b Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 12:06:27 +0100
Subject: [PATCH 06/10] Avoid type(None) because it triggers a pylint warning.

---
 src/databricks/labs/blueprint/installation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/databricks/labs/blueprint/installation.py b/src/databricks/labs/blueprint/installation.py
index fdb573f..b60ff60 100644
--- a/src/databricks/labs/blueprint/installation.py
+++ b/src/databricks/labs/blueprint/installation.py
@@ -775,7 +775,7 @@ def _unmarshal_union(cls, inst, path, type_ref):
         """The `_unmarshal_union` method is a private method that is used to deserialize a dictionary to an object
         of type `type_ref`. This method is called by the `load` method."""
         for variant in get_args(type_ref):
-            if variant == type(None) and inst is None:
+            if variant == types.NoneType and inst is None:
                 return None
             try:
                 value = cls._unmarshal(inst, path, variant)

From ec532d5088fda0276af8cd2b5875c655e5deeadc Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 12:07:17 +0100
Subject: [PATCH 07/10] Add some type annotations.

---
 src/databricks/labs/blueprint/cli.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/blueprint/cli.py b/src/databricks/labs/blueprint/cli.py
index e3de8c6..e2215c9 100644
--- a/src/databricks/labs/blueprint/cli.py
+++ b/src/databricks/labs/blueprint/cli.py
@@ -171,13 +171,13 @@ def _patch_databricks_host(self) -> None:
             self._logger.warning(f"Working around DATABRICKS_HOST normalization issue: {host} -> {fixed_host}")
             os.environ["DATABRICKS_HOST"] = fixed_host
 
-    def _account_client(self):
+    def _account_client(self) -> AccountClient:
         return AccountClient(
             product=self._product_info.product_name(),
             product_version=self._product_info.version(),
         )
 
-    def _workspace_client(self):
+    def _workspace_client(self) -> WorkspaceClient:
         return WorkspaceClient(
             product=self._product_info.product_name(),
             product_version=self._product_info.version(),

From 3094ea65d8b666e510ec8fe49bd72faea5c0dea1 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 12:07:31 +0100
Subject: [PATCH 08/10] Reduce the complexity of _route() by extracting argument parsing/handling.

---
 src/databricks/labs/blueprint/cli.py | 49 +++++++++++++++++-----------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/src/databricks/labs/blueprint/cli.py b/src/databricks/labs/blueprint/cli.py
index e2215c9..37a3b4d 100644
--- a/src/databricks/labs/blueprint/cli.py
+++ b/src/databricks/labs/blueprint/cli.py
@@ -51,6 +51,10 @@ def get_argument_type(self, argument_name: str) -> str | None:
         return annotation.__name__
 
 
+# The types of arguments that can be passed to commands.
+_CommandArg = int | str | bool | float | WorkspaceClient | AccountClient | Prompts
+
+
 class App:
     def __init__(self, __file: str):
         self._mapping: dict[str, Command] = {}
@@ -94,27 +98,9 @@ def _route(self, raw):
             log_level = "info"
         databricks_logger = logging.getLogger("databricks")
         databricks_logger.setLevel(log_level.upper())
-        kwargs = {k.replace("-", "_"): v for k, v in flags.items() if v != ""}
         cmd = self._mapping[command]
-        # modify kwargs to match the type of the argument
-        for kwarg in list(kwargs.keys()):
-            match cmd.get_argument_type(kwarg):
-                case "int":
-                    kwargs[kwarg] = int(kwargs[kwarg])
-                case "bool":
-                    kwargs[kwarg] = kwargs[kwarg].lower() == "true"
-                case "float":
-                    kwargs[kwarg] = float(kwargs[kwarg])
+        kwargs = self._build_args(cmd, flags)
         try:
-            if cmd.needs_workspace_client():
-                self._patch_databricks_host()
-                kwargs["w"] = self._workspace_client()
-            elif cmd.is_account:
-                self._patch_databricks_host()
-                kwargs["a"] = self._account_client()
-            prompts_argument = cmd.prompts_argument_name()
-            if prompts_argument:
-                kwargs[prompts_argument] = Prompts()
             cmd.fn(**kwargs)
         except Exception as err:  # pylint: disable=broad-exception-caught
             logger = self._logger.getChild(command)
@@ -123,6 +109,31 @@ def _route(self, raw):
             else:
                 logger.error(f"{err.__class__.__name__}: {err}")
 
+    def _build_args(self, cmd: Command, flags: dict[str, str]) -> dict[str, _CommandArg]:
+        kwargs: dict[str, _CommandArg] = {k.replace("-", "_"): v for k, v in flags.items() if v != ""}
+        # modify kwargs to match the type of the argument
+        for kwarg in list(kwargs.keys()):
+            value = kwargs[kwarg]
+            if not isinstance(value, str):
+                continue
+            match cmd.get_argument_type(kwarg):
+                case "int":
+                    kwargs[kwarg] = int(value)
+                case "bool":
+                    kwargs[kwarg] = value.lower() == "true"
+                case "float":
+                    kwargs[kwarg] = float(value)
+        if cmd.needs_workspace_client():
+            self._patch_databricks_host()
+            kwargs["w"] = self._workspace_client()
+        elif cmd.is_account:
+            self._patch_databricks_host()
+            kwargs["a"] = self._account_client()
+        prompts_argument = cmd.prompts_argument_name()
+        if prompts_argument:
+            kwargs[prompts_argument] = Prompts()
+        return kwargs
+
     @classmethod
     def fix_databricks_host(cls, host: str) -> str:
         """Emulate the way the Go SDK fixes the Databricks host before using it.

From 26cf8e01f740b176b103b89b0518cd128cc72d11 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 12:08:14 +0100
Subject: [PATCH 09/10] Reduce complexity of the method that sniffs an XML declaration.

---
 src/databricks/labs/blueprint/paths.py | 86 ++++++++++++++------------
 1 file changed, 47 insertions(+), 39 deletions(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index f039da1..950ec80 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -1082,47 +1082,56 @@ def _detect_encoding_bom(
     )
 
 
+def _read_xml_encoding(binary_io: BinaryIO) -> tuple[bytes, str] | None:
+    """Read the XML encoding from the start of a binary file, if present."""
+    maybe_xml: bytes = binary_io.read(4)
+    # Useful to know here, an XML declaration must start with '
 str | None:
     position = binary_io.tell() if preserve_position else None
     try:
-        maybe_xml: bytes = binary_io.read(4)
-        # Useful to know here, an XML declaration must start with '
 str
         if encoding:
             logger.debug(f"XML declaration encoding detected: {encoding}")
             # TODO: XML encodings come from the IATA list, maybe they need to mapped/checked against Python's names.
-        else:
-            logger.debug("XML declaration without encoding detected, must be utf-8")
-            encoding = "utf-8"
-        return encoding
+            return encoding
+        logger.debug("XML declaration without encoding detected, must be utf-8")
+        return "utf-8"
     return None

From 9653fe589bfc3afd6d1c616de2bd079ac6430b74 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 19 Nov 2025 12:08:35 +0100
Subject: [PATCH 10/10] Suppress some pylint warnings.

These are backwards compatible interfaces from the standard Python APIs, and
we're being compatible with later versions.
---
 src/databricks/labs/blueprint/paths.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index 950ec80..7c064a8 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -236,7 +236,7 @@ def rmdir(self, recursive: bool = False) -> None: ...
     def unlink(self, missing_ok: bool = False) -> None: ...
 
     @abstractmethod
-    def open(
+    def open(  # pylint: disable=too-many-positional-arguments
         self,
         mode: str = "r",
         buffering: int = -1,
@@ -681,7 +681,7 @@ def unlink(self, missing_ok: bool = False) -> None:
             raise FileNotFoundError(f"{self.as_posix()} does not exist")
         self._ws.dbfs.delete(self.as_posix())
 
-    def open(
+    def open(  # pylint: disable=too-many-positional-arguments
         self,
         mode: str = "r",
         buffering: int = -1,
@@ -831,7 +831,7 @@ def unlink(self, missing_ok: bool = False) -> None:
             if not missing_ok:
                 raise FileNotFoundError(f"{self.as_posix()} does not exist") from e
 
-    def open(
+    def open(  # pylint: disable=too-many-positional-arguments
         self,
         mode: str = "r",
         buffering: int = -1,