Commit 8bd0b11

fixes athena refresh mode (#3313)
* adds filter to exclude dropped tables in staging destination, implements it for athena
* enables refresh mode tests for athena, fixes tests
* fixes staging_allowed_local_path on databricks, bumps databricks connector in lockfile
* passes dropped table schemas to the filter, adjusts the athena filter
* allows disabling lake formation
1 parent 3bd5099 commit 8bd0b11
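
For context, a minimal sketch of the scenario this commit fixes: a refresh run against Athena with a filesystem staging destination, where dropping tables must also remove the staged data of non-iceberg tables. The pipeline name, sample rows and bucket URL are placeholders, credentials are assumed to come from the usual dlt config/secrets, and refresh="drop_sources" is taken from dlt's documented refresh modes; treat this as an illustration, not part of the commit.

import os
import dlt

# placeholder staging bucket; Athena/Glue credentials are read from dlt config or env
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "s3://my-staging-bucket/dlt-data"

pipeline = dlt.pipeline(
    pipeline_name="athena_refresh_demo",
    destination="athena",
    staging="filesystem",  # Athena loads parquet files staged on s3
    dataset_name="refresh_demo",
)

# "drop_sources" drops the source's tables before loading; with this fix the staged
# files of dropped non-iceberg tables are now removed on the staging destination as well
pipeline.run([{"id": 1}, {"id": 2}], table_name="items", refresh="drop_sources")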

13 files changed: +237 / -217 lines

dlt/common/destination/client.py

Lines changed: 9 additions & 0 deletions
@@ -44,6 +44,7 @@
     C_DLT_LOAD_ID,
     TLoaderReplaceStrategy,
     TTableFormat,
+    TTableSchema,
 )
 from dlt.common.destination.capabilities import DestinationCapabilitiesContext
 from dlt.common.destination.exceptions import (
@@ -686,6 +687,14 @@ def should_truncate_table_before_load_on_staging_destination(self, table_name: s
         """
         pass
 
+    def should_drop_table_on_staging_destination(self, dropped_table: TTableSchema) -> bool:
+        """Tells if `dropped_table` should be dropped on the staging destination (regular dataset) in addition to being
+        dropped on the final destination. This stays False for all destinations except Athena (non-iceberg), where the
+        staging destination holds the actual data that needs to be deleted.
+        Note that `dropped_table` may no longer be present in the schema. It is present only if it got recreated.
+        """
+        return False
+
 
 class SupportsOpenTables(ABC):
     """Provides access to data stored in one of open table formats (iceberg or delta) and intended to

dlt/destinations/impl/athena/athena.py

Lines changed: 14 additions & 0 deletions
@@ -11,6 +11,8 @@
     TYPE_CHECKING,
 )
 
+from dlt.common.schema.exceptions import TableNotFound
+
 if TYPE_CHECKING:
     from mypy_boto3_lakeformation import LakeFormationClient
     from mypy_boto3_lakeformation.type_defs import (
@@ -35,6 +37,7 @@
     TColumnType,
     TSchemaTables,
     TSortOrder,
+    TTableSchema,
 )
 from dlt.common.destination import DestinationCapabilitiesContext, PreparedTableSchema
 from dlt.common.destination.client import FollowupJobRequest, SupportsStagingDestination, LoadJob
@@ -345,6 +348,7 @@ def update_stored_schema(
         if (
             self.config.lakeformation_config is not None
             and self.config.lakeformation_config.enabled
+            is not None  # both True and False are actionable
         ):
             self.manage_lf_tags()
         return applied_update
@@ -406,6 +410,16 @@ def should_truncate_table_before_load_on_staging_destination(self, table_name: s
             return True
         return False
 
+    def should_drop_table_on_staging_destination(self, dropped_table: TTableSchema) -> bool:
+        # in Athena the table must be dropped in Glue, and its data must also be dropped in staging if the table is not iceberg
+        try:
+            existing_table = self.prepare_load_table(dropped_table["name"])
+            # do not drop data if a new iceberg table got created - storage is handled by Athena
+            return not self._is_iceberg_table(existing_table)
+        except TableNotFound:
+            # table got dropped and was not recreated - drop on the staging destination
+            return True
+
     def should_load_data_to_staging_dataset_on_staging_destination(self, table_name: str) -> bool:
         """iceberg table data goes into staging on staging destination"""
         table = self.prepare_load_table(table_name)
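
To spell out the three branches of the Athena hook above, here is a standalone paraphrase for illustration only (not the client code): the Glue table lookup and the iceberg check are collapsed into plain parameters.

def drop_staged_data(recreated_table, is_iceberg):
    """Mirrors the Athena client's should_drop_table_on_staging_destination from the hunk above."""
    if recreated_table is None:
        # table was dropped and not recreated: the staged files are all that is left, remove them
        return True
    # recreated as iceberg: Athena manages the storage, leave staging alone
    # recreated as a regular hive table: the staged parquet is the actual data, remove it
    return not is_iceberg


assert drop_staged_data(None, is_iceberg=False) is True
assert drop_staged_data({"name": "items"}, is_iceberg=True) is False
assert drop_staged_data({"name": "items"}, is_iceberg=False) is True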

dlt/destinations/impl/athena/configuration.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 
 @configspec
 class LakeformationConfig:
-    enabled: bool = False
+    enabled: Optional[bool] = None
     tags: Optional[Dict[str, str]] = None
 
 
dlt/destinations/impl/databricks/databricks.py

Lines changed: 9 additions & 4 deletions
@@ -72,10 +72,15 @@ def run(self) -> None:
         # decide if this is a local file or a staged file
         is_local_file = not ReferenceFollowupJobRequest.is_reference_job(self._file_path)
         if is_local_file:
-            # conn parameter staging_allowed_local_path must be set to use 'PUT/REMOVE volume_path' SQL statement
-            self._sql_client.native_connection.thrift_backend.staging_allowed_local_path = (
-                os.path.dirname(self._file_path)
-            )
+            # staging_allowed_local_path should be set when opening the connection, but at that
+            # time we do not yet know this path, so set it now
+            conn_ = self._sql_client.native_connection
+            file_dir = os.path.dirname(self._file_path)
+            if backend := getattr(conn_, "thrift_backend", None):
+                backend.staging_allowed_local_path = file_dir
+            else:
+                # the thrift backend was discontinued in newer databricks connector clients
+                conn_.staging_allowed_local_path = file_dir
             # local file by uploading to a temporary volume on Databricks
             from_clause, file_name, volume_path, volume_file_path = self._handle_local_file_upload(
                 self._file_path
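
The same fallback in isolation, as a sketch. It assumes only what the change above relies on: older databricks-sql-connector releases expose staging_allowed_local_path on connection.thrift_backend, newer ones expose it on the connection object itself.

import os


def allow_local_staging(connection, file_path: str) -> None:
    # volume PUT/REMOVE statements only accept local files under this directory
    file_dir = os.path.dirname(file_path)
    backend = getattr(connection, "thrift_backend", None)
    if backend is not None:
        # connector releases that still ship the thrift backend
        backend.staging_allowed_local_path = file_dir
    else:
        # newer connector releases dropped the thrift backend attribute
        connection.staging_allowed_local_path = file_dir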

dlt/destinations/impl/dummy/dummy.py

Lines changed: 3 additions & 0 deletions
@@ -148,6 +148,9 @@ def is_storage_initialized(self) -> bool:
     def drop_storage(self) -> None:
         pass
 
+    def drop_tables(self, *tables: str, delete_schema: bool = True) -> None:
+        pass
+
     def update_stored_schema(
         self,
         only_tables: Iterable[str] = None,

dlt/load/load.py

Lines changed: 4 additions & 1 deletion
@@ -539,6 +539,7 @@ def initialize_package(
                 if isinstance(job_client, WithStagingDataset)
                 else None
             ),
+            lambda table_name: True,  # drop all passed tables
             drop_tables=dropped_tables,
             truncate_tables=truncated_tables,
         )
@@ -556,9 +557,11 @@
             schema,
             new_jobs,
             expected_update,
-            job_client.should_truncate_table_before_load_on_staging_destination,
             # should_truncate_staging,
+            job_client.should_truncate_table_before_load_on_staging_destination,
             job_client.should_load_data_to_staging_dataset_on_staging_destination,
+            # should we drop tables also on the staging destination
+            job_client.should_drop_table_on_staging_destination,
             drop_tables=dropped_tables,
             truncate_tables=truncated_tables,
         )

dlt/load/utils.py

Lines changed: 11 additions & 4 deletions
@@ -69,6 +69,7 @@ def init_client(
     expected_update: TSchemaTables,
     truncate_filter: Callable[[str], bool],
     load_staging_filter: Callable[[str], bool],
+    drop_staging_filter: Callable[[TTableSchema], bool],
     drop_tables: Optional[List[TTableSchema]] = None,
     truncate_tables: Optional[List[TTableSchema]] = None,
 ) -> TSchemaTables:
@@ -81,8 +82,9 @@
         schema (Schema): The schema as in load package
         new_jobs (Iterable[LoadJobInfo]): List of new jobs
         expected_update (TSchemaTables): Schema update as in load package. Always present even if empty
-        truncate_filter (Callable[[str], bool]): A filter that tells which table in destination dataset should be truncated
-        load_staging_filter (Callable[[str], bool]): A filter which tell which table in the staging dataset may be loaded into
+        truncate_filter (Callable[[str], bool]): A filter that tells if a table in the destination dataset should be truncated
+        load_staging_filter (Callable[[str], bool]): A filter that tells if a table may be loaded into the staging dataset
+        drop_staging_filter (Callable[[TTableSchema], bool]): A filter that tells if a table may be dropped on the staging destination
         drop_tables (Optional[List[TTableSchema]]): List of tables to drop before initializing storage
         truncate_tables (Optional[List[TTableSchema]]): List of tables to truncate before initializing storage
@@ -111,8 +113,13 @@
         )
     )
 
-    # get tables to drop
-    drop_table_names = {table["name"] for table in drop_tables} if drop_tables else set()
+    # get tables to drop; note that drop_tables are not in the schema and come from the package state
+    drop_table_names = (
+        {table["name"] for table in drop_tables if drop_staging_filter(table)}
+        if drop_tables
+        else set()
+    )
     job_client.verify_schema(only_tables=tables_with_jobs | dlt_tables, new_jobs=new_jobs)
     applied_update = _init_dataset_and_update_schema(
         job_client,
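
Condensed from the last hunk: the final destination (first init_client call in dlt/load/load.py) passes a pass-all filter, while the staging destination passes the client hook, so only tables for which the hook returns True end up in drop_table_names. The helper below is an illustration of that shape, not the dlt code.

from typing import Callable, List, Set


def tables_to_drop(drop_tables: List[dict], drop_filter: Callable[[dict], bool]) -> Set[str]:
    # same shape as the expression added to init_client above
    return {t["name"] for t in drop_tables if drop_filter(t)} if drop_tables else set()


dropped = [{"name": "items", "table_format": "iceberg"}, {"name": "raw_events"}]
# final destination: drop everything the package asks for
assert tables_to_drop(dropped, lambda table: True) == {"items", "raw_events"}
# staging destination: delegate to should_drop_table_on_staging_destination, e.g. keep iceberg-backed tables
assert tables_to_drop(dropped, lambda table: table.get("table_format") != "iceberg") == {"raw_events"}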

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -142,7 +142,7 @@ qdrant = [
     "qdrant-client[fastembed]>=1.8"
 ]
 databricks = [
-    "databricks-sql-connector>=2.9.3,<4 ; python_version <= '3.12'",
+    "databricks-sql-connector>=2.9.3 ; python_version <= '3.12'",
     "databricks-sql-connector>=3.6.0 ; python_version >= '3.13'",
     "databricks-sdk>=0.38.0",
 ]

tests/load/athena_iceberg/test_lakeformation.py

Lines changed: 3 additions & 1 deletion
@@ -186,7 +186,9 @@ def create_pipelines(
     )
     lf_disabled_pipeline = destination_config.setup_pipeline(
         pipeline_name,
-        destination=destination_config.destination_factory(),
+        destination=destination_config.destination_factory(
+            lakeformation_config=LakeformationConfig(enabled=False)
+        ),
         dataset_name=dataset_name,
         staging=staging_destination,
     )
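
The LakeformationConfig(enabled=False) passed above exercises the tri-state flag introduced in dlt/destinations/impl/athena/configuration.py. A short sketch of the three settings; the add/remove semantics are inferred from the gating comment in athena.py and the tests, so treat them as a reading of this change rather than documentation.

from dlt.destinations.impl.athena.configuration import LakeformationConfig

# None (new default): Lake Formation tag management is skipped entirely
passthrough = LakeformationConfig()

# False: explicitly disabled - tag management still runs, so previously applied tags can be removed
disabled = LakeformationConfig(enabled=False)

# True: enabled - tag management runs and applies the configured tags to the dataset
enabled = LakeformationConfig(enabled=True, tags={"team": "analytics"})

# the same flag can also come from config, e.g. the env var used in tests/load/pipeline/test_athena.py:
# DESTINATION__LAKEFORMATION_CONFIG__ENABLED=False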

tests/load/pipeline/test_athena.py

Lines changed: 7 additions & 3 deletions
@@ -35,12 +35,15 @@
     ),
     ids=lambda x: x.name,
 )
-@pytest.mark.parametrize("lf_enabled", [True, False], ids=["lf-on", "lf-off"])
+@pytest.mark.parametrize(
+    "lf_enabled", [True, False, None], ids=["lf-on", "lf-off", "lf-passthrough"]
+)
 def test_athena_lakeformation_config_gating(
     destination_config: DestinationTestConfiguration, lf_enabled: bool, mocker, monkeypatch
 ) -> None:
     # Configure Lake Formation gating via env (read by client config)
-    monkeypatch.setenv("DESTINATION__LAKEFORMATION_CONFIG__ENABLED", str(lf_enabled))
+    if lf_enabled is not None:
+        monkeypatch.setenv("DESTINATION__LAKEFORMATION_CONFIG__ENABLED", str(lf_enabled))
 
     pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), dev_mode=True)
 
@@ -55,7 +58,8 @@ def test_athena_lakeformation_config_gating(
     )
 
     client.update_stored_schema()
-    if lf_enabled:
+    # both True and False trigger tag management (tags get added or removed), None skips it
+    if lf_enabled is not None:
         mocked_manage.assert_called()
     else:
         mocked_manage.assert_not_called()
