Quantco · MoritzPotthoffQC · Sep 11, 2025 · Sep 16, 2025 · Sep 16, 2025 · Sep 16, 2025
diff --git a/dataframely/_storage/parquet.py b/dataframely/_storage/parquet.py
@@ -31,6 +31,7 @@ def sink_frame(
     ) -> None:
         file = kwargs.pop("file")
         metadata = kwargs.pop("metadata", {})
+        file.parent.mkdir(parents=True, exist_ok=True)
         lf.sink_parquet(
             file,
             metadata={**metadata, SCHEMA_METADATA_KEY: serialized_schema},
@@ -42,6 +43,7 @@ def write_frame(
     ) -> None:
         file = kwargs.pop("file")
         metadata = kwargs.pop("metadata", {})
+        file.parent.mkdir(parents=True, exist_ok=True)
         df.write_parquet(
             file,
             metadata={**metadata, SCHEMA_METADATA_KEY: serialized_schema},

diff --git a/dataframely/testing/storage.py b/dataframely/testing/storage.py
@@ -67,6 +67,7 @@ def write_typed(
             schema.write_parquet(df, self._wrap_path(path))
 
     def write_untyped(self, df: pl.DataFrame, path: Path, lazy: bool) -> None:
+        path.mkdir(parents=True, exist_ok=True)
         if lazy:
             df.lazy().sink_parquet(self._wrap_path(path))
         else:

@@ -60,7 +60,10 @@ class MyCollection2(dy.Collection):
 @pytest.mark.parametrize("kwargs", [{}, {"partition_by": "a"}])
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write(
-    tester: CollectionStorageTester, tmp_path: Path, kwargs: dict[str, Any], lazy: bool
+    tester: CollectionStorageTester,
+    tmp_path_non_existent: Path,
+    kwargs: dict[str, Any],
+    lazy: bool,
 ) -> None:
     # Arrange
     collection = MyCollection.validate(
@@ -72,10 +75,10 @@ def test_read_write(
     )
 
     # Act
-    tester.write_typed(collection, tmp_path, lazy=lazy, **kwargs)
+    tester.write_typed(collection, tmp_path_non_existent, lazy=lazy, **kwargs)
 
     # Assert
-    out = tester.read(MyCollection, tmp_path, lazy)
+    out = tester.read(MyCollection, tmp_path_non_existent, lazy)
     assert_frame_equal(collection.first, out.first)
     assert collection.second is not None
     assert out.second is not None
@@ -86,7 +89,10 @@ def test_read_write(
 @pytest.mark.parametrize("kwargs", [{}, {"partition_by": "a"}])
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_optional(
-    tester: CollectionStorageTester, tmp_path: Path, kwargs: dict[str, Any], lazy: bool
+    tester: CollectionStorageTester,
+    tmp_path_non_existent: Path,
+    kwargs: dict[str, Any],
+    lazy: bool,
 ) -> None:
     # Arrange
     collection = MyCollection.validate(
@@ -95,10 +101,10 @@ def test_read_write_optional(
 
     # Act
     write_lazy = lazy and "partition_by" not in kwargs
-    tester.write_typed(collection, tmp_path, lazy=write_lazy, **kwargs)
+    tester.write_typed(collection, tmp_path_non_existent, lazy=write_lazy, **kwargs)
 
     # Assert
-    out = tester.read(MyCollection, tmp_path, lazy)
+    out = tester.read(MyCollection, tmp_path_non_existent, lazy)
     assert_frame_equal(collection.first, out.first)
     assert collection.second is None
     assert out.second is None
@@ -112,18 +118,18 @@ def test_read_write_optional(
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_if_schema_matches(
     tester: CollectionStorageTester,
-    tmp_path: Path,
+    tmp_path_non_existent: Path,
     mocker: pytest_mock.MockerFixture,
     validation: Any,
     lazy: bool,
 ) -> None:
     # Arrange
     collection = MyCollection.create_empty()
-    tester.write_typed(collection, tmp_path, lazy=lazy)
+    tester.write_typed(collection, tmp_path_non_existent, lazy=lazy)
 
     # Act
     spy = mocker.spy(MyCollection, "validate")
-    tester.read(MyCollection, tmp_path, lazy=lazy, validation=validation)
+    tester.read(MyCollection, tmp_path_non_existent, lazy=lazy, validation=validation)
 
     # Assert
     spy.assert_not_called()
@@ -136,21 +142,21 @@ def test_read_write_if_schema_matches(
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_validation_warn_no_schema(
     tester: CollectionStorageTester,
-    tmp_path: Path,
+    tmp_path_non_existent: Path,
     mocker: pytest_mock.MockerFixture,
     lazy: bool,
 ) -> None:
     # Arrange
     collection = MyCollection.create_empty()
-    tester.write_untyped(collection, tmp_path, lazy=lazy)
+    tester.write_untyped(collection, tmp_path_non_existent, lazy=lazy)
 
     # Act
     spy = mocker.spy(MyCollection, "validate")
     with pytest.warns(
         UserWarning,
         match=r"requires validation: no collection schema to check validity",
     ):
-        tester.read(MyCollection, tmp_path, lazy, validation="warn")
+        tester.read(MyCollection, tmp_path_non_existent, lazy, validation="warn")
 
     # Assert
     spy.assert_called_once()
@@ -160,21 +166,21 @@ def test_read_write_validation_warn_no_schema(
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_validation_warn_invalid_schema(
     tester: CollectionStorageTester,
-    tmp_path: Path,
+    tmp_path_non_existent: Path,
     mocker: pytest_mock.MockerFixture,
     lazy: bool,
 ) -> None:
     # Arrange
     collection = MyCollection.create_empty()
-    tester.write_typed(collection, tmp_path, lazy=lazy)
+    tester.write_typed(collection, tmp_path_non_existent, lazy=lazy)
 
     # Act
     spy = mocker.spy(MyCollection2, "validate")
     with pytest.warns(
         UserWarning,
         match=r"requires validation: current collection schema does not match",
     ):
-        tester.read(MyCollection2, tmp_path, lazy)
+        tester.read(MyCollection2, tmp_path_non_existent, lazy)
 
     # Assert
     spy.assert_called_once()
@@ -185,17 +191,17 @@ def test_read_write_validation_warn_invalid_schema(
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_validation_allow_no_schema(
     tester: CollectionStorageTester,
-    tmp_path: Path,
+    tmp_path_non_existent: Path,
     mocker: pytest_mock.MockerFixture,
     lazy: bool,
 ) -> None:
     # Arrange
     collection = MyCollection.create_empty()
-    tester.write_untyped(collection, tmp_path, lazy=lazy)
+    tester.write_untyped(collection, tmp_path_non_existent, lazy=lazy)
 
     # Act
     spy = mocker.spy(MyCollection, "validate")
-    tester.read(MyCollection, tmp_path, lazy, validation="allow")
+    tester.read(MyCollection, tmp_path_non_existent, lazy, validation="allow")
 
     # Assert
     spy.assert_called_once()
@@ -205,17 +211,17 @@ def test_read_write_validation_allow_no_schema(
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_validation_allow_invalid_schema(
     tester: CollectionStorageTester,
-    tmp_path: Path,
+    tmp_path_non_existent: Path,
     mocker: pytest_mock.MockerFixture,
     lazy: bool,
 ) -> None:
     # Arrange
     collection = MyCollection.create_empty()
-    tester.write_typed(collection, tmp_path, lazy=lazy)
+    tester.write_typed(collection, tmp_path_non_existent, lazy=lazy)
 
     # Act
     spy = mocker.spy(MyCollection2, "validate")
-    tester.read(MyCollection2, tmp_path, lazy, validation="allow")
+    tester.read(MyCollection2, tmp_path_non_existent, lazy, validation="allow")
 
     # Assert
     spy.assert_called_once()
@@ -227,37 +233,37 @@ def test_read_write_validation_allow_invalid_schema(
 @pytest.mark.parametrize("tester", TESTERS)
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_validation_forbid_no_schema(
-    tester: CollectionStorageTester, tmp_path: Path, lazy: bool
+    tester: CollectionStorageTester, tmp_path_non_existent: Path, lazy: bool
 ) -> None:
     # Arrange
     collection = MyCollection.create_empty()
-    tester.write_untyped(collection, tmp_path, lazy=lazy)
+    tester.write_untyped(collection, tmp_path_non_existent, lazy=lazy)
 
     # Act
     with pytest.raises(
         ValidationRequiredError,
         match=r"without validation: no collection schema to check validity",
     ):
-        tester.read(MyCollection, tmp_path, lazy, validation="forbid")
+        tester.read(MyCollection, tmp_path_non_existent, lazy, validation="forbid")
 
 
 @pytest.mark.parametrize("tester", TESTERS)
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_validation_forbid_invalid_schema(
-    tester: CollectionStorageTester, tmp_path: Path, lazy: bool
+    tester: CollectionStorageTester, tmp_path_non_existent: Path, lazy: bool
 ) -> None:
     # Arrange
 
     collection = MyCollection.create_empty()
 
-    tester.write_typed(collection, tmp_path, lazy=lazy)
+    tester.write_typed(collection, tmp_path_non_existent, lazy=lazy)
 
     # Act
     with pytest.raises(
         ValidationRequiredError,
         match=r"without validation: current collection schema does not match",
     ):
-        tester.read(MyCollection2, tmp_path, lazy, validation="forbid")
+        tester.read(MyCollection2, tmp_path_non_existent, lazy, validation="forbid")
 
 
 # --------------------------------- VALIDATION "SKIP" -------------------------------- #
@@ -267,17 +273,17 @@ def test_read_write_validation_forbid_invalid_schema(
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_validation_skip_no_schema(
     tester: CollectionStorageTester,
-    tmp_path: Path,
+    tmp_path_non_existent: Path,
     mocker: pytest_mock.MockerFixture,
     lazy: bool,
 ) -> None:
     # Arrange
     collection = MyCollection.create_empty()
-    tester.write_untyped(collection, tmp_path, lazy=lazy)
+    tester.write_untyped(collection, tmp_path_non_existent, lazy=lazy)
 
     # Act
     spy = mocker.spy(MyCollection, "validate")
-    tester.read(MyCollection, tmp_path, lazy, validation="skip")
+    tester.read(MyCollection, tmp_path_non_existent, lazy, validation="skip")
 
     # Assert
     spy.assert_not_called()
@@ -287,17 +293,17 @@ def test_read_write_validation_skip_no_schema(
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_validation_skip_invalid_schema(
     tester: CollectionStorageTester,
-    tmp_path: Path,
+    tmp_path_non_existent: Path,
     mocker: pytest_mock.MockerFixture,
     lazy: bool,
 ) -> None:
     # Arrange
     collection = MyCollection.create_empty()
-    tester.write_typed(collection, tmp_path, lazy=lazy)
+    tester.write_typed(collection, tmp_path_non_existent, lazy=lazy)
 
     # Act
     spy = mocker.spy(collection, "validate")
-    tester.read(MyCollection2, tmp_path, lazy, validation="skip")
+    tester.read(MyCollection2, tmp_path_non_existent, lazy, validation="skip")
 
     # Assert
     spy.assert_not_called()
@@ -332,7 +338,10 @@ def test_reconcile_collection_types(
 @pytest.mark.parametrize("validation", ["warn", "allow", "forbid", "skip"])
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_parquet_fallback_schema_json_success(
-    tmp_path: Path, mocker: pytest_mock.MockerFixture, validation: Any, lazy: bool
+    tmp_path_non_existent: Path,
+    mocker: pytest_mock.MockerFixture,
+    validation: Any,
+    lazy: bool,
 ) -> None:
     # In https://github.com/Quantco/dataframely/pull/107, the
     # mechanism for storing collection metadata was changed.
@@ -345,12 +354,12 @@ def test_read_write_parquet_fallback_schema_json_success(
     # Arrange
     tester = ParquetCollectionStorageTester()
     collection = MyCollection.create_empty()
-    tester.write_untyped(collection, tmp_path, lazy)
-    (tmp_path / "schema.json").write_text(collection.serialize())
+    tester.write_untyped(collection, tmp_path_non_existent, lazy)
+    (tmp_path_non_existent / "schema.json").write_text(collection.serialize())
 
     # Act
     spy = mocker.spy(MyCollection, "validate")
-    tester.read(MyCollection, tmp_path, lazy, validation=validation)
+    tester.read(MyCollection, tmp_path_non_existent, lazy, validation=validation)
 
     # Assert
     spy.assert_not_called()
@@ -359,21 +368,24 @@ def test_read_write_parquet_fallback_schema_json_success(
 @pytest.mark.parametrize("validation", ["allow", "warn"])
 @pytest.mark.parametrize("lazy", [True, False])
 def test_read_write_parquet_schema_json_fallback_corrupt(
-    tmp_path: Path, mocker: pytest_mock.MockerFixture, validation: Any, lazy: bool
+    tmp_path_non_existent: Path,
+    mocker: pytest_mock.MockerFixture,
+    validation: Any,
+    lazy: bool,
 ) -> None:
     """If the schema.json file is present, but corrupt, we should always fall back to
     validating."""
     # Arrange
     collection = MyCollection.create_empty()
     tester = ParquetCollectionStorageTester()
-    tester.write_untyped(collection, tmp_path, lazy)
-    (tmp_path / "schema.json").write_text("} this is not a valid JSON {")
+    tester.write_untyped(collection, tmp_path_non_existent, lazy)
+    (tmp_path_non_existent / "schema.json").write_text("} this is not a valid JSON {")
 
     # Act
     spy = mocker.spy(MyCollection, "validate")
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=UserWarning)
-        tester.read(MyCollection, tmp_path, lazy, validation=validation)
+        tester.read(MyCollection, tmp_path_non_existent, lazy, validation=validation)
 
     # Assert
     spy.assert_called_once()

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,12 @@
+# Copyright (c) QuantCo 2025-2025
+# SPDX-License-Identifier: BSD-3-Clause
+
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture()
+def tmp_path_non_existent(tmp_path: Path) -> Path:
+    """Return a directory path below `tmp_path` that has not been created yet."""
+    return tmp_path / "subdir"