Commit b3bcc44

add note on partial loading name overlaps
1 parent a8a5719 commit b3bcc44

File tree

  • src/nested_pandas/nestedframe
  • tests/nested_pandas/nestedframe

2 files changed: +14 -68 lines changed

src/nested_pandas/nestedframe/io.py

Lines changed: 7 additions & 2 deletions

@@ -60,6 +60,12 @@ def read_parquet(
     of a nested column called "nested". Be aware that this will prohibit calls
     like ```pd.read_parquet("data.parquet", columns=["nested.a", "nested"])```
     from working, as this implies both full and partial load of "nested".
+
+    Furthermore, there are some cases where subcolumns will have the same name
+    as a top-level column. For example, if you have a column "nested" with
+    subcolumns "nested.a" and "nested.b", and also a top-level column "a". In
+    these cases, keep in mind that if "nested" is in the reject_nesting list
+    the operation will fail (but nesting will still work normally).
     """

     # Type convergence for reject_nesting
@@ -100,7 +106,6 @@ def read_parquet(
             "Please either remove the partial load or the full load."
         )

-    # TODO: Fix reject nesting when only partial loading
     # Build structs and replace columns in table
     for col, indices in nested_structures.items():
         # Build a struct column from the columns
@@ -113,7 +118,7 @@ def read_parquet(
         table = table.append_column(col, struct)

     # Convert to NestedFrame
-    # TODO: How much of a problem is it that this is not zero_copy?
+    # not zero-copy, but reduce memory pressure via the self_destruct kwarg
     # https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
     df = NestedFrame(table.to_pandas(types_mapper=lambda ty: pd.ArrowDtype(ty), self_destruct=True))
     del table
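The docstring note above is the substance of this commit: subcolumn names can collide with top-level column names, and reject_nesting interacts with partial loads. Below is a minimal sketch of the documented behavior, assuming read_parquet is exposed at the nested_pandas package top level and using a hypothetical "data.parquet" whose layout mirrors the docstring example (a top-level column "a" plus a nested column "nested" with subcolumns "a" and "b"):

```python
# Sketch only: "data.parquet" is hypothetical, mirroring the docstring example
# (top-level column "a" plus a nested column "nested" with subcolumns "a" and "b").
# The top-level import path is an assumption.
from nested_pandas import read_parquet

# Mixing a full and a partial load of the same nested column is rejected:
# read_parquet("data.parquet", columns=["nested.a", "nested"])  # fails, per the docstring

# Partially loading "nested.a" next to the same-named top-level column "a" works,
# provided "nested" is not in reject_nesting:
nf = read_parquet("data.parquet", columns=["nested.a", "a"])

# Per the note added in this commit, the same partial load is expected to fail
# when "nested" is listed in reject_nesting:
# read_parquet("data.parquet", columns=["nested.a", "a"], reject_nesting=["nested"])
```

The failing variants are left commented out so the sketch only exercises the supported path.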

tests/nested_pandas/nestedframe/test_io.py

Lines changed: 7 additions & 66 deletions

@@ -8,8 +8,7 @@


 @pytest.mark.parametrize("columns", [["a"], None])
-@pytest.mark.parametrize("pack_columns", [{"nested1": ["c"], "nested2": ["e"]}, {"nested1": ["d"]}, None])
-def test_read_parquet(tmp_path, columns, pack_columns):
+def test_read_parquet(tmp_path, columns):
     """Test nested parquet loading"""
     # Setup a temporary directory for files
     save_path = os.path.join(tmp_path, ".")
@@ -35,33 +34,22 @@ def test_read_parquet(tmp_path, columns, pack_columns):
     # Read from parquet
     nf = read_parquet(
         data=os.path.join(save_path, "base.parquet"),
-        to_pack={
-            "nested1": os.path.join(save_path, "nested1.parquet"),
-            "nested2": os.path.join(save_path, "nested2.parquet"),
-        },
         columns=columns,
-        pack_columns=pack_columns,
     )

+    nest1 = read_parquet(os.path.join(save_path, "nested1.parquet"))
+    nest2 = read_parquet(os.path.join(save_path, "nested1.parquet"))
+
+    nf = nf.add_nested(nest1, name="nested1").add_nested(nest2, name="nested2")
+
     # Check Base Columns
     if columns is not None:
         assert nf.columns.tolist() == columns + ["nested1", "nested2"]
     else:
         assert nf.columns.tolist() == base.columns.tolist() + ["nested1", "nested2"]

-    # Check Nested Columns
-    if pack_columns is not None:
-        for nested_col in pack_columns:
-            assert nf[nested_col].nest.fields == pack_columns[nested_col]
-    else:
-        for nested_col in nf.nested_columns:
-            if nested_col == "nested1":
-                assert nf[nested_col].nest.fields == nested1.columns.tolist()
-            elif nested_col == "nested2":
-                assert nf[nested_col].nest.fields == nested2.columns.tolist()
-

-def test_write_packed_parquet():
+def test_write_parquet():
     """Tests writing a nested frame to a single parquet file."""
     # Generate some test data
     base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
@@ -86,50 +74,3 @@ def test_write_packed_parquet():
     # Read from parquet
     nf2 = read_parquet(temp.name)
     assert_frame_equal(nf, nf2)
-
-
-def test_write_parquet_by_layer():
-    """Tests writing a nested frame to multiple parquet files."""
-    base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
-
-    nested1 = pd.DataFrame(
-        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
-        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
-    )
-
-    nested2 = pd.DataFrame(
-        data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
-        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
-    )
-
-    # Construct the NestedFrame
-    nf = NestedFrame(base).add_nested(nested1, name="nested1").add_nested(nested2, name="nested2")
-
-    # Asser that a temporary file path must be a directory when by_layer is True
-    with pytest.raises(ValueError):
-        nf.to_parquet(tempfile.NamedTemporaryFile(suffix=".parquet").name, by_layer=True)
-
-    # Write to parquet using a named temporary file
-    tmp_dir = tempfile.TemporaryDirectory()
-    nf.to_parquet(tmp_dir.name, by_layer=True)
-
-    # Validate the individual layers were correctly saved as their own parquet files
-    read_base_frame = read_parquet(os.path.join(tmp_dir.name, "base.parquet"), to_pack=None)
-    assert_frame_equal(read_base_frame, nf.drop(columns=["nested1", "nested2"]))
-
-    read_nested1 = read_parquet(os.path.join(tmp_dir.name, "nested1.parquet"), to_pack=None)
-    assert_frame_equal(read_nested1, nf["nested1"].nest.to_flat())
-
-    read_nested2 = read_parquet(os.path.join(tmp_dir.name, "nested2.parquet"), to_pack=None)
-    assert_frame_equal(read_nested2, nf["nested2"].nest.to_flat())
-
-    # Validate the entire NestedFrame can be read
-    entire_nf = read_parquet(
-        data=os.path.join(tmp_dir.name, "base.parquet"),
-        to_pack={
-            "nested1": os.path.join(tmp_dir.name, "nested1.parquet"),
-            "nested2": os.path.join(tmp_dir.name, "nested2.parquet"),
-        },
-    )
-
-    assert_frame_equal(nf, entire_nf)
