Commit b3bcc44

add note on partial loading name overlaps
1 parent a8a5719 commit b3bcc44

File tree

  • src/nested_pandas/nestedframe
  • tests/nested_pandas/nestedframe

2 files changed: +14 -68 lines changed

src/nested_pandas/nestedframe/io.py

Lines changed: 7 additions & 2 deletions

@@ -60,6 +60,12 @@ def read_parquet(
     of a nested column called "nested". Be aware that this will prohibit calls
     like ```pd.read_parquet("data.parquet", columns=["nested.a", "nested"])```
     from working, as this implies both full and partial load of "nested".
+
+    Furthermore, there are some cases where subcolumns will have the same name
+    as a top-level column. For example, if you have a column "nested" with
+    subcolumns "nested.a" and "nested.b", and also a top-level column "a". In
+    these cases, keep in mind that if "nested" is in the reject_nesting list
+    the operation will fail (but nesting will still work normally).
     """

     # Type convergence for reject_nesting
@@ -100,7 +106,6 @@ def read_parquet(
             "Please either remove the partial load or the full load."
         )

-    # TODO: Fix reject nesting when only partial loading
     # Build structs and replace columns in table
     for col, indices in nested_structures.items():
         # Build a struct column from the columns
@@ -113,7 +118,7 @@ def read_parquet(
         table = table.append_column(col, struct)

     # Convert to NestedFrame
-    # TODO: How much of a problem is it that this is not zero_copy?
+    # not zero-copy, but reduce memory pressure via the self_destruct kwarg
     # https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
     df = NestedFrame(table.to_pandas(types_mapper=lambda ty: pd.ArrowDtype(ty), self_destruct=True))
     del table
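The docstring note above is the substance of this commit: subcolumn names can collide with top-level column names, and reject_nesting interacts with partial loads. Below is a minimal sketch of the documented behavior, assuming read_parquet is exposed at the nested_pandas package top level and using a hypothetical "data.parquet" whose layout mirrors the docstring example (a top-level column "a" plus a nested column "nested" with subcolumns "a" and "b"):

```python
# Sketch only: "data.parquet" is hypothetical, mirroring the docstring example
# (top-level column "a" plus a nested column "nested" with subcolumns "a" and "b").
# The top-level import path is an assumption.
from nested_pandas import read_parquet

# Mixing a full and a partial load of the same nested column is rejected:
# read_parquet("data.parquet", columns=["nested.a", "nested"])  # fails, per the docstring

# Partially loading "nested.a" next to the same-named top-level column "a" works,
# provided "nested" is not in reject_nesting:
nf = read_parquet("data.parquet", columns=["nested.a", "a"])

# Per the note added in this commit, the same partial load is expected to fail
# when "nested" is listed in reject_nesting:
# read_parquet("data.parquet", columns=["nested.a", "a"], reject_nesting=["nested"])
```

The failing variants are left commented out so the sketch only exercises the supported path.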

tests/nested_pandas/nestedframe/test_io.py

Lines changed: 7 additions & 66 deletions

@@ -8,8 +8,7 @@


 @pytest.mark.parametrize("columns", [["a"], None])
-@pytest.mark.parametrize("pack_columns", [{"nested1": ["c"], "nested2": ["e"]}, {"nested1": ["d"]}, None])
-def test_read_parquet(tmp_path, columns, pack_columns):
+def test_read_parquet(tmp_path, columns):
     """Test nested parquet loading"""
     # Setup a temporary directory for files
     save_path = os.path.join(tmp_path, ".")
@@ -35,33 +34,22 @@ def test_read_parquet(tmp_path, columns, pack_columns):
     # Read from parquet
     nf = read_parquet(
         data=os.path.join(save_path, "base.parquet"),
-        to_pack={
-            "nested1": os.path.join(save_path, "nested1.parquet"),
-            "nested2": os.path.join(save_path, "nested2.parquet"),
-        },
         columns=columns,
-        pack_columns=pack_columns,
     )

+    nest1 = read_parquet(os.path.join(save_path, "nested1.parquet"))
+    nest2 = read_parquet(os.path.join(save_path, "nested1.parquet"))
+
+    nf = nf.add_nested(nest1, name="nested1").add_nested(nest2, name="nested2")
+
     # Check Base Columns
     if columns is not None:
         assert nf.columns.tolist() == columns + ["nested1", "nested2"]
     else:
         assert nf.columns.tolist() == base.columns.tolist() + ["nested1", "nested2"]

-    # Check Nested Columns
-    if pack_columns is not None:
-        for nested_col in pack_columns:
-            assert nf[nested_col].nest.fields == pack_columns[nested_col]
-    else:
-        for nested_col in nf.nested_columns:
-            if nested_col == "nested1":
-                assert nf[nested_col].nest.fields == nested1.columns.tolist()
-            elif nested_col == "nested2":
-                assert nf[nested_col].nest.fields == nested2.columns.tolist()
-

-def test_write_packed_parquet():
+def test_write_parquet():
     """Tests writing a nested frame to a single parquet file."""
     # Generate some test data
     base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
@@ -86,50 +74,3 @@ def test_write_packed_parquet():
     # Read from parquet
     nf2 = read_parquet(temp.name)
     assert_frame_equal(nf, nf2)
-
-
-def test_write_parquet_by_layer():
-    """Tests writing a nested frame to multiple parquet files."""
-    base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
-
-    nested1 = pd.DataFrame(
-        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
-        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
-    )
-
-    nested2 = pd.DataFrame(
-        data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
-        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
-    )
-
-    # Construct the NestedFrame
-    nf = NestedFrame(base).add_nested(nested1, name="nested1").add_nested(nested2, name="nested2")
-
-    # Asser that a temporary file path must be a directory when by_layer is True
-    with pytest.raises(ValueError):
-        nf.to_parquet(tempfile.NamedTemporaryFile(suffix=".parquet").name, by_layer=True)
-
-    # Write to parquet using a named temporary file
-    tmp_dir = tempfile.TemporaryDirectory()
-    nf.to_parquet(tmp_dir.name, by_layer=True)
-
-    # Validate the individual layers were correctly saved as their own parquet files
-    read_base_frame = read_parquet(os.path.join(tmp_dir.name, "base.parquet"), to_pack=None)
-    assert_frame_equal(read_base_frame, nf.drop(columns=["nested1", "nested2"]))
-
-    read_nested1 = read_parquet(os.path.join(tmp_dir.name, "nested1.parquet"), to_pack=None)
-    assert_frame_equal(read_nested1, nf["nested1"].nest.to_flat())
-
-    read_nested2 = read_parquet(os.path.join(tmp_dir.name, "nested2.parquet"), to_pack=None)
-    assert_frame_equal(read_nested2, nf["nested2"].nest.to_flat())
-
-    # Validate the entire NestedFrame can be read
-    entire_nf = read_parquet(
-        data=os.path.join(tmp_dir.name, "base.parquet"),
-        to_pack={
-            "nested1": os.path.join(tmp_dir.name, "nested1.parquet"),
-            "nested2": os.path.join(tmp_dir.name, "nested2.parquet"),
-        },
-    )
-
-    assert_frame_equal(nf, entire_nf)
