Skip to content

Commit d3c2ced

Browse files
committed
autocast_lists kwarg and set_item nested combination
1 parent 1fc9e44 commit d3c2ced

File tree

2 files changed

+42
-11
lines changed

2 files changed

+42
-11
lines changed

src/nested_pandas/nestedframe/core.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def repack_row(chunk, header=True):
119119
# Grab length, then truncate to one row for display
120120
n_rows = len(chunk)
121121
chunk = chunk.head(1).round(8) # only show first row
122-
chunk.astype({col: "str" for col in chunk.columns}) # cast to string for info row
122+
chunk.astype({col: object for col in chunk.columns}) # cast to string for info row
123123

124124
# Add a row that shows the number of additional rows not shown
125125
len_row = pd.DataFrame(
@@ -257,15 +257,27 @@ def _getitem_list(self, item):
257257
def __setitem__(self, key, value):
258258
"""Custom __setitem__ for NestedFrame: auto-nest DataFrame assignment to new columns."""
259259
# If assigning a DataFrame to a new column, auto-nest it
260-
if (
261-
isinstance(key, str)
262-
and key not in self.columns
263-
and isinstance(value, (pd.DataFrame | NestedFrame))
264-
):
265-
# Note this uses the default approach for add_nested, which is a left join on index
266-
new_df = self.add_nested(value, name=key)
267-
self._update_inplace(new_df)
268-
return
260+
261+
# Special handling paths for assignment of dataframes to nested columns
262+
if isinstance(key, str) and isinstance(value, pd.DataFrame | NestedFrame):
263+
# if all columns are NestedDtype, combine them into a single nested column
264+
if np.array([isinstance(dtype, NestedDtype) for dtype in value.dtypes]).all():
265+
for i, col in enumerate(value.columns):
266+
if i == 0:
267+
new_nested = value[col]
268+
else:
269+
# there must be a better way than through list fields
270+
for field in value[col].nest.fields:
271+
new_nested = new_nested.nest.with_list_field(
272+
field, value[col].nest.get_list_series(field)
273+
)
274+
value = new_nested
275+
# Assign a DataFrame as a new column, auto-nesting it
276+
elif key not in self.columns:
277+
# Note this uses the default approach for add_nested, which is a left join on index
278+
new_df = self.add_nested(value, name=key)
279+
self._update_inplace(new_df)
280+
return
269281

270282
components = self._parse_hierarchical_components(key)
271283
# Replacing or adding columns to a nested structure

src/nested_pandas/nestedframe/io.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from upath import UPath
1010

1111
from ..series.dtype import NestedDtype
12+
from ..series.packer import pack_lists
1213
from ..series.utils import table_to_struct_array
1314
from .core import NestedFrame
1415

@@ -17,6 +18,7 @@ def read_parquet(
1718
data: str | UPath | bytes,
1819
columns: list[str] | None = None,
1920
reject_nesting: list[str] | str | None = None,
21+
autocast_list: bool = False,
2022
**kwargs,
2123
) -> NestedFrame:
2224
"""
@@ -39,6 +41,8 @@ def read_parquet(
3941
is castable to a nested column. However, this assumption is invalid if
4042
the lists within the struct have mismatched lengths for any given item.
4143
Columns specified here will be read using the corresponding pandas.ArrowDtype.
44+
autocast_list: bool, default=False
45+
If True, automatically cast list columns to nested columns with NestedDtype.
4246
kwargs: dict
4347
Keyword arguments passed to `pyarrow.parquet.read_table`
4448
@@ -152,12 +156,13 @@ def read_parquet(
152156
for col, struct in structs.items():
153157
table = table.append_column(col, struct)
154158

155-
return from_pyarrow(table, reject_nesting=reject_nesting)
159+
return from_pyarrow(table, reject_nesting=reject_nesting, autocast_list=autocast_list)
156160

157161

158162
def from_pyarrow(
159163
table: pa.Table,
160164
reject_nesting: list[str] | str | None = None,
165+
autocast_list: bool = False,
161166
) -> NestedFrame:
162167
"""
163168
Load a pyarrow Table object into a NestedFrame.
@@ -172,6 +177,8 @@ def from_pyarrow(
172177
is castable to a nested column. However, this assumption is invalid if
173178
the lists within the struct have mismatched lengths for any given item.
174179
Columns specified here will be read using the corresponding pandas.ArrowDtype.
180+
autocast_list: bool, default=False
181+
If True, automatically cast list columns to nested columns with NestedDtype.
175182
176183
Returns
177184
-------
@@ -191,6 +198,10 @@ def from_pyarrow(
191198
# Attempt to cast struct columns to NestedDtype
192199
df = _cast_struct_cols_to_nested(df, reject_nesting)
193200

201+
# If autocast_list is True, cast list columns to NestedDtype
202+
if autocast_list:
203+
df = _cast_list_cols_to_nested(df)
204+
194205
return df
195206

196207

@@ -221,3 +232,11 @@ def _cast_struct_cols_to_nested(df, reject_nesting):
221232
f" read_parquet(..., reject_nesting=['{col}'])"
222233
) from err
223234
return df
235+
236+
237+
def _cast_list_cols_to_nested(df):
    """Cast Arrow list-typed columns of ``df`` to nested columns, in place.

    Every column whose dtype exposes a ``pyarrow_dtype`` that
    ``pa.types.is_list`` recognises is replaced by the result of
    ``pack_lists`` applied to the single-column frame ``df[[col]]``.
    Columns whose dtype has no ``pyarrow_dtype`` attribute (for example
    NumPy-backed columns) are left untouched rather than raising
    ``AttributeError``.

    Parameters
    ----------
    df
        DataFrame-like object to convert; its columns are reassigned in place.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    for col, dtype in df.dtypes.items():
        # Not every pandas dtype is Arrow-backed; only inspect those that
        # actually carry a pyarrow type.
        pa_type = getattr(dtype, "pyarrow_dtype", None)
        if pa_type is not None and pa.types.is_list(pa_type):
            df[col] = pack_lists(df[[col]])
    return df

0 commit comments

Comments
 (0)