autocast_lists kwarg and set_item nested combination

dougbrn · dougbrn · commit d3c2ced1b26e · 2025-06-16T10:59:45.000-07:00
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -119,7 +119,7 @@ def repack_row(chunk, header=True):
             # Grab length, then truncate to one row for display
             n_rows = len(chunk)
             chunk = chunk.head(1).round(8)  # only show first row
-            chunk.astype({col: "str" for col in chunk.columns})  # cast to string for info row
+            chunk.astype({col: object for col in chunk.columns})  # cast to string for info row
 
             # Add a row that shows the number of additional rows not shown
             len_row = pd.DataFrame(
@@ -257,15 +257,27 @@ def _getitem_list(self, item):
     def __setitem__(self, key, value):
         """Custom __setitem__ for NestedFrame: auto-nest DataFrame assignment to new columns."""
         # If assigning a DataFrame to a new column, auto-nest it
-        if (
-            isinstance(key, str)
-            and key not in self.columns
-            and isinstance(value, (pd.DataFrame | NestedFrame))
-        ):
-            # Note this uses the default approach for add_nested, which is a left join on index
-            new_df = self.add_nested(value, name=key)
-            self._update_inplace(new_df)
-            return
+
+        # Special handling paths for assignment of dataframes to nested columns
+        if isinstance(key, str) and isinstance(value, pd.DataFrame | NestedFrame):
+            # if all columns are NestedDtype, combine them into a single nested column
+            if np.array([isinstance(dtype, NestedDtype) for dtype in value.dtypes]).all():
+                for i, col in enumerate(value.columns):
+                    if i == 0:
+                        new_nested = value[col]
+                    else:
+                        # there must be a better way than through list fields
+                        for field in value[col].nest.fields:
+                            new_nested = new_nested.nest.with_list_field(
+                                field, value[col].nest.get_list_series(field)
+                            )
+                value = new_nested
+            # Assign a DataFrame as a new column, auto-nesting it
+            elif key not in self.columns:
+                # Note this uses the default approach for add_nested, which is a left join on index
+                new_df = self.add_nested(value, name=key)
+                self._update_inplace(new_df)
+                return
 
         components = self._parse_hierarchical_components(key)
         # Replacing or adding columns to a nested structure
diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py
@@ -9,6 +9,7 @@
 from upath import UPath
 
 from ..series.dtype import NestedDtype
+from ..series.packer import pack_lists
 from ..series.utils import table_to_struct_array
 from .core import NestedFrame
 
@@ -17,6 +18,7 @@ def read_parquet(
     data: str | UPath | bytes,
     columns: list[str] | None = None,
     reject_nesting: list[str] | str | None = None,
+    autocast_list: bool = False,
     **kwargs,
 ) -> NestedFrame:
     """
@@ -39,6 +41,8 @@ def read_parquet(
         is castable to a nested column. However, this assumption is invalid if
         the lists within the struct have mismatched lengths for any given item.
         Columns specified here will be read using the corresponding pandas.ArrowDtype.
+    autocast_list: bool, default=True
+        If True, automatically cast list columns to nested columns with NestedDType.
     kwargs: dict
         Keyword arguments passed to `pyarrow.parquet.read_table`
 
@@ -152,12 +156,13 @@ def read_parquet(
         for col, struct in structs.items():
             table = table.append_column(col, struct)
 
-    return from_pyarrow(table, reject_nesting=reject_nesting)
+    return from_pyarrow(table, reject_nesting=reject_nesting, autocast_list=autocast_list)
 
 
 def from_pyarrow(
     table: pa.Table,
     reject_nesting: list[str] | str | None = None,
+    autocast_list: bool = False,
 ) -> NestedFrame:
     """
     Load a pyarrow Table object into a NestedFrame.
@@ -172,6 +177,8 @@ def from_pyarrow(
         is castable to a nested column. However, this assumption is invalid if
         the lists within the struct have mismatched lengths for any given item.
         Columns specified here will be read using the corresponding pandas.ArrowDtype.
+    autocast_list: bool, default=False
+        If True, automatically cast list columns to nested columns with NestedDType.
 
     Returns
     -------
@@ -191,6 +198,10 @@ def from_pyarrow(
     # Attempt to cast struct columns to NestedDTypes
     df = _cast_struct_cols_to_nested(df, reject_nesting)
 
+    # If autocast_list is True, cast list columns to NestedDTypes
+    if autocast_list:
+        df = _cast_list_cols_to_nested(df)
+
     return df
 
 
@@ -221,3 +232,11 @@ def _cast_struct_cols_to_nested(df, reject_nesting):
                     f" read_parquet(..., reject_nesting=['{col}'])"
                 ) from err
     return df
+
+
+def _cast_list_cols_to_nested(df):
+    """cast list columns to nested dtype"""
+    for col, dtype in df.dtypes.items():
+        if pa.types.is_list(dtype.pyarrow_dtype):
+            df[col] = pack_lists(df[[col]])
+    return df