Merge branch 'main' into flat_brackets

wilsonbb · wilsonbb · commit 7d45201e895c · 2025-06-12T13:09:24.000-07:00
diff --git a/.copier-answers.yml b/.copier-answers.yml
@@ -1,5 +1,5 @@
 # Changes here will be overwritten by Copier
-_commit: v2.0.6
+_commit: v2.0.7
 _src_path: gh:lincc-frameworks/python-project-template
 author_email: brantd@uw.edu
 author_name: LINCC Frameworks
@@ -18,8 +18,8 @@ project_license: MIT
 project_name: nested-pandas
 project_organization: lincc-frameworks
 python_versions:
-- '3.9'
 - '3.10'
 - '3.11'
 - '3.12'
 - '3.13'
+test_lowest_version: all
diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
@@ -20,7 +20,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.10', '3.11', '3.12', '3.13']
 
     steps:
     - uses: actions/checkout@v4
diff --git a/.github/workflows/testing-and-coverage.yml b/.github/workflows/testing-and-coverage.yml
@@ -1,3 +1,4 @@
+
 # This workflow will install Python dependencies, run tests and report code coverage with a variety of Python versions
 # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 
@@ -15,7 +16,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        python-version: ['3.10', '3.11', '3.12', '3.13']
 
     steps:
     - uses: actions/checkout@v4
@@ -36,26 +37,23 @@ jobs:
       uses: codecov/codecov-action@v5
       with:
         token: ${{ secrets.CODECOV_TOKEN }}
-
   test-lowest-versions:
-    
     runs-on: ubuntu-latest
-
     steps:
     - uses: actions/checkout@v4
-    - name: Set up Python 3.9
+    - name: Set up Python 3.10
       uses: actions/setup-python@v5
       with:
-        python-version: '3.9'
+        python-version: '3.10'
     - name: Install dependencies
       run: |
         sudo apt-get update
         python -m pip install --upgrade uv
         uv venv venv
         source venv/bin/activate
         uv pip compile --resolution=lowest -o requirements_lowest.txt pyproject.toml
-        uv pip install --constraint=requirements_lowest.txt -e '.[dev]'
+        uv pip install --constraint=requirements_lowest.txt -e .[dev]
     - name: Run unit tests with pytest
       run: |
         source venv/bin/activate
-        python -m pytest
+        python -m pytest
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,11 +16,11 @@ classifiers = [
     "Programming Language :: Python",
 ]
 dynamic = ["version"]
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
     "numpy>=2",
     # We use internal pd._libs.missing and experimental ArrowExtensionArray
-    "pandas>=2.2.3,<2.3",
+    "pandas>=2.2.3,<2.4",
     "pyarrow>=18",
     "universal_pathlib>=0.2",
 ]
@@ -60,18 +60,9 @@ testpaths = [
 ]
 addopts = "--doctest-modules --doctest-glob=*.rst"
 
-[tool.black]
-line-length = 110
-target-version = ["py39"]
-
-[tool.isort]
-profile = "black"
-line_length = 110
-
 [tool.ruff]
 line-length = 110
-target-version = "py39"
-
+target-version = "py310"
 [tool.ruff.lint]
 select = [
     # pycodestyle
@@ -103,7 +94,6 @@ select = [
     # Numpy v2.0 compatibility
     "NPY201",
 ]
-
 ignore = [
     "UP006", # Allow non standard library generics in type hints
     "UP007", # Allow Union in type hints
@@ -113,6 +103,7 @@ ignore = [
     "UP015", # Allow redundant open parameters
     "UP028", # Allow yield in for loop
 ]
+
 [tool.setuptools.package-data]
 nested_pandas = ["py.typed"]
 
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -549,7 +549,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
                 # This is a simple heuristic but infers more than its dtype
                 # which will probably be an object.
                 sample_val = df[col].iloc[0]
-                if not hasattr(sample_val, "__iter__") and not isinstance(sample_val, (str, bytes)):
+                if not hasattr(sample_val, "__iter__") and not isinstance(sample_val, str | bytes):
                     raise ValueError(
                         f"Cannot pack column {col} which does not contain an iterable list based "
                         "on its first value, {sample_val}."
@@ -1301,7 +1301,7 @@ def reduce(self, func, *args, infer_nesting=True, **kwargs) -> NestedFrame:  # t
             else:
                 iterators.append(self[layer].array.iter_field_lists(col))
 
-        results = [func(*cols, *extra_args, **kwargs) for cols in zip(*iterators)]
+        results = [func(*cols, *extra_args, **kwargs) for cols in zip(*iterators, strict=True)]
         results_nf = NestedFrame(results, index=self.index)
 
         if infer_nesting:
diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py
@@ -87,7 +87,7 @@ def read_parquet(
     # First load through pyarrow
     # Check if `data` is a file-like object or a sequence
     if hasattr(data, "read") or (
-        isinstance(data, Sequence) and not isinstance(data, (str, bytes, bytearray))
+        isinstance(data, Sequence) and not isinstance(data, str | bytes | bytearray)
     ):
         # If `data` is a file-like object or a sequence, pass it directly to pyarrow
         table = pq.read_table(data, columns=columns, **kwargs)
@@ -103,7 +103,7 @@ def read_parquet(
     # was from a nested column.
     if columns is not None:
         nested_structures: dict[str, list[int]] = {}
-        for i, (col_in, col_pa) in enumerate(zip(columns, table.column_names)):
+        for i, (col_in, col_pa) in enumerate(zip(columns, table.column_names, strict=True)):
             # if the column name is not the same, it was a partial load
             if col_in != col_pa:
                 # get the top-level column name
@@ -152,11 +152,42 @@ def read_parquet(
         for col, struct in structs.items():
             table = table.append_column(col, struct)
 
+    return from_pyarrow(table, reject_nesting=reject_nesting)
+
+
+def from_pyarrow(
+    table: pa.Table,
+    reject_nesting: list[str] | str | None = None,
+) -> NestedFrame:
+    """
+    Load a pyarrow Table object into a NestedFrame.
+
+    Parameters
+    ----------
+    table: pa.Table
+        PyArrow Table object to load NestedFrame from
+    reject_nesting: list or str, default=None
+        Column(s) to reject from being cast to a nested dtype. By default,
+        nested-pandas assumes that any struct column with all fields being lists
+        is castable to a nested column. However, this assumption is invalid if
+        the lists within the struct have mismatched lengths for any given item.
+        Columns specified here will be read using the corresponding pandas.ArrowDtype.
+
+    Returns
+    -------
+    NestedFrame
+
+    """
+
+    if reject_nesting is None:
+        reject_nesting = []
+    elif isinstance(reject_nesting, str):
+        reject_nesting = [reject_nesting]
+
     # Convert to NestedFrame
     # not zero-copy, but reduce memory pressure via the self_destruct kwarg
     # https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
     df = NestedFrame(table.to_pandas(types_mapper=pd.ArrowDtype, split_blocks=True, self_destruct=True))
-    del table
     # Attempt to cast struct columns to NestedDTypes
     df = _cast_struct_cols_to_nested(df, reject_nesting)
 
diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
@@ -35,8 +35,8 @@
 # typing.Self and "|" union syntax don't exist in Python 3.9
 from __future__ import annotations
 
-from collections.abc import Generator, Iterable, Iterator, Sequence
-from typing import Any, Callable, cast
+from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
+from typing import Any, cast
 
 import numpy as np
 import pandas as pd
@@ -551,7 +551,7 @@ def format_series(series):
                     return series.apply(repr)
 
                 def format_row(row):
-                    return ", ".join(f"{name}: {value}" for name, value in zip(row.index, row))
+                    return ", ".join(f"{name}: {value}" for name, value in zip(row.index, row, strict=True))
 
                 # Format series to strings
                 df = df.apply(format_series, axis=0)
@@ -665,7 +665,7 @@ def _box_pa_array(cls, value, *, pa_type: pa.DataType | None) -> pa.Array | pa.C
         """Convert a value to a PyArrow array with the specified type."""
         if isinstance(value, cls):
             pa_array = value.struct_array
-        elif isinstance(value, (pa.Array, pa.ChunkedArray)):
+        elif isinstance(value, pa.Array | pa.ChunkedArray):
             pa_array = value
         else:
             try:
@@ -700,7 +700,7 @@ def _from_arrow_like(cls, arraylike, dtype: NestedDtype | None = None) -> Self:
             if dtype is None or dtype == arraylike.dtype:
                 return arraylike
             array = arraylike.list_array
-        elif isinstance(arraylike, (pa.Array, pa.ChunkedArray)):
+        elif isinstance(arraylike, pa.Array | pa.ChunkedArray):
             array = arraylike
         else:
             array = pa.array(arraylike)
@@ -1112,7 +1112,7 @@ def fill_field_lists(self, field: str, value: ArrayLike, *, keep_dtype: bool = F
             )
         if np.size(value) != len(self):
             raise ValueError("The length of the input array must be equal to the length of the series")
-        if isinstance(value, (pa.ChunkedArray, pa.Array)):
+        if isinstance(value, pa.ChunkedArray | pa.Array):
             value = pa.compute.take(value, self.get_list_index())
         else:
             value = np.repeat(value, self.list_lengths)
diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py
@@ -7,6 +7,7 @@
 import pytest
 from nested_pandas import read_parquet
 from nested_pandas.datasets import generate_data
+from nested_pandas.nestedframe.io import from_pyarrow
 from pandas.testing import assert_frame_equal
 from upath import UPath
 
@@ -221,6 +222,42 @@ def test_read_parquet_test_mixed_struct():
         assert len(nf.nested_columns) == 0
 
 
+def test_from_pyarrow_test_mixed_struct():
+    """Check that from_pyarrow nests only the struct column whose fields are all lists."""
+    # Struct whose three fields are all lists; the final assertion expects it to be nested
+    field1 = pa.array([[1, 2], [3, 4], [5, 6]])
+    field2 = pa.array([["a", "b"], ["b", "c"], ["c", "d"]])
+    field3 = pa.array([[True, False], [True, False], [True, False]])
+    struct_array_list = pa.StructArray.from_arrays([field1, field2, field3], ["list1", "list2", "list3"])
+
+    # Struct whose three fields are all scalars; expected to stay out of nested_columns
+    field1 = pa.array([1, 2, 3])
+    field2 = pa.array(["a", "b", "c"])
+    field3 = pa.array([True, False, True])
+    struct_array_val = pa.StructArray.from_arrays([field1, field2, field3], ["val1", "va12", "val3"])
+
+    # Struct mixing two scalar fields with one list field; expected to stay out of nested_columns
+    field1 = pa.array([1, 2, 3])
+    field2 = pa.array(["a", "b", "c"])
+    field3 = pa.array([[True, False], [True, False], [True, False]])
+    struct_array_mix = pa.StructArray.from_arrays([field1, field2, field3], ["val1", "va12", "list3"])
+
+    # Assemble a PyArrow Table holding the three struct columns next to a plain integer column
+    table = pa.table(
+        {
+            "id": pa.array([100, 101, 102]),  # plain integer column
+            "struct_list": struct_array_list,  # all-list struct
+            "struct_value": struct_array_val,  # all-scalar struct
+            "struct_mix": struct_array_mix,  # scalar + list struct
+        }
+    )
+
+    # Convert the full table: all four columns are expected in order, with only "struct_list" nested
+    nf = from_pyarrow(table)
+    assert nf.columns.tolist() == ["id", "struct_list", "struct_value", "struct_mix"]
+    assert nf.nested_columns == ["struct_list"]
+
+
 def test_to_parquet():
     """Test writing a parquet file with no columns specified"""
     # Load in the example file
diff --git a/tests/nested_pandas/series/test_ext_array.py b/tests/nested_pandas/series/test_ext_array.py
@@ -1375,9 +1375,9 @@ def test_iter_field_lists():
     )
     ext_array = NestedExtensionArray(struct_array)
 
-    for actual, desired in zip(ext_array.iter_field_lists("a"), a):
+    for actual, desired in zip(ext_array.iter_field_lists("a"), a, strict=True):
         assert_array_equal(actual, desired)
-    for actual, desired in zip(ext_array.iter_field_lists("b"), b):
+    for actual, desired in zip(ext_array.iter_field_lists("b"), b, strict=True):
         assert_array_equal(actual, desired)
 
 

Original file line number	Diff line number	Diff line change
`@@ -1375,9 +1375,9 @@ def test_iter_field_lists():`
`1375`	`1375`	`)`
`1376`	`1376`	`ext_array = NestedExtensionArray(struct_array)`
`1377`	`1377`
`1378`		`- for actual, desired in zip(ext_array.iter_field_lists("a"), a):`
	`1378`	`+ for actual, desired in zip(ext_array.iter_field_lists("a"), a, strict=True):`
`1379`	`1379`	`assert_array_equal(actual, desired)`
`1380`		`- for actual, desired in zip(ext_array.iter_field_lists("b"), b):`
	`1380`	`+ for actual, desired in zip(ext_array.iter_field_lists("b"), b, strict=True):`
`1381`	`1381`	`assert_array_equal(actual, desired)`
`1382`	`1382`
`1383`	`1383`