Skip to content

Commit 7d45201

Browse files
committed
Merge branch 'main' into flat_brackets
2 parents 523a675 + 54944e1 commit 7d45201

File tree

9 files changed

+94
-37
lines changed

9 files changed

+94
-37
lines changed

.copier-answers.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Changes here will be overwritten by Copier
2-
_commit: v2.0.6
2+
_commit: v2.0.7
33
_src_path: gh:lincc-frameworks/python-project-template
44
author_email: [email protected]
55
author_name: LINCC Frameworks
@@ -18,8 +18,8 @@ project_license: MIT
1818
project_name: nested-pandas
1919
project_organization: lincc-frameworks
2020
python_versions:
21-
- '3.9'
2221
- '3.10'
2322
- '3.11'
2423
- '3.12'
2524
- '3.13'
25+
test_lowest_version: all

.github/workflows/smoke-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
runs-on: ubuntu-latest
2121
strategy:
2222
matrix:
23-
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
23+
python-version: ['3.10', '3.11', '3.12', '3.13']
2424

2525
steps:
2626
- uses: actions/checkout@v4

.github/workflows/testing-and-coverage.yml

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
# This workflow will install Python dependencies, run tests and report code coverage with a variety of Python versions
23
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
34

@@ -15,7 +16,7 @@ jobs:
1516
runs-on: ubuntu-latest
1617
strategy:
1718
matrix:
18-
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
19+
python-version: ['3.10', '3.11', '3.12', '3.13']
1920

2021
steps:
2122
- uses: actions/checkout@v4
@@ -36,26 +37,23 @@ jobs:
3637
uses: codecov/codecov-action@v5
3738
with:
3839
token: ${{ secrets.CODECOV_TOKEN }}
39-
4040
test-lowest-versions:
41-
4241
runs-on: ubuntu-latest
43-
4442
steps:
4543
- uses: actions/checkout@v4
46-
- name: Set up Python 3.9
44+
- name: Set up Python 3.10
4745
uses: actions/setup-python@v5
4846
with:
49-
python-version: '3.9'
47+
python-version: '3.10'
5048
- name: Install dependencies
5149
run: |
5250
sudo apt-get update
5351
python -m pip install --upgrade uv
5452
uv venv venv
5553
source venv/bin/activate
5654
uv pip compile --resolution=lowest -o requirements_lowest.txt pyproject.toml
57-
uv pip install --constraint=requirements_lowest.txt -e '.[dev]'
55+
uv pip install --constraint=requirements_lowest.txt -e .[dev]
5856
- name: Run unit tests with pytest
5957
run: |
6058
source venv/bin/activate
61-
python -m pytest
59+
python -m pytest

pyproject.toml

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ classifiers = [
1616
"Programming Language :: Python",
1717
]
1818
dynamic = ["version"]
19-
requires-python = ">=3.9"
19+
requires-python = ">=3.10"
2020
dependencies = [
2121
"numpy>=2",
2222
# We use internal pd._libs.missing and experimental ArrowExtensionArray
23-
"pandas>=2.2.3,<2.3",
23+
"pandas>=2.2.3,<2.4",
2424
"pyarrow>=18",
2525
"universal_pathlib>=0.2",
2626
]
@@ -60,18 +60,9 @@ testpaths = [
6060
]
6161
addopts = "--doctest-modules --doctest-glob=*.rst"
6262

63-
[tool.black]
64-
line-length = 110
65-
target-version = ["py39"]
66-
67-
[tool.isort]
68-
profile = "black"
69-
line_length = 110
70-
7163
[tool.ruff]
7264
line-length = 110
73-
target-version = "py39"
74-
65+
target-version = "py310"
7566
[tool.ruff.lint]
7667
select = [
7768
# pycodestyle
@@ -103,7 +94,6 @@ select = [
10394
# Numpy v2.0 compatibility
10495
"NPY201",
10596
]
106-
10797
ignore = [
10898
"UP006", # Allow non standard library generics in type hints
10999
"UP007", # Allow Union in type hints
@@ -113,6 +103,7 @@ ignore = [
113103
"UP015", # Allow redundant open parameters
114104
"UP028", # Allow yield in for loop
115105
]
106+
116107
[tool.setuptools.package-data]
117108
nested_pandas = ["py.typed"]
118109

src/nested_pandas/nestedframe/core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -549,7 +549,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
549549
# This is a simple heuristic but infers more than its dtype
550550
# which will probably be an object.
551551
sample_val = df[col].iloc[0]
552-
if not hasattr(sample_val, "__iter__") and not isinstance(sample_val, (str, bytes)):
552+
if not hasattr(sample_val, "__iter__") and not isinstance(sample_val, str | bytes):
553553
raise ValueError(
554554
f"Cannot pack column {col} which does not contain an iterable list based "
555555
"on its first value, {sample_val}."
@@ -1301,7 +1301,7 @@ def reduce(self, func, *args, infer_nesting=True, **kwargs) -> NestedFrame: # t
13011301
else:
13021302
iterators.append(self[layer].array.iter_field_lists(col))
13031303

1304-
results = [func(*cols, *extra_args, **kwargs) for cols in zip(*iterators)]
1304+
results = [func(*cols, *extra_args, **kwargs) for cols in zip(*iterators, strict=True)]
13051305
results_nf = NestedFrame(results, index=self.index)
13061306

13071307
if infer_nesting:

src/nested_pandas/nestedframe/io.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def read_parquet(
8787
# First load through pyarrow
8888
# Check if `data` is a file-like object or a sequence
8989
if hasattr(data, "read") or (
90-
isinstance(data, Sequence) and not isinstance(data, (str, bytes, bytearray))
90+
isinstance(data, Sequence) and not isinstance(data, str | bytes | bytearray)
9191
):
9292
# If `data` is a file-like object or a sequence, pass it directly to pyarrow
9393
table = pq.read_table(data, columns=columns, **kwargs)
@@ -103,7 +103,7 @@ def read_parquet(
103103
# was from a nested column.
104104
if columns is not None:
105105
nested_structures: dict[str, list[int]] = {}
106-
for i, (col_in, col_pa) in enumerate(zip(columns, table.column_names)):
106+
for i, (col_in, col_pa) in enumerate(zip(columns, table.column_names, strict=True)):
107107
# if the column name is not the same, it was a partial load
108108
if col_in != col_pa:
109109
# get the top-level column name
@@ -152,11 +152,42 @@ def read_parquet(
152152
for col, struct in structs.items():
153153
table = table.append_column(col, struct)
154154

155+
return from_pyarrow(table, reject_nesting=reject_nesting)
156+
157+
158+
def from_pyarrow(
159+
table: pa.Table,
160+
reject_nesting: list[str] | str | None = None,
161+
) -> NestedFrame:
162+
"""
163+
Load a pyarrow Table object into a NestedFrame.
164+
165+
Parameters
166+
----------
167+
table: pa.Table
168+
PyArrow Table object to load NestedFrame from
169+
reject_nesting: list or str, default=None
170+
Column(s) to reject from being cast to a nested dtype. By default,
171+
nested-pandas assumes that any struct column with all fields being lists
172+
is castable to a nested column. However, this assumption is invalid if
173+
the lists within the struct have mismatched lengths for any given item.
174+
Columns specified here will be read using the corresponding pandas.ArrowDtype.
175+
176+
Returns
177+
-------
178+
NestedFrame
179+
180+
"""
181+
182+
if reject_nesting is None:
183+
reject_nesting = []
184+
elif isinstance(reject_nesting, str):
185+
reject_nesting = [reject_nesting]
186+
155187
# Convert to NestedFrame
156188
# not zero-copy, but reduce memory pressure via the self_destruct kwarg
157189
# https://arrow.apache.org/docs/python/pandas.html#reducing-memory-use-in-table-to-pandas
158190
df = NestedFrame(table.to_pandas(types_mapper=pd.ArrowDtype, split_blocks=True, self_destruct=True))
159-
del table
160191
# Attempt to cast struct columns to NestedDTypes
161192
df = _cast_struct_cols_to_nested(df, reject_nesting)
162193

src/nested_pandas/series/ext_array.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@
3535
# typing.Self and "|" union syntax don't exist in Python 3.9
3636
from __future__ import annotations
3737

38-
from collections.abc import Generator, Iterable, Iterator, Sequence
39-
from typing import Any, Callable, cast
38+
from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
39+
from typing import Any, cast
4040

4141
import numpy as np
4242
import pandas as pd
@@ -551,7 +551,7 @@ def format_series(series):
551551
return series.apply(repr)
552552

553553
def format_row(row):
554-
return ", ".join(f"{name}: {value}" for name, value in zip(row.index, row))
554+
return ", ".join(f"{name}: {value}" for name, value in zip(row.index, row, strict=True))
555555

556556
# Format series to strings
557557
df = df.apply(format_series, axis=0)
@@ -665,7 +665,7 @@ def _box_pa_array(cls, value, *, pa_type: pa.DataType | None) -> pa.Array | pa.C
665665
"""Convert a value to a PyArrow array with the specified type."""
666666
if isinstance(value, cls):
667667
pa_array = value.struct_array
668-
elif isinstance(value, (pa.Array, pa.ChunkedArray)):
668+
elif isinstance(value, pa.Array | pa.ChunkedArray):
669669
pa_array = value
670670
else:
671671
try:
@@ -700,7 +700,7 @@ def _from_arrow_like(cls, arraylike, dtype: NestedDtype | None = None) -> Self:
700700
if dtype is None or dtype == arraylike.dtype:
701701
return arraylike
702702
array = arraylike.list_array
703-
elif isinstance(arraylike, (pa.Array, pa.ChunkedArray)):
703+
elif isinstance(arraylike, pa.Array | pa.ChunkedArray):
704704
array = arraylike
705705
else:
706706
array = pa.array(arraylike)
@@ -1112,7 +1112,7 @@ def fill_field_lists(self, field: str, value: ArrayLike, *, keep_dtype: bool = F
11121112
)
11131113
if np.size(value) != len(self):
11141114
raise ValueError("The length of the input array must be equal to the length of the series")
1115-
if isinstance(value, (pa.ChunkedArray, pa.Array)):
1115+
if isinstance(value, pa.ChunkedArray | pa.Array):
11161116
value = pa.compute.take(value, self.get_list_index())
11171117
else:
11181118
value = np.repeat(value, self.list_lengths)

tests/nested_pandas/nestedframe/test_io.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pytest
88
from nested_pandas import read_parquet
99
from nested_pandas.datasets import generate_data
10+
from nested_pandas.nestedframe.io import from_pyarrow
1011
from pandas.testing import assert_frame_equal
1112
from upath import UPath
1213

@@ -221,6 +222,42 @@ def test_read_parquet_test_mixed_struct():
221222
assert len(nf.nested_columns) == 0
222223

223224

225+
def test_from_pyarrow_test_mixed_struct():
226+
"""Test reading a pyarrow table with mixed struct types"""
227+
# Create the pure-list StructArray
228+
field1 = pa.array([[1, 2], [3, 4], [5, 6]])
229+
field2 = pa.array([["a", "b"], ["b", "c"], ["c", "d"]])
230+
field3 = pa.array([[True, False], [True, False], [True, False]])
231+
struct_array_list = pa.StructArray.from_arrays([field1, field2, field3], ["list1", "list2", "list3"])
232+
233+
# Create the value StructArray
234+
field1 = pa.array([1, 2, 3])
235+
field2 = pa.array(["a", "b", "c"])
236+
field3 = pa.array([True, False, True])
237+
struct_array_val = pa.StructArray.from_arrays([field1, field2, field3], ["val1", "va12", "val3"])
238+
239+
# Create the mixed-list StructArray
240+
field1 = pa.array([1, 2, 3])
241+
field2 = pa.array(["a", "b", "c"])
242+
field3 = pa.array([[True, False], [True, False], [True, False]])
243+
struct_array_mix = pa.StructArray.from_arrays([field1, field2, field3], ["val1", "va12", "list3"])
244+
245+
# Create a PyArrow Table with the StructArray as one of the columns
246+
table = pa.table(
247+
{
248+
"id": pa.array([100, 101, 102]), # Another column
249+
"struct_list": struct_array_list, # Struct column
250+
"struct_value": struct_array_val,
251+
"struct_mix": struct_array_mix,
252+
}
253+
)
254+
255+
# Test full read
256+
nf = from_pyarrow(table)
257+
assert nf.columns.tolist() == ["id", "struct_list", "struct_value", "struct_mix"]
258+
assert nf.nested_columns == ["struct_list"]
259+
260+
224261
def test_to_parquet():
225262
"""Test writing a parquet file with no columns specified"""
226263
# Load in the example file

tests/nested_pandas/series/test_ext_array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,9 +1375,9 @@ def test_iter_field_lists():
13751375
)
13761376
ext_array = NestedExtensionArray(struct_array)
13771377

1378-
for actual, desired in zip(ext_array.iter_field_lists("a"), a):
1378+
for actual, desired in zip(ext_array.iter_field_lists("a"), a, strict=True):
13791379
assert_array_equal(actual, desired)
1380-
for actual, desired in zip(ext_array.iter_field_lists("b"), b):
1380+
for actual, desired in zip(ext_array.iter_field_lists("b"), b, strict=True):
13811381
assert_array_equal(actual, desired)
13821382

13831383

0 commit comments

Comments (0)