Skip to content

Commit 5f43b2d

Browse files
authored
Merge pull request #254 from lincc-frameworks/list-struct-backend
List-struct backend
2 parents 7791e5d + 7da8f06 commit 5f43b2d

File tree

14 files changed

+535
-222
lines changed

14 files changed

+535
-222
lines changed

docs/gettingstarted/quickstart.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@
345345
],
346346
"metadata": {
347347
"kernelspec": {
348-
"display_name": "nested",
348+
"display_name": "Python 3 (ipykernel)",
349349
"language": "python",
350350
"name": "python3"
351351
},
@@ -359,9 +359,9 @@
359359
"name": "python",
360360
"nbconvert_exporter": "python",
361361
"pygments_lexer": "ipython3",
362-
"version": "3.10.4"
362+
"version": "3.13.3"
363363
}
364364
},
365365
"nbformat": 4,
366-
"nbformat_minor": 2
366+
"nbformat_minor": 4
367367
}

docs/reference/ext_array.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,16 @@ Functions
1515
.. autosummary::
1616
:toctree: api/
1717

18+
series.ext_array.NestedExtensionArray.dtype
19+
series.ext_array.NestedExtensionArray.nbytes
20+
series.ext_array.NestedExtensionArray.list_array
21+
series.ext_array.NestedExtensionArray.struct_array
22+
series.ext_array.NestedExtensionArray.pa_table
23+
series.ext_array.NestedExtensionArray.list_offsets
24+
series.ext_array.NestedExtensionArray.field_names
25+
series.ext_array.NestedExtensionArray.list_lengths
26+
series.ext_array.NestedExtensionArray.flat_length
27+
series.ext_array.NestedExtensionArray.num_chunks
1828
series.ext_array.NestedExtensionArray.to_numpy
1929
series.ext_array.NestedExtensionArray.isna
2030
series.ext_array.NestedExtensionArray.take
@@ -24,6 +34,7 @@ Functions
2434
series.ext_array.NestedExtensionArray.from_sequence
2535
series.ext_array.NestedExtensionArray.to_arrow_ext_array
2636
series.ext_array.NestedExtensionArray.to_pyarrow_scalar
37+
series.ext_array.NestedExtensionArray.get_list_index
2738
series.ext_array.NestedExtensionArray.iter_field_lists
2839
series.ext_array.NestedExtensionArray.view_fields
2940
series.ext_array.NestedExtensionArray.set_flat_field
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .list_struct_storage import ListStructStorage # noqa: F401
2+
from .struct_list_storage import StructListStorage # noqa: F401
3+
from .table_storage import TableStorage # noqa: F401
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from __future__ import annotations # Python 3.9 requires it for X | Y type hints
2+
3+
from typing import TYPE_CHECKING, Any
4+
5+
import pyarrow as pa
6+
7+
from nested_pandas.series.utils import transpose_struct_list_chunked, validate_list_struct_type
8+
9+
if TYPE_CHECKING:
10+
from nested_pandas.series._storage.struct_list_storage import StructListStorage
11+
from nested_pandas.series._storage.table_storage import TableStorage
12+
13+
14+
class ListStructStorage:
    """Store nested data as a PyArrow list-struct array.

    Parameters
    ----------
    array : pa.ListArray or pa.ChunkedArray
        Pyarrow list-array with a struct value type. Either a single
        list-array, which is wrapped into a one-chunk chunked array, or
        a chunked array.

    Raises
    ------
    ValueError
        If `array` is neither a pa.ListArray nor a pa.ChunkedArray.
    """

    _data: pa.ChunkedArray

    def __init__(self, array: pa.ListArray | pa.ChunkedArray) -> None:
        # Normalize the input: the storage always holds a chunked array.
        if isinstance(array, pa.ListArray):
            array = pa.chunked_array([array])
        if not isinstance(array, pa.ChunkedArray):
            # Both accepted input types are named in the message; the
            # original wording is kept as a prefix.
            raise ValueError("array must be of type pa.ChunkedArray or pa.ListArray")
        # The check of the pyarrow type itself is delegated to the helper.
        validate_list_struct_type(array.type)
        self._data = array

    @property
    def data(self) -> pa.ChunkedArray:
        """Underlying pyarrow chunked array."""
        return self._data

    @classmethod
    def from_struct_list_storage(cls, struct_list_storage: StructListStorage) -> Self:  # type: ignore # noqa: F821
        """Construct from a StructListStorage object.

        Parameters
        ----------
        struct_list_storage : StructListStorage
            StructListStorage object.
        """
        # NOTE(review): validation is skipped here, presumably because the
        # source storage is expected to be valid already -- confirm.
        data = transpose_struct_list_chunked(struct_list_storage.data, validate=False)
        return cls(data)

    @classmethod
    def from_table_storage(cls, table_storage: TableStorage) -> Self:  # type: ignore # noqa: F821
        """Construct from a TableStorage object.

        The conversion goes through an intermediate StructListStorage.

        Parameters
        ----------
        table_storage : TableStorage
            TableStorage object.
        """
        # Imported locally; at module level this name is only imported
        # under TYPE_CHECKING (presumably to avoid a circular import).
        from nested_pandas.series._storage import StructListStorage

        struct_list_storage = StructListStorage.from_table_storage(table_storage)
        return cls.from_struct_list_storage(struct_list_storage)

    def __len__(self) -> int:
        return len(self._data)

    def __eq__(self, other: Any) -> bool:
        # Objects of a different class never compare equal.
        if not isinstance(other, type(self)):
            return False
        return self._data == other._data

    @property
    def nbytes(self) -> int:
        """Number of bytes consumed by the data in memory."""
        return self._data.nbytes

    @property
    def type(self) -> pa.ListType:
        """Pyarrow type of the underlying array."""
        return self._data.type

    @property
    def num_chunks(self) -> int:
        """Number of chunks in the underlying array."""
        return self._data.num_chunks
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from __future__ import annotations # Python 3.9 requires it for X | Y type hints
2+
3+
from collections.abc import Iterator
4+
from typing import TYPE_CHECKING
5+
6+
import pyarrow as pa
7+
8+
from nested_pandas.series.utils import (
9+
table_to_struct_array,
10+
transpose_list_struct_chunked,
11+
validate_struct_list_array_for_equal_lengths,
12+
)
13+
14+
if TYPE_CHECKING:
15+
from nested_pandas.series._storage.list_struct_storage import ListStructStorage
16+
from nested_pandas.series._storage.table_storage import TableStorage
17+
18+
19+
class StructListStorage:
    """Keep nested data in the form of a PyArrow struct-list array.

    Parameters
    ----------
    array : pa.StructArray or pa.ChunkedArray
        Pyarrow struct-array whose fields are all list-arrays.
        The list-values must be "aligned", i.e. have the same length
        within each struct-value.
    validate : bool (default True)
        Check that all the lists have the same lengths for each struct-value.

    Raises
    ------
    ValueError
        If `array` is neither a pa.StructArray nor a pa.ChunkedArray.
    """

    _data: pa.ChunkedArray

    def __init__(self, array: pa.StructArray | pa.ChunkedArray, *, validate: bool = True) -> None:
        # A bare struct-array becomes a single-chunk chunked array.
        chunked = pa.chunked_array([array]) if isinstance(array, pa.StructArray) else array
        if not isinstance(chunked, pa.ChunkedArray):
            raise ValueError("array must be a StructArray or ChunkedArray")

        if validate:
            # The length check is applied to every chunk separately.
            for struct_chunk in chunked.chunks:
                validate_struct_list_array_for_equal_lengths(struct_chunk)

        self._data = chunked

    @property
    def data(self) -> pa.ChunkedArray:
        """Underlying pyarrow chunked array."""
        return self._data

    @classmethod
    def from_list_struct_storage(cls, list_struct_storage: ListStructStorage) -> Self:  # type: ignore # noqa: F821
        """Build the storage out of a ListStructStorage object.

        Parameters
        ----------
        list_struct_storage : ListStructStorage
            ListStructStorage object.
        """
        # The transposed array is passed on without the length validation.
        return cls(transpose_list_struct_chunked(list_struct_storage.data), validate=False)

    @classmethod
    def from_table_storage(cls, table_storage: TableStorage) -> Self:  # type: ignore # noqa: F821
        """Build the storage out of a TableStorage object.

        Parameters
        ----------
        table_storage : TableStorage
            TableStorage object.
        """
        # The converted table is passed on without the length validation.
        return cls(table_to_struct_array(table_storage.data), validate=False)

    def __iter__(self) -> Iterator[pa.StructScalar]:
        """Iterate over the elements of the underlying chunked array."""
        return iter(self._data)
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from __future__ import annotations # Python 3.9 requires it for X | Y type hints
2+
3+
from typing import TYPE_CHECKING
4+
5+
import pyarrow as pa
6+
7+
from nested_pandas.series.utils import table_to_struct_array, validate_struct_list_array_for_equal_lengths
8+
9+
if TYPE_CHECKING:
10+
from nested_pandas.series._storage.list_struct_storage import ListStructStorage
11+
from nested_pandas.series._storage.struct_list_storage import StructListStorage
12+
13+
14+
class TableStorage:
    """Keep nested data in the form of a PyArrow table with list-columns.

    Parameters
    ----------
    table : pa.Table
        PyArrow table, all columns must be list-columns.
        The list-values must be "aligned", i.e. have the same length
        within each row.
    validate : bool (default True)
        Check that all the lists have the same lengths for each row.
    """

    _data: pa.Table

    def __init__(self, table: pa.Table, validate: bool = True) -> None:
        if validate:
            # The table is converted to a struct array, and the length
            # check is applied to each of its chunks.
            for struct_chunk in table_to_struct_array(table).iterchunks():
                validate_struct_list_array_for_equal_lengths(struct_chunk)

        self._data = table

    @property
    def data(self) -> pa.Table:
        """Underlying pyarrow table."""
        return self._data

    @classmethod
    def from_list_struct_storage(cls, list_storage: ListStructStorage) -> Self:  # type: ignore # noqa: F821
        """Build the storage out of a ListStructStorage object.

        The conversion goes through an intermediate StructListStorage.

        Parameters
        ----------
        list_storage : ListStructStorage
            ListStructStorage object.
        """
        # Imported locally; at module level this name is only imported
        # under TYPE_CHECKING (presumably to avoid a circular import).
        from nested_pandas.series._storage import StructListStorage

        return cls.from_struct_list_storage(StructListStorage.from_list_struct_storage(list_storage))

    @classmethod
    def from_struct_list_storage(cls, struct_list_storage: StructListStorage) -> Self:  # type: ignore # noqa: F821
        """Build the storage out of a StructListStorage object.

        Parameters
        ----------
        struct_list_storage : StructListStorage
            StructListStorage object.
        """
        # The resulting table is passed on without the length validation.
        return cls(pa.Table.from_struct_array(struct_list_storage.data), validate=False)

src/nested_pandas/series/accessor.py

Lines changed: 7 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from nested_pandas.series.dtype import NestedDtype
1515
from nested_pandas.series.packer import pack_sorted_df_into_struct
16+
from nested_pandas.series.utils import nested_types_mapper
1617

1718
__all__ = ["NestSeriesAccessor"]
1819

@@ -70,25 +71,10 @@ def to_lists(self, fields: list[str] | None = None) -> pd.DataFrame:
7071
if len(fields) == 0:
7172
raise ValueError("Cannot convert a struct with no fields to lists")
7273

73-
list_chunks = defaultdict(list)
74-
for chunk in self._series.array._chunked_array.iterchunks():
75-
struct_array = cast(pa.StructArray, chunk)
76-
for field in fields:
77-
list_array = cast(pa.ListArray, struct_array.field(field))
78-
list_chunks[field].append(list_array)
74+
list_df = self._series.array.pa_table.select(fields).to_pandas(types_mapper=nested_types_mapper)
75+
list_df.index = self._series.index
7976

80-
list_series = {}
81-
for field, chunks in list_chunks.items():
82-
chunked_array = pa.chunked_array(chunks)
83-
list_series[field] = pd.Series(
84-
chunked_array,
85-
dtype=pd.ArrowDtype(chunked_array.type),
86-
index=self._series.index,
87-
name=field,
88-
copy=False,
89-
)
90-
91-
return pd.DataFrame(list_series)
77+
return list_df
9278

9379
def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
9480
"""Convert nested series into dataframe of flat arrays
@@ -130,7 +116,7 @@ def to_flat(self, fields: list[str] | None = None) -> pd.DataFrame:
130116
index = pd.Series(self.get_flat_index(), name=self._series.index.name)
131117

132118
flat_chunks = defaultdict(list)
133-
for chunk in self._series.array._chunked_array.iterchunks():
119+
for chunk in self._series.array.struct_array.iterchunks():
134120
struct_array = cast(pa.StructArray, chunk)
135121
for field in fields:
136122
list_array = cast(pa.ListArray, struct_array.field(field))
@@ -439,7 +425,7 @@ def get_flat_series(self, field: str) -> pd.Series:
439425
"""
440426

441427
flat_chunks = []
442-
for nested_chunk in self._series.array._chunked_array.iterchunks():
428+
for nested_chunk in self._series.array.struct_array.iterchunks():
443429
struct_array = cast(pa.StructArray, nested_chunk)
444430
list_array = cast(pa.ListArray, struct_array.field(field))
445431
flat_array = list_array.flatten()
@@ -483,12 +469,7 @@ def get_list_series(self, field: str) -> pd.Series:
483469
Name: flux, dtype: list<item: double>[pyarrow]
484470
"""
485471

486-
list_chunks = []
487-
for nested_chunk in self._series.array._chunked_array.iterchunks():
488-
struct_array = cast(pa.StructArray, nested_chunk)
489-
list_array = struct_array.field(field)
490-
list_chunks.append(list_array)
491-
list_chunked_array = pa.chunked_array(list_chunks)
472+
list_chunked_array = self._series.array.pa_table[field]
492473
return pd.Series(
493474
list_chunked_array,
494475
dtype=pd.ArrowDtype(list_chunked_array.type),

src/nested_pandas/series/dtype.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,12 @@ def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray:
154154
pyarrow_dtype: pa.StructType
155155

156156
def __init__(self, pyarrow_dtype: pa.DataType) -> None:
157-
self.pyarrow_dtype, self.list_struct_pyarrow_dtype = self._validate_dtype(pyarrow_dtype)
157+
self.pyarrow_dtype, self.list_struct_pa_dtype = self._validate_dtype(pyarrow_dtype)
158+
159+
@property
160+
def struct_list_pa_dtype(self) -> pa.StructType:
161+
"""Struct-list pyarrow type representing the nested type."""
162+
return self.pyarrow_dtype
158163

159164
@classmethod
160165
def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self: # type: ignore[name-defined] # noqa: F821
@@ -261,5 +266,5 @@ def to_pandas_arrow_dtype(self, list_struct: bool = False) -> ArrowDtype:
261266
The corresponding pandas.ArrowDtype.
262267
"""
263268
if list_struct:
264-
return ArrowDtype(self.list_struct_pyarrow_dtype)
269+
return ArrowDtype(self.list_struct_pa_dtype)
265270
return ArrowDtype(self.pyarrow_dtype)

0 commit comments

Comments
 (0)