Skip to content

Commit 7f82094

Browse files
committed
struct-list transpose functions
1 parent a416b6b commit 7f82094

File tree

1 file changed

+149
-1
lines changed

1 file changed

+149
-1
lines changed

src/nested_pandas/series/utils.py

Lines changed: 149 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
from __future__ import annotations # Python 3.9 requires it for X | Y type hints
2+
13
from collections.abc import Generator
4+
from typing import cast
25

36
import pyarrow as pa
47

58

6-
def is_pa_type_a_list(pa_type: type[pa.Array]) -> bool:
9+
def is_pa_type_a_list(pa_type: pa.DataType) -> bool:
710
"""Check if the given pyarrow type is a list type.
811
912
I.e. one of the following types: ListArray, LargeListArray,
@@ -39,3 +42,148 @@ def enumerate_chunks(array: pa.ChunkedArray) -> Generator[tuple[slice, pa.Array]
3942
index_stop = index_start + len(chunk)
4043
yield slice(index_start, index_stop), chunk
4144
index_start = index_stop
45+
46+
47+
def validate_struct_list_array_for_equal_lengths(array: pa.StructArray) -> None:
    """Ensure every field of a struct array is a list array with identical offsets.

    The check is done on the list offsets: the offsets of the first field
    are compared with the offsets of every following field.

    Parameters
    ----------
    array : pa.StructArray
        Struct array to check.

    Raises
    ------
    ValueError
        If the input is not of a struct type, if any of its fields is not
        of a list type, or if the offsets of the fields are not all equal.
    """
    struct_type = array.type
    if not pa.types.is_struct(struct_type):
        raise ValueError(f"Expected a StructArray, got {struct_type}")

    reference: pa.ListArray | None = None
    for struct_field in struct_type:
        child = array.field(struct_field.name)
        child_type = child.type
        if not is_pa_type_a_list(child_type):
            raise ValueError(f"Expected a ListArray, got {child_type}")
        candidate = cast(pa.ListArray, child)

        if reference is None:
            # The first field is the reference every other field is compared to
            reference = candidate
        elif not reference.offsets.equals(candidate.offsets):
            raise ValueError("Offsets of all ListArrays must be the same")
77+
78+
79+
def transpose_struct_list_type(t: pa.StructType) -> pa.ListType:
    """Build the list-struct type corresponding to a struct-list type.

    Every field of the input struct must be of a list type. The result is
    a ``pa.list_`` of a struct with the same field names, where each field
    has the value type of the corresponding input list.

    Parameters
    ----------
    t : pa.StructType
        Input type of struct-list array.

    Returns
    -------
    pa.ListType
        Type of list-struct array.

    Raises
    ------
    ValueError
        If the input type is not a struct type or any of its fields is not
        of a list type.
    """
    if not pa.types.is_struct(t):
        raise ValueError(f"Expected a StructType, got {t}")

    item_fields = []
    for struct_field in t:
        field_type = struct_field.type
        if not is_pa_type_a_list(field_type):
            raise ValueError(f"Expected a ListType, got {field_type}")
        # Unwrap the list: the struct inside the output list holds the item types
        item_type = cast(pa.ListType, field_type).value_type
        item_fields.append(pa.field(struct_field.name, item_type))

    return cast(pa.ListType, pa.list_(pa.struct(item_fields)))
109+
110+
111+
def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) -> pa.ListArray:
    """Turn a struct array of lists into a list array of structs.

    Parameters
    ----------
    array : pa.StructArray
        Input struct array, each scalar must have lists of equal length.
    validate : bool, default True
        If True, run ``validate_struct_list_array_for_equal_lengths`` on
        the input first; it raises ValueError if something is wrong.

    Returns
    -------
    pa.ListArray
        List array of structs.
    """
    if validate:
        validate_struct_list_array_for_equal_lengths(array)

    # All fields are expected to share the same offsets, so the offsets of
    # the first field are used for the output list array.
    shared_offsets = array.field(0).offsets

    flat_children = [child.values for child in array.flatten()]
    flat_struct = pa.StructArray.from_arrays(flat_children, names=array.type.names)

    return pa.ListArray.from_arrays(shared_offsets, flat_struct)
137+
138+
139+
def transpose_list_struct_type(t: pa.ListType) -> pa.StructType:
    """Converts a type of list-struct array into a type of struct-list array.

    Parameters
    ----------
    t : pa.ListType
        Input type of list-struct array.

    Returns
    -------
    pa.StructType
        Type of struct-list array: a struct with the same field names as
        the input value type, where each field type is wrapped with
        ``pa.list_``.

    Raises
    ------
    ValueError
        If the input type is not a list type, or if its value type is not
        a struct type.
    """
    if not is_pa_type_a_list(t):
        raise ValueError(f"Expected a ListType, got {t}")

    value_type = t.value_type
    # `cast` below is a no-op at runtime, so the value type must be checked
    # explicitly to get the documented ValueError for a list of non-structs
    # instead of a failure while iterating over its fields.
    if not pa.types.is_struct(value_type):
        raise ValueError(f"Expected a StructType as a list value type, got {value_type}")

    struct_type = cast(pa.StructType, value_type)
    fields = [pa.field(field.name, pa.list_(field.type)) for field in struct_type]
    return cast(pa.StructType, pa.struct(fields))
167+
168+
169+
def transpose_list_struct_array(array: pa.ListArray) -> pa.StructArray:
    """Turn a list array of structs into a struct array of lists.

    Each field of the flat struct values is wrapped into a list array
    using the offsets of the input, and the resulting list arrays are
    combined into a struct array with the original field names.

    Parameters
    ----------
    array : pa.ListArray
        Input list array of structs.

    Returns
    -------
    pa.StructArray
        Struct array of lists.
    """
    list_offsets = array.offsets
    flat_structs = array.values

    # One list array per struct field, all of them built on the same offsets
    per_field_lists = [
        pa.ListArray.from_arrays(list_offsets, column) for column in flat_structs.flatten()
    ]

    field_names = array.type.value_type.names
    return pa.StructArray.from_arrays(per_field_lists, names=field_names)

0 commit comments

Comments
 (0)