Skip to content

Commit cf6d152

Browse files
authored
Merge pull request #331 from lincc-frameworks/init_nestedseries
NestedSeries Implementation
2 parents 34cd4d6 + adc5874 commit cf6d152

File tree

9 files changed

+461
-100
lines changed

9 files changed

+461
-100
lines changed

src/nested_pandas/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# Import for registering
66
from .series.accessor import NestSeriesAccessor # noqa: F401
77
from .series.dtype import NestedDtype
8+
from .series.nestedseries import NestedSeries
89

910

10-
__all__ = ["NestedDtype", "NestedFrame", "read_parquet"]
11+
__all__ = ["NestedDtype", "NestedFrame", "read_parquet", "NestedSeries"]

src/nested_pandas/nestedframe/core.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
_subexprs_by_nest,
2323
)
2424
from nested_pandas.series.dtype import NestedDtype
25+
from nested_pandas.series.nestedseries import NestedSeries
2526
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct
2627
from nested_pandas.series.utils import is_pa_type_a_list
2728

@@ -226,6 +227,8 @@ def __getitem__(self, item):
226227
return super().__getitem__(item)
227228

228229
def _getitem_str(self, item):
230+
if item in self.nested_columns:
231+
return NestedSeries(super().__getitem__(item))
229232
# Preempt the nested check if the item is a base column, with or without
230233
# dots and backticks.
231234
if item in self.columns:

src/nested_pandas/series/accessor.py

Lines changed: 62 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
from numpy.typing import ArrayLike
1111
from pandas.api.extensions import register_series_accessor
1212

13-
from nested_pandas.nestedframe.core import NestedFrame
1413
from nested_pandas.series.dtype import NestedDtype
14+
from nested_pandas.series.nestedseries import NestedSeries
1515
from nested_pandas.series.packer import pack_flat, pack_sorted_df_into_struct
1616
from nested_pandas.series.utils import nested_types_mapper
1717

@@ -152,7 +152,7 @@ def fields(self) -> list[str]:
152152
"""Names of the nested columns"""
153153
return self._series.array.field_names
154154

155-
def with_field(self, field: str, value: ArrayLike) -> pd.Series:
155+
def with_field(self, field: str, value: ArrayLike) -> NestedSeries:
156156
"""Set the field from flat-array of values and return a new series
157157
158158
It is an alias for `.nest.with_flat_field`.
@@ -167,7 +167,7 @@ def with_field(self, field: str, value: ArrayLike) -> pd.Series:
167167
168168
Returns
169169
-------
170-
pd.Series
170+
NestedSeries
171171
The new series with the field set.
172172
173173
Examples
@@ -185,7 +185,7 @@ def with_field(self, field: str, value: ArrayLike) -> pd.Series:
185185
"""
186186
return self.with_flat_field(field, value)
187187

188-
def with_flat_field(self, field: str, value: ArrayLike) -> pd.Series:
188+
def with_flat_field(self, field: str, value: ArrayLike) -> NestedSeries:
189189
"""Set the field from flat-array of values and return a new series
190190
191191
Parameters
@@ -198,7 +198,7 @@ def with_flat_field(self, field: str, value: ArrayLike) -> pd.Series:
198198
199199
Returns
200200
-------
201-
pd.Series
201+
NestedSeries
202202
The new series with the field set.
203203
204204
Examples
@@ -217,9 +217,9 @@ def with_flat_field(self, field: str, value: ArrayLike) -> pd.Series:
217217
"""
218218
new_array = self._series.array.copy()
219219
new_array.set_flat_field(field, value)
220-
return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)
220+
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)
221221

222-
def with_list_field(self, field: str, value: ArrayLike) -> pd.Series:
222+
def with_list_field(self, field: str, value: ArrayLike) -> NestedSeries:
223223
"""Set the field from list-array of values and return a new series
224224
225225
Parameters
@@ -232,7 +232,7 @@ def with_list_field(self, field: str, value: ArrayLike) -> pd.Series:
232232
233233
Returns
234234
-------
235-
pd.Series
235+
NestedSeries
236236
The new series with the field set.
237237
238238
Examples
@@ -253,9 +253,9 @@ def with_list_field(self, field: str, value: ArrayLike) -> pd.Series:
253253
"""
254254
new_array = self._series.array.copy()
255255
new_array.set_list_field(field, value)
256-
return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)
256+
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)
257257

258-
def with_filled_field(self, field: str, value: ArrayLike) -> pd.Series:
258+
def with_filled_field(self, field: str, value: ArrayLike) -> NestedSeries:
259259
"""Set the field by repeating values and return a new series
260260
261261
The input value array must have as many elements as the Series,
@@ -273,7 +273,7 @@ def with_filled_field(self, field: str, value: ArrayLike) -> pd.Series:
273273
274274
Returns
275275
-------
276-
pd.Series
276+
NestedSeries
277277
The new series with the field set.
278278
279279
Examples
@@ -292,9 +292,9 @@ def with_filled_field(self, field: str, value: ArrayLike) -> pd.Series:
292292
"""
293293
new_array = self._series.array.copy()
294294
new_array.fill_field_lists(field, value)
295-
return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)
295+
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)
296296

297-
def without_field(self, field: str | list[str]) -> pd.Series:
297+
def without_field(self, field: str | list[str]) -> NestedSeries:
298298
"""Remove the field(s) from the series and return a new series
299299
300300
Note that at least one field must be left in the series.
@@ -306,7 +306,7 @@ def without_field(self, field: str | list[str]) -> pd.Series:
306306
307307
Returns
308308
-------
309-
pd.Series
309+
NestedSeries
310310
The new series without the field(s).
311311
312312
Examples
@@ -328,9 +328,9 @@ def without_field(self, field: str | list[str]) -> pd.Series:
328328

329329
new_array = self._series.array.copy()
330330
new_array.pop_fields(field)
331-
return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name)
331+
return NestedSeries(new_array, copy=False, index=self._series.index, name=self._series.name)
332332

333-
def query_flat(self, query: str) -> pd.Series:
333+
def query_flat(self, query: str) -> NestedSeries:
334334
"""Query the flat arrays with a boolean expression
335335
336336
Currently, it will remove empty rows from the output series.
@@ -343,7 +343,7 @@ def query_flat(self, query: str) -> pd.Series:
343343
344344
Returns
345345
-------
346-
pd.Series
346+
NestedSeries
347347
The filtered series.
348348
349349
Examples
@@ -363,8 +363,10 @@ def query_flat(self, query: str) -> pd.Series:
363363
flat = self.to_flat().query(query)
364364

365365
if len(flat) == 0:
366-
return pd.Series(
367-
[], dtype=self._series.dtype, index=pd.Index([], dtype=flat.index.dtype, name=flat.index.name)
366+
return NestedSeries(
367+
[],
368+
dtype=self._series.dtype,
369+
index=pd.Index([], dtype=flat.index.dtype, name=flat.index.name),
368370
)
369371
return pack_sorted_df_into_struct(flat)
370372

@@ -393,7 +395,7 @@ def get_flat_index(self) -> pd.Index:
393395
return flat_index
394396

395397
def get_flat_series(self, field: str) -> pd.Series:
396-
"""Get the flat-array field as a Series
398+
"""Get the flat-array field as a pd.Series
397399
398400
Parameters
399401
----------
@@ -434,13 +436,16 @@ def get_flat_series(self, field: str) -> pd.Series:
434436

435437
flat_chunked_array = pa.chunked_array(flat_chunks, type=self._series.dtype.fields[field])
436438

437-
return pd.Series(
439+
flat_series = pd.Series(
438440
flat_chunked_array,
439441
dtype=self._series.dtype.field_dtype(field),
440442
index=self.get_flat_index(),
441443
name=field,
442444
copy=False,
443445
)
446+
if isinstance(self._series.dtype.field_dtype(field), NestedDtype):
447+
return NestedSeries(flat_series, copy=False)
448+
return flat_series
444449

445450
def get_list_series(self, field: str) -> pd.Series:
446451
"""Get the list-array field as a Series
@@ -479,20 +484,26 @@ def get_list_series(self, field: str) -> pd.Series:
479484
copy=False,
480485
)
481486

482-
def __getitem__(self, key: str | list[str]) -> pd.Series:
487+
def __getitem__(self, key: str | list[str]) -> NestedSeries:
483488
# Allow boolean masking given a Series of booleans
484489
if isinstance(key, pd.Series) and pd.api.types.is_bool_dtype(key.dtype):
485490
flat_df = self.to_flat() # Use the flat representation
486491
if not key.index.equals(flat_df.index):
487492
raise ValueError("Boolean mask must have the same index as the flattened nested dataframe.")
488-
# Apply the mask to the series, return a new NestedFrame
489-
return NestedFrame(index=self._series.index).add_nested(flat_df[key], name=self._series.name)
493+
# Apply the mask to the series
494+
return NestedSeries(
495+
pack_flat(flat_df[key]),
496+
index=self._series.index,
497+
name=self._series.name,
498+
)
490499

491-
# If the key is a single string, return the flat series for that field
500+
# A list of fields returns a NestedSeries holding a view of only the
501+
# requested fields
492502
if isinstance(key, list):
493503
new_array = self._series.array.view_fields(key)
494-
return pd.Series(new_array, index=self._series.index, name=self._series.name)
504+
return NestedSeries(new_array, index=self._series.index, name=self._series.name)
495505

506+
# If the key is a single string, return the flat series for that field
496507
return self.get_flat_series(key)
497508

498509
def __setitem__(self, key: str, value: ArrayLike) -> None:
@@ -551,8 +562,8 @@ def clear(self) -> None:
551562
"""
552563
raise NotImplementedError("Cannot delete fields from nested series")
553564

554-
def to_flatten_inner(self, field: str) -> pd.Series:
555-
"""Explode the nested inner field and return as a pd.Series
565+
def to_flatten_inner(self, field: str) -> NestedSeries:
566+
"""Explode the nested inner field and return as a NestedSeries
556567
557568
Works for the case of multiple nesting only, the field must represent
558569
a nested series.
@@ -576,7 +587,7 @@ def to_flatten_inner(self, field: str) -> pd.Series:
576587
577588
Returns
578589
-------
579-
pd.Series
590+
NestedSeries
580591
This series object, but with the inner field exploded.
581592
582593
Examples
@@ -585,17 +596,18 @@ def to_flatten_inner(self, field: str) -> pd.Series:
585596
>>> from nested_pandas import NestedFrame
586597
>>> from nested_pandas.datasets import generate_data
587598
>>> nf = generate_data(5, 2, seed=1).rename(columns={"nested": "inner"})
599+
>>> nf["b"] = "b" # Shorten width of example output
588600
589601
Assign a repeated ID to double-nest on
590602
591603
>>> nf["id"] = [0, 0, 0, 1, 1]
592604
>>> nf
593-
a b inner id
594-
0 0.417022 0.184677 [{t: 8.38389, flux: 80.074457, band: 'r'}; …] ... 0
595-
1 0.720324 0.372520 [{t: 13.70439, flux: 96.826158, band: 'g'}; …]... 0
596-
2 0.000114 0.691121 [{t: 4.089045, flux: 31.342418, band: 'g'}; …]... 0
597-
3 0.302333 0.793535 [{t: 17.562349, flux: 69.232262, band: 'r'}; …... 1
598-
4 0.146756 1.077633 [{t: 0.547752, flux: 87.638915, band: 'g'}; …]... 1
605+
a b inner id
606+
0 0.417022 b [{t: 8.38389, flux: 80.074457, band: 'r'}; …] ... 0
607+
1 0.720324 b [{t: 13.70439, flux: 96.826158, band: 'g'}; …]... 0
608+
2 0.000114 b [{t: 4.089045, flux: 31.342418, band: 'g'}; …]... 0
609+
3 0.302333 b [{t: 17.562349, flux: 69.232262, band: 'r'}; …... 1
610+
4 0.146756 b [{t: 0.547752, flux: 87.638915, band: 'g'}; …]... 1
599611
600612
>>> nf.inner.nest.to_flat()
601613
t flux band
@@ -620,23 +632,23 @@ def to_flatten_inner(self, field: str) -> pd.Series:
620632
>>> concated_nf_series = dnf["outer"].nest.to_flatten_inner("inner")
621633
>>> concated_nf_series
622634
id
623-
0 [{a: 0.417022, b: 0.184677, t: 8.38389, flux: ...
624-
1 [{a: 0.302333, b: 0.793535, t: 17.562349, flux...
625-
Name: outer, dtype: nested<a: [double], b: [double], t: [double], flux: [double], band: [string]>
635+
0 [{a: 0.417022, b: 'b', t: 8.38389, flux: 80.07...
636+
1 [{a: 0.302333, b: 'b', t: 17.562349, flux: 69....
637+
Name: outer, dtype: nested<a: [double], b: [string], t: [double], flux: [double], band: [string]>
626638
627639
>>> concated_nf_series.nest.to_flat() # doctest: +NORMALIZE_WHITESPACE
628-
a b t flux band
640+
a b t flux band
629641
id
630-
0 0.417022 0.184677 8.38389 80.074457 r
631-
0 0.417022 0.184677 13.40935 89.460666 g
632-
0 0.720324 0.37252 13.70439 96.826158 g
633-
0 0.720324 0.37252 8.346096 8.504421 g
634-
0 0.000114 0.691121 4.089045 31.342418 g
635-
0 0.000114 0.691121 11.173797 3.905478 g
636-
1 0.302333 0.793535 17.562349 69.232262 r
637-
1 0.302333 0.793535 2.807739 16.983042 r
638-
1 0.146756 1.077633 0.547752 87.638915 g
639-
1 0.146756 1.077633 3.96203 87.81425 r
642+
0 0.417022 b 8.38389 80.074457 r
643+
0 0.417022 b 13.40935 89.460666 g
644+
0 0.720324 b 13.70439 96.826158 g
645+
0 0.720324 b 8.346096 8.504421 g
646+
0 0.000114 b 4.089045 31.342418 g
647+
0 0.000114 b 11.173797 3.905478 g
648+
1 0.302333 b 17.562349 69.232262 r
649+
1 0.302333 b 2.807739 16.983042 r
650+
1 0.146756 b 0.547752 87.638915 g
651+
1 0.146756 b 3.96203 87.81425 r
640652
"""
641653
if not isinstance(self._series.dtype.field_dtype(field), NestedDtype):
642654
raise ValueError(
@@ -669,7 +681,7 @@ def to_flatten_inner(self, field: str) -> pd.Series:
669681

670682
# Some indexes may be missed if the original series had some NULLs
671683
if len(result) < len(series):
672-
nulls = pd.Series(None, index=series.index, dtype=result.dtype)
684+
nulls = NestedSeries(None, index=series.index, dtype=result.dtype)
673685
nulls[result.index] = result
674686
result = nulls
675687

0 commit comments

Comments
 (0)