Skip to content

Commit fe45496

Browse files
authored
Merge pull request #169 from lincc-frameworks/add_nested_tests
Tests for using the 'on' keyword with add_nested and packing
2 parents a6b83d8 + efcadff commit fe45496

File tree

5 files changed

+249
-24
lines changed

5 files changed

+249
-24
lines changed

src/nested_pandas/nestedframe/core.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from pandas._libs import lib
1111
from pandas._typing import Any, AnyAll, Axis, IndexLabel
1212
from pandas.api.extensions import no_default
13-
from pandas.api.types import is_bool_dtype
1413
from pandas.core.computation.expr import PARSERS, PandasExprVisitor
1514

1615
from nested_pandas.nestedframe.utils import extract_nest_names
@@ -271,8 +270,8 @@ def add_nested(
271270
index, and sort it lexicographically.
272271
- inner: form intersection of calling frame's index with other
273272
frame's index, preserving the order of the calling index.
274-
on : str, list of str, default: None
275-
Columns to join on.
273+
on : str, default: None
274+
Name of a single column to join on; a list of columns is not currently supported.
276275
dtype : dtype or None
277276
NestedDtype to use for the nested column; pd.ArrowDtype or
278277
pa.DataType can also be used to specify the nested dtype. If None,
@@ -283,10 +282,13 @@ def add_nested(
283282
NestedFrame
284283
A new NestedFrame with the added nested column.
285284
"""
285+
if on is not None and not isinstance(on, str):
286+
raise ValueError("Currently we only support a single column for 'on'")
286287
# Add sources to objects
287288
packed = pack(obj, name=name, on=on, dtype=dtype)
288289
new_df = self.copy()
289-
return new_df.join(packed, how=how)
290+
res = new_df.join(packed, how=how, on=on)
291+
return res
290292

291293
@classmethod
292294
def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
@@ -519,14 +521,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
519521
# to the nest and repack. Otherwise, apply it to this instance as usual,
520522
# since it operated on the base attributes.
521523
if isinstance(result, _SeriesFromNest):
522-
if not is_bool_dtype(result.dtype):
523-
raise ValueError("Query condition must evaluate to a boolean Series")
524-
525524
nest_name, flat_nest = result.nest_name, result.flat_nest
526-
527525
# Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
528-
flat_nest = flat_nest.set_index(self[nest_name].array.list_index)
529-
query_result = result.set_axis(self[nest_name].array.list_index)
526+
list_index = self[nest_name].array.get_list_index()
527+
flat_nest = flat_nest.set_index(list_index)
528+
query_result = result.set_axis(list_index)
530529
# Selecting flat values matching the query result
531530
new_flat_nest = flat_nest[query_result]
532531
new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
@@ -679,7 +678,7 @@ def dropna(
679678
if subset is not None:
680679
subset = [col.split(".")[-1] for col in subset]
681680
target_flat = self[target].nest.to_flat()
682-
target_flat = target_flat.set_index(self[target].array.list_index)
681+
target_flat = target_flat.set_index(self[target].array.get_list_index())
683682
if inplace:
684683
target_flat.dropna(
685684
axis=axis,

src/nested_pandas/series/ext_array.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -648,9 +648,11 @@ def num_chunks(self) -> int:
648648
"""Number of chunks in underlying pyarrow.ChunkedArray"""
649649
return self._chunked_array.num_chunks
650650

651-
@property
652-
def list_index(self) -> np.ndarray:
651+
def get_list_index(self) -> np.ndarray:
653652
"""Keys mapping values to lists"""
653+
if len(self) == 0:
654+
# Since we have no list offsets, return an empty array
655+
return np.array([], dtype=int)
654656
list_index = np.arange(len(self))
655657
return np.repeat(list_index, np.diff(self.list_offsets))
656658

tests/nested_pandas/nestedframe/test_nestedframe.py

Lines changed: 115 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
import pandas as pd
33
import pyarrow as pa
44
import pytest
5+
from pandas.testing import assert_frame_equal
6+
57
from nested_pandas import NestedFrame
68
from nested_pandas.datasets import generate_data
79
from nested_pandas.nestedframe.core import _SeriesFromNest
8-
from pandas.testing import assert_frame_equal
910

1011

1112
def test_nestedframe_construction():
@@ -187,10 +188,16 @@ def test_add_nested_with_flat_df():
187188
def test_add_nested_with_flat_df_and_mismatched_index():
188189
"""Test add_nested when index values of base are missing matches in nested"""
189190

190-
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
191+
base = NestedFrame(
192+
data={"a": [1, 2, 3], "b": [2, 4, 6], "new_index": [0, 1, 3] }, index=[0, 1, 2])
191193

192194
nested = pd.DataFrame(
193-
data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
195+
data={
196+
"c": [0, 2, 4, 1, 4, 3, 1, 4, 1],
197+
"d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
198+
# A column we can have as an alternative joining index with 'on'
199+
"new_index": [1, 1, 1, 1, 2, 2, 5, 5, 5],
200+
},
194201
# no data for base index value of "2" and introduces new index value "4"
195202
index=[0, 0, 0, 1, 1, 1, 1, 4, 4],
196203
)
@@ -212,6 +219,35 @@ def test_add_nested_with_flat_df_and_mismatched_index():
212219
default_res = base.add_nested(nested, "nested")
213220
assert_frame_equal(left_res, default_res)
214221

222+
# Test still adding the nested frame in a "left" fashion but on the "new_index" column
223+
224+
# We currently don't support a list of columns for the 'on' argument
225+
with pytest.raises(ValueError):
226+
left_res_on = base.add_nested(nested, "nested", how="left", on=["new_index"])
227+
# Instead we should pass a single column name, "new_index" which exists in both frames.
228+
left_res_on = base.add_nested(nested, "nested", how="left", on="new_index")
229+
assert "nested" in left_res_on.columns
230+
# Check that the index of the base layer is still being used
231+
assert (left_res_on.index == base.index).all()
232+
# Assert that the new_index column we joined on was dropped from the nested layer
233+
# but is present in the base layer
234+
assert "new_index" in left_res_on.columns
235+
assert "new_index" not in left_res_on["nested"].nest.to_flat().columns
236+
237+
# For each index in the columns we joined on, check that values are aligned correctly
238+
for i in range(len(left_res_on.new_index)):
239+
# The actual "index" value we "joined" on.
240+
join_idx = left_res_on.new_index.iloc[i]
241+
# Check that the nested column is aligned correctly to the base layer
242+
if join_idx in nested["new_index"].values:
243+
assert left_res_on.iloc[i]["nested"] is not None
244+
# Check that it is present in the new index we constructed for the nested layer
245+
assert join_idx in left_res_on["nested"].nest.to_flat().index
246+
else:
247+
# Use an iloc
248+
assert left_res_on.iloc[i]["nested"] is None
249+
assert join_idx not in left_res_on["nested"].nest.to_flat().index
250+
215251
# Test adding the nested frame in a "right" fashion, where the index of the "right"
216252
# frame (our nested layer) is preserved
217253
right_res = base.add_nested(nested, "nested", how="right")
@@ -235,6 +271,35 @@ def test_add_nested_with_flat_df_and_mismatched_index():
235271
else:
236272
assert not pd.isna(right_res.loc[idx][col])
237273

274+
# Test still adding the nested frame in a "right" fashion but on the "new_index" column
275+
right_res_on = base.add_nested(nested, "nested", how="right", on="new_index")
276+
assert "nested" in right_res_on.columns
277+
# Check that rows were dropped if the base layer's "new_index" value is not present
278+
# in the "right" nested layer
279+
assert (right_res_on.new_index.values == np.unique(nested.new_index.values)).all()
280+
281+
# Check that the new_index column we joined on was dropped from the nested layer
282+
assert "new_index" not in right_res_on["nested"].nest.to_flat().columns
283+
# Check that the flattened nested layer has the same index as the original column we joined on
284+
all(right_res_on.nested.nest.to_flat().index.values == nested.new_index.values)
285+
286+
# For each index check that the base layer is aligned correctly to the nested layer
287+
for i in range(len(right_res_on)):
288+
# The actual "index" value we "joined" on. Since it was a right join, guaranteed to
289+
# be in the "new_index" column of the original frame we wanted to nest
290+
join_idx = right_res_on.new_index.iloc[i]
291+
assert join_idx in nested["new_index"].values
292+
293+
# Check the values for each column in our "base" layer
294+
for col in base.columns:
295+
if col != "new_index":
296+
assert col in right_res_on.columns
297+
if join_idx not in base.new_index.values:
298+
# We expect a NaN value in the base layer due to the "right" join
299+
assert pd.isna(right_res_on.iloc[i][col])
300+
else:
301+
assert not pd.isna(right_res_on.iloc[i][col])
302+
238303
# Test the "outer" behavior
239304
outer_res = base.add_nested(nested, "nested", how="outer")
240305
assert "nested" in outer_res.columns
@@ -255,6 +320,38 @@ def test_add_nested_with_flat_df_and_mismatched_index():
255320
else:
256321
assert not pd.isna(outer_res.loc[idx][col])
257322

323+
# Test still adding the nested frame in an "outer" fashion but on the "new_index" column
324+
outer_res_on = base.add_nested(nested, "nested", how="outer", on="new_index")
325+
assert "nested" in outer_res_on.columns
326+
# We expect the result's new_index column to be the set union of the values of that column
327+
# in the base and nested frames
328+
assert set(outer_res_on.new_index) == set(base.new_index).union(set(nested.new_index))
329+
330+
# Check that the new_index column we joined on was dropped from the nested layer
331+
assert "new_index" not in outer_res_on["nested"].nest.to_flat().columns
332+
# Check that the flattened nested layer has the same index as the original column we joined on
333+
# Note that it does not have index values only present in the base layer since those empty rows
334+
# are dropped when we flatten the nested frame.
335+
all(outer_res_on.nested.nest.to_flat().index.values == nested.new_index.values)
336+
337+
for i in range(len(outer_res_on)):
338+
# The actual "index" value we "joined" on.
339+
join_idx = outer_res_on.new_index.iloc[i]
340+
# Check that the nested column is aligned correctly to the base layer
341+
if join_idx not in nested["new_index"].values:
342+
assert outer_res_on.iloc[i]["nested"] is None
343+
else:
344+
assert outer_res_on.iloc[i]["nested"] is not None
345+
# Check the values for each column in our "base" layer
346+
for col in base.columns:
347+
if col != "new_index":
348+
assert col in outer_res_on.columns
349+
if join_idx in base.new_index.values:
350+
# The join value exists in the base layer, so we expect a non-NaN value here
351+
assert not pd.isna(outer_res_on.iloc[i][col])
352+
else:
353+
assert pd.isna(outer_res_on.iloc[i][col])
354+
258355
# Test the "inner" behavior
259356
inner_res = base.add_nested(nested, "nested", how="inner")
260357
assert "nested" in inner_res.columns
@@ -268,6 +365,18 @@ def test_add_nested_with_flat_df_and_mismatched_index():
268365
assert col in inner_res.columns
269366
assert not pd.isna(inner_res.loc[idx][col])
270367

368+
# Test still adding the nested frame in an "inner" fashion but on the "new_index" column
369+
inner_res_on = base.add_nested(nested, "nested", how="inner", on="new_index")
370+
assert "nested" in inner_res_on.columns
371+
# We expect the new index to be the set intersection of the base and nested column we used
372+
# for the 'on' argument
373+
assert set(inner_res_on.new_index) == set(base.new_index).intersection(set(nested.new_index))
374+
# Check that the new_index column we joined on was dropped from the nested layer
375+
assert "new_index" not in right_res_on["nested"].nest.to_flat().columns
376+
377+
# Since we have confirmed that the "new_index" column was the intersection that we expected
378+
# we know that none of the joined values should be none
379+
assert not inner_res_on.isnull().values.any()
271380

272381
def test_add_nested_with_series():
273382
"""Test that add_nested correctly adds a nested column to the base df"""
@@ -433,7 +542,7 @@ def test_from_lists():
433542
def test_query():
434543
"""Test that NestedFrame.query handles nested queries correctly"""
435544

436-
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
545+
base = NestedFrame(data={"a": [1, 2, 2, 3], "b": [2, 3, 4, 6]}, index=[0, 1, 1, 2])
437546

438547
nested = pd.DataFrame(
439548
data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
@@ -455,10 +564,10 @@ def test_query():
455564

456565
# Test nested queries
457566
nest_queried = base.query("nested.c > 1")
458-
assert len(nest_queried.nested.nest.to_flat()) == 5
567+
assert len(nest_queried.nested.nest.to_flat()) == 7
459568

460569
nest_queried = base.query("(nested.c > 1) and (nested.d>2)")
461-
assert len(nest_queried.nested.nest.to_flat()) == 4
570+
assert len(nest_queried.nested.nest.to_flat()) == 5
462571

463572
# Check edge conditions
464573
with pytest.raises(ValueError):

tests/nested_pandas/series/test_accessor.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1-
import nested_pandas as npd
21
import numpy as np
32
import pandas as pd
43
import pyarrow as pa
54
import pytest
5+
from numpy.testing import assert_array_equal
6+
from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
7+
8+
import nested_pandas as npd
69
from nested_pandas import NestedDtype
710
from nested_pandas.series.ext_array import NestedExtensionArray
811
from nested_pandas.series.packer import pack_flat, pack_seq
9-
from numpy.testing import assert_array_equal
10-
from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
1112

1213

1314
def test_registered():
@@ -981,3 +982,27 @@ def test_values():
981982
series = pack_seq([{"a": [1, 2, 3], "b": [3, 2, 1]}, {"a": [4, None], "b": [7, 8]}])
982983
for value in series.nest.values():
983984
assert_series_equal(value, series.nest[value.name])
985+
986+
def test_get_list_index():
987+
"""Test that the get_list_index() method works."""
988+
# First check that an empty NestedSeries returns an empty list index.
989+
empty_struct_array = pa.StructArray.from_arrays(arrays=[],names=[])
990+
empty_series = pd.Series(empty_struct_array, dtype=NestedDtype(empty_struct_array.type), index=[])
991+
assert len(empty_series) == 0
992+
assert len(empty_series.array.get_list_index()) == 0
993+
994+
# Create a NestedType series
995+
struct_array = pa.StructArray.from_arrays(
996+
arrays=[
997+
pa.array([np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7])]),
998+
pa.array([np.array([7, 6, 4, 2]), np.array([0, 1, 2, 3])]),
999+
pa.array([np.array([8, 9, 1, 9]), np.array([0, 0, 2, 3])]),
1000+
],
1001+
names=["a", "b", "c"],
1002+
)
1003+
series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7])
1004+
1005+
# Validate the generation of a flat length ordinal array
1006+
list_index = series.array.get_list_index()
1007+
assert len(list_index) == series.nest.flat_length
1008+
assert np.equal(list_index, [0, 0, 0, 0, 1, 1, 1, 1]).all()

0 commit comments

Comments
 (0)