22import pandas as pd
33import pyarrow as pa
44import pytest
5+ from pandas .testing import assert_frame_equal
6+
57from nested_pandas import NestedFrame
68from nested_pandas .datasets import generate_data
79from nested_pandas .nestedframe .core import _SeriesFromNest
8- from pandas .testing import assert_frame_equal
910
1011
1112def test_nestedframe_construction ():
@@ -187,10 +188,16 @@ def test_add_nested_with_flat_df():
187188def test_add_nested_with_flat_df_and_mismatched_index ():
188189 """Test add_nested when index values of base are missing matches in nested"""
189190
190- base = NestedFrame (data = {"a" : [1 , 2 , 3 ], "b" : [2 , 4 , 6 ]}, index = [0 , 1 , 2 ])
191+ base = NestedFrame (
192+ data = {"a" : [1 , 2 , 3 ], "b" : [2 , 4 , 6 ], "new_index" : [0 , 1 , 3 ] }, index = [0 , 1 , 2 ])
191193
192194 nested = pd .DataFrame (
193- data = {"c" : [0 , 2 , 4 , 1 , 4 , 3 , 1 , 4 , 1 ], "d" : [5 , 4 , 7 , 5 , 3 , 1 , 9 , 3 , 4 ]},
195+ data = {
196+ "c" : [0 , 2 , 4 , 1 , 4 , 3 , 1 , 4 , 1 ],
197+ "d" : [5 , 4 , 7 , 5 , 3 , 1 , 9 , 3 , 4 ],
198+ # A column we can have as an alternative joining index with 'on'
199+ "new_index" : [1 , 1 , 1 , 1 , 2 , 2 , 5 , 5 , 5 ],
200+ },
194201 # no data for base index value of "2" and introduces new index value "4"
195202 index = [0 , 0 , 0 , 1 , 1 , 1 , 1 , 4 , 4 ],
196203 )
@@ -212,6 +219,35 @@ def test_add_nested_with_flat_df_and_mismatched_index():
212219 default_res = base .add_nested (nested , "nested" )
213220 assert_frame_equal (left_res , default_res )
214221
222+ # Test still adding the nested frame in a "left" fashion but on the "new_index" column
223+
224+ # We currently don't support a list of columns for the 'on' argument
225+ with pytest .raises (ValueError ):
226+ left_res_on = base .add_nested (nested , "nested" , how = "left" , on = ["new_index" ])
227+ # Instead we should pass a single column name, "new_index" which exists in both frames.
228+ left_res_on = base .add_nested (nested , "nested" , how = "left" , on = "new_index" )
229+ assert "nested" in left_res_on .columns
230+ # Check that the index of the base layer is still being used
231+ assert (left_res_on .index == base .index ).all ()
232+ # Assert that the new_index column we joined on was dropped from the nested layer
233+ # but is present in the base layer
234+ assert "new_index" in left_res_on .columns
235+ assert "new_index" not in left_res_on ["nested" ].nest .to_flat ().columns
236+
237+ # For each index in the columns we joined on, check that values are aligned correctly
238+ for i in range (len (left_res_on .new_index )):
239+ # The actual "index" value we "joined" on.
240+ join_idx = left_res_on .new_index .iloc [i ]
241+ # Check that the nested column is aligned correctly to the base layer
242+ if join_idx in nested ["new_index" ].values :
243+ assert left_res_on .iloc [i ]["nested" ] is not None
244+ # Check that it is present in the new index we constructed for the nested layer
245+ assert join_idx in left_res_on ["nested" ].nest .to_flat ().index
246+ else :
247+ # Use an iloc
248+ assert left_res_on .iloc [i ]["nested" ] is None
249+ assert join_idx not in left_res_on ["nested" ].nest .to_flat ().index
250+
215251 # Test adding the nested frame in a "right" fashion, where the index of the "right"
216252 # frame (our nested layer) is preserved
217253 right_res = base .add_nested (nested , "nested" , how = "right" )
@@ -235,6 +271,35 @@ def test_add_nested_with_flat_df_and_mismatched_index():
235271 else :
236272 assert not pd .isna (right_res .loc [idx ][col ])
237273
274+ # Test still adding the nested frame in a "right" fashion but on the "new_index" column
275+ right_res_on = base .add_nested (nested , "nested" , how = "right" , on = "new_index" )
276+ assert "nested" in right_res_on .columns
277+ # Check that rows were dropped if the base layer's "new_index" value is not present
278+ # in the "right" nested layer
279+ assert (right_res_on .new_index .values == np .unique (nested .new_index .values )).all ()
280+
281+ # Check that the new_index column we joined on was dropped from the nested layer
282+ assert "new_index" not in right_res_on ["nested" ].nest .to_flat ().columns
283+ # Check that the flattened nested layer has the same index as the original column we joined on
284+ all (right_res_on .nested .nest .to_flat ().index .values == nested .new_index .values )
285+
286+ # For each index check that the base layer is aligned correctly to the nested layer
287+ for i in range (len (right_res_on )):
288+ # The actual "index" value we "joined" on. Since it was a right join, guaranteed to
289+ # be in the "new_index" column of the original frame we wanted to nest
290+ join_idx = right_res_on .new_index .iloc [i ]
291+ assert join_idx in nested ["new_index" ].values
292+
293+ # Check the values for each column in our "base" layer
294+ for col in base .columns :
295+ if col != "new_index" :
296+ assert col in right_res_on .columns
297+ if join_idx not in base .new_index .values :
298+ # We expect a NaN value in the base layer due to the "right" join
299+ assert pd .isna (right_res_on .iloc [i ][col ])
300+ else :
301+ assert not pd .isna (right_res_on .iloc [i ][col ])
302+
238303 # Test the "outer" behavior
239304 outer_res = base .add_nested (nested , "nested" , how = "outer" )
240305 assert "nested" in outer_res .columns
@@ -255,6 +320,38 @@ def test_add_nested_with_flat_df_and_mismatched_index():
255320 else :
256321 assert not pd .isna (outer_res .loc [idx ][col ])
257322
323+ # Test still adding the nested frame in an "outer" fashion but on the "new_index" column
324+ outer_res_on = base .add_nested (nested , "nested" , how = "outer" , on = "new_index" )
325+ assert "nested" in outer_res_on .columns
326+ # We expect the result's new_index column to be the set union of the values of that column
327+ # in the base and nested frames
328+ assert set (outer_res_on .new_index ) == set (base .new_index ).union (set (nested .new_index ))
329+
330+ # Check that the new_index column we joined on was dropped from the nested layer
331+ assert "new_index" not in outer_res_on ["nested" ].nest .to_flat ().columns
332+ # Check that the flattened nested layer has the same index as the original column we joined on
333+ # Note that it does not have index values only present in the base layer since those empty rows
334+ # are dropped when we flatten the nested frame.
335+ all (outer_res_on .nested .nest .to_flat ().index .values == nested .new_index .values )
336+
337+ for i in range (len (outer_res_on )):
338+ # The actual "index" value we "joined" on.
339+ join_idx = outer_res_on .new_index .iloc [i ]
340+ # Check that the nested column is aligned correctly to the base layer
341+ if join_idx not in nested ["new_index" ].values :
342+ assert outer_res_on .iloc [i ]["nested" ] is None
343+ else :
344+ assert outer_res_on .iloc [i ]["nested" ] is not None
345+ # Check the values for each column in our "base" layer
346+ for col in base .columns :
347+ if col != "new_index" :
348+ assert col in outer_res_on .columns
349+ if join_idx in base .new_index .values :
350+ # The join value exists in the base layer, so its columns must be non-NaN after the "outer" join
351+ assert not pd .isna (outer_res_on .iloc [i ][col ])
352+ else :
353+ assert pd .isna (outer_res_on .iloc [i ][col ])
354+
258355 # Test the "inner" behavior
259356 inner_res = base .add_nested (nested , "nested" , how = "inner" )
260357 assert "nested" in inner_res .columns
@@ -268,6 +365,18 @@ def test_add_nested_with_flat_df_and_mismatched_index():
268365 assert col in inner_res .columns
269366 assert not pd .isna (inner_res .loc [idx ][col ])
270367
368+ # Test still adding the nested frame in an "inner" fashion but on the "new_index" column
369+ inner_res_on = base .add_nested (nested , "nested" , how = "inner" , on = "new_index" )
370+ assert "nested" in inner_res_on .columns
371+ # We expect the new index to be the set intersection of the base and nested column we used
372+ # for the 'on' argument
373+ assert set (inner_res_on .new_index ) == set (base .new_index ).intersection (set (nested .new_index ))
374+ # Check that the new_index column we joined on was dropped from the nested layer
375+ assert "new_index" not in right_res_on ["nested" ].nest .to_flat ().columns
376+
377+ # Since we have confirmed that the "new_index" column was the intersection that we expected
378+ # we know that none of the joined values should be null
379+ assert not inner_res_on .isnull ().values .any ()
271380
272381def test_add_nested_with_series ():
273382 """Test that add_nested correctly adds a nested column to the base df"""
@@ -433,7 +542,7 @@ def test_from_lists():
433542def test_query ():
434543 """Test that NestedFrame.query handles nested queries correctly"""
435544
436- base = NestedFrame (data = {"a" : [1 , 2 , 3 ], "b" : [2 , 4 , 6 ]}, index = [0 , 1 , 2 ])
545+ base = NestedFrame (data = {"a" : [1 , 2 , 2 , 3 ], "b" : [2 , 3 , 4 , 6 ]}, index = [0 , 1 , 1 , 2 ])
437546
438547 nested = pd .DataFrame (
439548 data = {"c" : [0 , 2 , 4 , 1 , 4 , 3 , 1 , 4 , 1 ], "d" : [5 , 4 , 7 , 5 , 3 , 1 , 9 , 3 , 4 ]},
@@ -455,10 +564,10 @@ def test_query():
455564
456565 # Test nested queries
457566 nest_queried = base .query ("nested.c > 1" )
458- assert len (nest_queried .nested .nest .to_flat ()) == 5
567+ assert len (nest_queried .nested .nest .to_flat ()) == 7
459568
460569 nest_queried = base .query ("(nested.c > 1) and (nested.d>2)" )
461- assert len (nest_queried .nested .nest .to_flat ()) == 4
570+ assert len (nest_queried .nested .nest .to_flat ()) == 5
462571
463572 # Check edge conditions
464573 with pytest .raises (ValueError ):
0 commit comments