From 5f524ee19a44771cdd7f62442c2152fe0fdf3ac6 Mon Sep 17 00:00:00 2001 From: freddyaboulton Date: Tue, 18 Jan 2022 12:49:25 -0500 Subject: [PATCH 1/2] Implementing check_consistent_lengths + test --- dask_ml/utils.py | 17 ++++++++++------- tests/test_utils.py | 43 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/dask_ml/utils.py b/dask_ml/utils.py index abeaa58a0..2150a2ea5 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -241,7 +241,7 @@ def check_random_state(random_state): raise TypeError("Unexpected type '{}'".format(type(random_state))) -def check_matching_blocks(*arrays): +def check_matching_blocks(*arrays, check_first_dim_only=False): """Check that the partitioning structure for many arrays matches. Parameters @@ -252,22 +252,25 @@ def check_matching_blocks(*arrays): * Dask Array * Dask DataFrame * Dask Series + check_first_dim_only: bool, default false + Whether to only checks the chunks along the first dimension """ if len(arrays) <= 1: return + slice_to_check = slice(0, 1, 1) if check_first_dim_only else slice(None, None) if all(isinstance(x, da.Array) for x in arrays): # TODO: unknown chunks, ensure blocks match, or just raise (configurable) - chunks = arrays[0].chunks + chunks = arrays[0].chunks[slice_to_check] for array in arrays[1:]: - if array.chunks != chunks: + if array.chunks[slice_to_check] != chunks: raise ValueError( "Mismatched chunks. {} != {}".format(chunks, array.chunks) ) elif all(isinstance(x, (dd.Series, dd.DataFrame)) for x in arrays): - divisions = arrays[0].divisions + divisions = arrays[0].divisions[slice_to_check] for array in arrays[1:]: - if array.divisions != divisions: + if array.divisions[slice_to_check] != divisions: raise ValueError( "Mismatched divisions. {} != {}".format(divisions, array.divisions) ) @@ -433,8 +436,8 @@ def _check_y(y, multi_output=False, y_numeric=False): def check_consistent_length(*arrays): - # TODO: check divisions, chunks, etc. 
- pass + """Check that blocks match for arrays and divisions match for dataframes.""" + check_matching_blocks(*arrays, check_first_dim_only=True) def check_chunks(n_samples, n_features, chunks=None): diff --git a/tests/test_utils.py b/tests/test_utils.py index e26097748..b4961930d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -15,6 +15,7 @@ assert_estimator_equal, check_array, check_chunks, + check_consistent_length, check_matching_blocks, check_random_state, handle_zeros_in_scale, @@ -208,8 +209,9 @@ def test_check_array_1d(): ), ], ) -def test_matching_blocks_ok(arrays): - check_matching_blocks(*arrays) +@pytest.mark.parametrize("check_first_dim_only", [True, False]) +def test_matching_blocks_ok(arrays, check_first_dim_only): + check_matching_blocks(*arrays, check_first_dim_only=check_first_dim_only) @pytest.mark.parametrize( @@ -234,3 +236,40 @@ def test_matching_blocks_ok(arrays): def test_matching_blocks_raises(arrays): with pytest.raises(ValueError): check_matching_blocks(*arrays) + + +@pytest.mark.parametrize( + "arrays", + [ + ( + da.random.uniform(size=(10, 10), chunks=(10, 10)), + da.random.uniform(size=10, chunks=10), + ), + ( + da.random.uniform(size=(50, 10), chunks=(50, 10)), + da.random.uniform(size=50, chunks=50), + ), + ( + dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), 2) + .reset_index() + .to_dask_array(), + dd.from_pandas(pd.Series([1, 2, 3]), 2).reset_index().to_dask_array(), + ), + ( + dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), 2), + dd.from_pandas(pd.Series([1, 2, 3]), 2), + ), + # Allow known and unknown? 
+ pytest.param( + ( + dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), 2) + .reset_index() + .to_dask_array(), + dd.from_pandas(pd.Series([1, 2, 3]), 2).reset_index(), + ), + marks=pytest.mark.xfail(reason="Known and unknown blocks."), + ), + ], +) +def test_check_consistent_length_ok(arrays): + check_consistent_length(*arrays) From bc953797e87824b889aca13bcee4eba1456d9a09 Mon Sep 17 00:00:00 2001 From: freddyaboulton Date: Tue, 18 Jan 2022 15:28:59 -0500 Subject: [PATCH 2/2] Fix bug. Refactor into helper --- dask_ml/utils.py | 30 +++++++++++++++++++++++------- tests/test_utils.py | 27 ++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/dask_ml/utils.py b/dask_ml/utils.py index 2150a2ea5..83e89de0d 100644 --- a/dask_ml/utils.py +++ b/dask_ml/utils.py @@ -241,8 +241,8 @@ def check_random_state(random_state): raise TypeError("Unexpected type '{}'".format(type(random_state))) -def check_matching_blocks(*arrays, check_first_dim_only=False): - """Check that the partitioning structure for many arrays matches. +def _check_matching_blocks(*arrays, check_first_dim_only=False): + """Helper function to check blocks match across *arrays. Parameters ---------- @@ -253,7 +253,8 @@ def check_matching_blocks(*arrays, check_first_dim_only=False): * Dask DataFrame * Dask Series check_first_dim_only: bool, default false - Whether to only checks the chunks along the first dimension + Whether to only check the chunks along the first dimension. Only applies + if all the arrays are dask arrays. """ if len(arrays) <= 1: return @@ -266,11 +267,11 @@ def check_matching_blocks(*arrays, check_first_dim_only=False): raise ValueError( "Mismatched chunks. 
{} != {}".format(chunks, array.chunks) ) - + # Divisions correspond to the index (first_dim) so no need to use slice_to_check elif all(isinstance(x, (dd.Series, dd.DataFrame)) for x in arrays): - divisions = arrays[0].divisions[slice_to_check] + divisions = arrays[0].divisions for array in arrays[1:]: - if array.divisions[slice_to_check] != divisions: + if array.divisions != divisions: raise ValueError( "Mismatched divisions. {} != {}".format(divisions, array.divisions) ) @@ -278,6 +279,21 @@ def check_matching_blocks(*arrays, check_first_dim_only=False): raise ValueError("Unexpected types {}.".format({type(x) for x in arrays})) +def check_matching_blocks(*arrays): + """Check that the partitioning structure for many arrays matches. + + Parameters + ---------- + *arrays : Sequence of array-likes + This includes + + * Dask Array + * Dask DataFrame + * Dask Series + """ + _check_matching_blocks(*arrays, check_first_dim_only=False) + + def check_X_y( X, y, @@ -437,7 +453,7 @@ def _check_y(y, multi_output=False, y_numeric=False): def check_consistent_length(*arrays): """Check that blocks match for arrays and divisions match for dataframes.""" - check_matching_blocks(*arrays, check_first_dim_only=True) + _check_matching_blocks(*arrays, check_first_dim_only=True) def check_chunks(n_samples, n_features, chunks=None): diff --git a/tests/test_utils.py b/tests/test_utils.py index b4961930d..fc64c73d0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -209,9 +209,8 @@ def test_check_array_1d(): ), ], ) -@pytest.mark.parametrize("check_first_dim_only", [True, False]) -def test_matching_blocks_ok(arrays, check_first_dim_only): - check_matching_blocks(*arrays, check_first_dim_only=check_first_dim_only) +def test_matching_blocks_ok(arrays): + check_matching_blocks(*arrays) @pytest.mark.parametrize( @@ -273,3 +272,25 @@ def test_matching_blocks_raises(arrays): ) def test_check_consistent_length_ok(arrays): check_consistent_length(*arrays) + + +@pytest.mark.parametrize( + 
"arrays", + [ + ( + da.random.uniform(size=(10, 10), chunks=(10, 10)), + da.random.uniform(size=8, chunks=8), + ), + ( + da.random.uniform(size=(100, 10), chunks=(100, 10)), + da.random.uniform(size=50, chunks=50), + ), + ( + dd.from_pandas(pd.DataFrame({"a": [1, 2, 3, 4]}), 4), + dd.from_pandas(pd.Series([1, 2, 3]), 2), + ), + ], +) +def test_check_consistent_length_raises(arrays): + with pytest.raises(ValueError): + check_consistent_length(*arrays)