From 5f524ee19a44771cdd7f62442c2152fe0fdf3ac6 Mon Sep 17 00:00:00 2001
From: freddyaboulton <alfonsoboulton@gmail.com>
Date: Tue, 18 Jan 2022 12:49:25 -0500
Subject: [PATCH 1/2] Implement check_consistent_length + tests

---
 dask_ml/utils.py    | 17 ++++++++++-------
 tests/test_utils.py | 43 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/dask_ml/utils.py b/dask_ml/utils.py
index abeaa58a0..2150a2ea5 100644
--- a/dask_ml/utils.py
+++ b/dask_ml/utils.py
@@ -241,7 +241,7 @@ def check_random_state(random_state):
         raise TypeError("Unexpected type '{}'".format(type(random_state)))
 
 
-def check_matching_blocks(*arrays):
+def check_matching_blocks(*arrays, check_first_dim_only=False):
     """Check that the partitioning structure for many arrays matches.
 
     Parameters
@@ -252,22 +252,25 @@ def check_matching_blocks(*arrays):
         * Dask Array
         * Dask DataFrame
         * Dask Series
+    check_first_dim_only: bool, default false
+        Whether to only checks the chunks along the first dimension
     """
     if len(arrays) <= 1:
         return
+    slice_to_check = slice(0, 1, 1) if check_first_dim_only else slice(None, None)
     if all(isinstance(x, da.Array) for x in arrays):
         # TODO: unknown chunks, ensure blocks match, or just raise (configurable)
-        chunks = arrays[0].chunks
+        chunks = arrays[0].chunks[slice_to_check]
         for array in arrays[1:]:
-            if array.chunks != chunks:
+            if array.chunks[slice_to_check] != chunks:
                 raise ValueError(
                     "Mismatched chunks. {} != {}".format(chunks, array.chunks)
                 )
 
     elif all(isinstance(x, (dd.Series, dd.DataFrame)) for x in arrays):
-        divisions = arrays[0].divisions
+        divisions = arrays[0].divisions[slice_to_check]
         for array in arrays[1:]:
-            if array.divisions != divisions:
+            if array.divisions[slice_to_check] != divisions:
                 raise ValueError(
                     "Mismatched divisions. {} != {}".format(divisions, array.divisions)
                 )
@@ -433,8 +436,8 @@ def _check_y(y, multi_output=False, y_numeric=False):
 
 
 def check_consistent_length(*arrays):
-    # TODO: check divisions, chunks, etc.
-    pass
+    """Check that blocks match for arrays and divisions match for dataframes."""
+    check_matching_blocks(*arrays, check_first_dim_only=True)
 
 
 def check_chunks(n_samples, n_features, chunks=None):
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e26097748..b4961930d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -15,6 +15,7 @@
     assert_estimator_equal,
     check_array,
     check_chunks,
+    check_consistent_length,
     check_matching_blocks,
     check_random_state,
     handle_zeros_in_scale,
@@ -208,8 +209,9 @@ def test_check_array_1d():
         ),
     ],
 )
-def test_matching_blocks_ok(arrays):
-    check_matching_blocks(*arrays)
+@pytest.mark.parametrize("check_first_dim_only", [True, False])
+def test_matching_blocks_ok(arrays, check_first_dim_only):
+    check_matching_blocks(*arrays, check_first_dim_only=check_first_dim_only)
 
 
 @pytest.mark.parametrize(
@@ -234,3 +236,40 @@ def test_matching_blocks_ok(arrays):
 def test_matching_blocks_raises(arrays):
     with pytest.raises(ValueError):
         check_matching_blocks(*arrays)
+
+
+@pytest.mark.parametrize(
+    "arrays",
+    [
+        (
+            da.random.uniform(size=(10, 10), chunks=(10, 10)),
+            da.random.uniform(size=10, chunks=10),
+        ),
+        (
+            da.random.uniform(size=(50, 10), chunks=(50, 10)),
+            da.random.uniform(size=50, chunks=50),
+        ),
+        (
+            dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), 2)
+            .reset_index()
+            .to_dask_array(),
+            dd.from_pandas(pd.Series([1, 2, 3]), 2).reset_index().to_dask_array(),
+        ),
+        (
+            dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), 2),
+            dd.from_pandas(pd.Series([1, 2, 3]), 2),
+        ),
+        # Allow known and unknown?
+        pytest.param(
+            (
+                dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), 2)
+                .reset_index()
+                .to_dask_array(),
+                dd.from_pandas(pd.Series([1, 2, 3]), 2).reset_index(),
+            ),
+            marks=pytest.mark.xfail(reason="Known and unknown blocks."),
+        ),
+    ],
+)
+def test_check_consistent_length_ok(arrays):
+    check_consistent_length(*arrays)

From bc953797e87824b889aca13bcee4eba1456d9a09 Mon Sep 17 00:00:00 2001
From: freddyaboulton <alfonsoboulton@gmail.com>
Date: Tue, 18 Jan 2022 15:28:59 -0500
Subject: [PATCH 2/2] Fix divisions being sliced; refactor into _check_matching_blocks helper

---
 dask_ml/utils.py    | 30 +++++++++++++++++++++++-------
 tests/test_utils.py | 27 ++++++++++++++++++++++++---
 2 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/dask_ml/utils.py b/dask_ml/utils.py
index 2150a2ea5..83e89de0d 100644
--- a/dask_ml/utils.py
+++ b/dask_ml/utils.py
@@ -241,8 +241,8 @@ def check_random_state(random_state):
         raise TypeError("Unexpected type '{}'".format(type(random_state)))
 
 
-def check_matching_blocks(*arrays, check_first_dim_only=False):
-    """Check that the partitioning structure for many arrays matches.
+def _check_matching_blocks(*arrays, check_first_dim_only=False):
+    """Helper function to check blocks match across *arrays.
 
     Parameters
     ----------
@@ -253,7 +253,8 @@ def check_matching_blocks(*arrays, check_first_dim_only=False):
         * Dask DataFrame
         * Dask Series
     check_first_dim_only: bool, default false
-        Whether to only checks the chunks along the first dimension
+        Whether to only check the chunks along the first dimension. Only applies
+        if all the arrays are dask arrays.
     """
     if len(arrays) <= 1:
         return
@@ -266,11 +267,11 @@ def check_matching_blocks(*arrays, check_first_dim_only=False):
                 raise ValueError(
                     "Mismatched chunks. {} != {}".format(chunks, array.chunks)
                 )
-
+    # Divisions correspond to the index (first_dim) so no need to use slice_to_check
     elif all(isinstance(x, (dd.Series, dd.DataFrame)) for x in arrays):
-        divisions = arrays[0].divisions[slice_to_check]
+        divisions = arrays[0].divisions
         for array in arrays[1:]:
-            if array.divisions[slice_to_check] != divisions:
+            if array.divisions != divisions:
                 raise ValueError(
                     "Mismatched divisions. {} != {}".format(divisions, array.divisions)
                 )
@@ -278,6 +279,21 @@ def check_matching_blocks(*arrays, check_first_dim_only=False):
         raise ValueError("Unexpected types {}.".format({type(x) for x in arrays}))
 
 
+def check_matching_blocks(*arrays):
+    """Check that the partitioning structure for many arrays matches.
+
+    Parameters
+    ----------
+    *arrays : Sequence of array-likes
+        This includes
+
+        * Dask Array
+        * Dask DataFrame
+        * Dask Series
+    """
+    _check_matching_blocks(*arrays, check_first_dim_only=False)
+
+
 def check_X_y(
     X,
     y,
@@ -437,7 +453,7 @@ def _check_y(y, multi_output=False, y_numeric=False):
 
 def check_consistent_length(*arrays):
     """Check that blocks match for arrays and divisions match for dataframes."""
-    check_matching_blocks(*arrays, check_first_dim_only=True)
+    _check_matching_blocks(*arrays, check_first_dim_only=True)
 
 
 def check_chunks(n_samples, n_features, chunks=None):
diff --git a/tests/test_utils.py b/tests/test_utils.py
index b4961930d..fc64c73d0 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -209,9 +209,8 @@ def test_check_array_1d():
         ),
     ],
 )
-@pytest.mark.parametrize("check_first_dim_only", [True, False])
-def test_matching_blocks_ok(arrays, check_first_dim_only):
-    check_matching_blocks(*arrays, check_first_dim_only=check_first_dim_only)
+def test_matching_blocks_ok(arrays):
+    check_matching_blocks(*arrays)
 
 
 @pytest.mark.parametrize(
@@ -273,3 +272,25 @@ def test_matching_blocks_raises(arrays):
 )
 def test_check_consistent_length_ok(arrays):
     check_consistent_length(*arrays)
+
+
+@pytest.mark.parametrize(
+    "arrays",
+    [
+        (
+            da.random.uniform(size=(10, 10), chunks=(10, 10)),
+            da.random.uniform(size=8, chunks=8),
+        ),
+        (
+            da.random.uniform(size=(100, 10), chunks=(100, 10)),
+            da.random.uniform(size=50, chunks=50),
+        ),
+        (
+            dd.from_pandas(pd.DataFrame({"a": [1, 2, 3, 4]}), 4),
+            dd.from_pandas(pd.Series([1, 2, 3]), 2),
+        ),
+    ],
+)
+def test_check_consistent_length_raises(arrays):
+    with pytest.raises(ValueError):
+        check_consistent_length(*arrays)