scikit-learn-contrib · jbbqqf · May 9, 2026 · May 9, 2026 · glemaitre · Jun 7, 2026
diff --git a/doc/whats_new/v0.15.rst b/doc/whats_new/v0.15.rst
@@ -11,6 +11,14 @@ Changelog
 Bug fixes
 .........
 
+- Fix :func:`~imblearn.metrics.geometric_mean_score` with
+  ``average='macro'`` (and ``'weighted'``): the function now returns the
+  (weighted) mean of the per-class G-means, matching ``np.mean`` of
+  ``geometric_mean_score(..., average=None)`` as users expect. The
+  previous implementation returned ``sqrt(macro_sen * macro_spe)``,
+  which disagreed with the per-class average — most visibly on binary
+  inputs. :pr:`1173` by :user:`jbbqqf`.
+
 Enhancements
 ............
 

diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py
@@ -662,15 +662,47 @@ class is unrecognized by the classifier, G-mean resolves to zero. To
     >>> geometric_mean_score(y_true, y_pred, correction=0.001)
     0.010...
     >>> geometric_mean_score(y_true, y_pred, average='macro')
-    0.471...
+    0.288...
     >>> geometric_mean_score(y_true, y_pred, average='micro')
     0.471...
     >>> geometric_mean_score(y_true, y_pred, average='weighted')
-    0.471...
+    0.288...
     >>> geometric_mean_score(y_true, y_pred, average=None)
     array([0.866...,  0.       ,  0.       ])
     """
     if average is None or average != "multiclass":
+        # For macro/weighted averaging the previous implementation passed
+        # ``average`` directly to sensitivity_specificity_support, which
+        # returns the macro/weighted average of sensitivity and
+        # specificity, and then took ``sqrt(sen * spe)`` of those scalars.
+        # That is the geometric mean of mean-sensitivity and
+        # mean-specificity, NOT the mean of per-class G-means — and the two
+        # disagree on binary inputs (see #1096). Compute the per-class
+        # G-mean first and aggregate, so that
+        # ``geometric_mean_score(..., average='macro')`` equals
+        # ``np.mean(geometric_mean_score(..., average=None))`` as users
+        # rightly expect.
+        if average in ("macro", "weighted"):
+            sen, spe, sup = sensitivity_specificity_support(
+                y_true,
+                y_pred,
+                labels=labels,
+                pos_label=pos_label,
+                average=None,
+                warn_for=("specificity", "specificity"),
+                sample_weight=sample_weight,
+            )
+            per_class = np.sqrt(sen * spe)
+            if average == "macro":
+                return np.mean(per_class)
+            # weighted: mean weighted by support (true samples per class).
+            # Mirror sklearn's behaviour and return 0 when the total
+            # support is zero rather than producing a NaN from 0/0.
+            total = sup.sum()
+            if total == 0:
+                return 0.0
+            return np.average(per_class, weights=sup)
+
         sen, spe, _ = sensitivity_specificity_support(
             y_true,
             y_pred,

diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py
@@ -229,9 +229,13 @@ def test_geometric_mean_multiclass(y_true, y_pred, correction, expected_gmean):
 @pytest.mark.parametrize(
     "y_true, y_pred, average, expected_gmean",
     [
-        ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "macro", 0.471),
+        # Macro / weighted G-mean is the (weighted) mean of the per-class
+        # G-mean array — which for this input is [0.866, 0, 0]. Their
+        # value is therefore 0.866/3 = 0.2887, NOT sqrt(macro_sen *
+        # macro_spe) = 0.471 as the metric used to return before #1096.
+        ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "macro", 0.2887),
         ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "micro", 0.471),
-        ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "weighted", 0.471),
+        ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], "weighted", 0.2887),
         ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 0, 1], None, [0.8660254, 0.0, 0.0]),
     ],
 )
@@ -251,12 +255,18 @@ def test_geometric_mean_average(y_true, y_pred, average, expected_gmean):
             "multiclass",
             0.707,
         ),
+        # weighted G-mean with labels=[0, 1] and the given sample
+        # weights: per-class (sen, spe) restricted to [0, 1] are
+        # [(1.0, 0.5), (0.5, 0.0)] with supports [2, 4]; per-class
+        # G-means = [sqrt(0.5), 0] = [0.707, 0]; weighted average =
+        # (2*0.707 + 4*0)/6 = 0.236. Old expected 0.333 came from
+        # sqrt(weighted_sen * weighted_spe), the bug fixed in #1096.
         (
             [0, 1, 2, 0, 1, 2],
             [0, 1, 1, 0, 0, 1],
             [1, 2, 1, 1, 2, 1],
             "weighted",
-            0.333,
+            0.236,
         ),
     ],
 )
@@ -278,8 +288,13 @@ def test_geometric_mean_sample_weight(
     [
         ("multiclass", 0.36),
         (None, [0.82, 0.24, 0.72]),
-        ("macro", 0.67),
-        ("weighted", 0.64),
+        # macro / weighted updated in #1096: now mean(per-class gmean)
+        # rather than sqrt(macro_sen * macro_spe). Per-class G-means
+        # for this fixture are [0.82, 0.24, 0.72]; macro = mean = 0.59;
+        # weighted mean over class supports (which differ slightly due
+        # to the 50/50 train/test split) = 0.55.
+        ("macro", 0.59),
+        ("weighted", 0.55),
     ],
 )
 def test_geometric_mean_score_prediction(average, expected_gmean):
@@ -289,6 +304,29 @@ def test_geometric_mean_score_prediction(average, expected_gmean):
     assert gmean == pytest.approx(expected_gmean, rel=R_TOL)
 
 
+@pytest.mark.parametrize(
+    "y_true, y_pred",
+    [
+        # binary case from #1096
+        ([0, 0, 1, 0, 1, 1], [0, 0, 0, 0, 0, 1]),
+        # binary, fully balanced
+        ([0, 1, 0, 1], [0, 1, 1, 0]),
+        # 3-class, asymmetric errors
+        ([0, 1, 2, 0, 1, 2], [0, 2, 1, 0, 1, 2]),
+    ],
+)
+def test_geometric_mean_macro_equals_mean_of_per_class(y_true, y_pred):
+    # Non-regression test for #1096: the macro G-mean MUST equal the
+    # arithmetic mean of the per-class G-mean array. Before #1096 the
+    # macro path returned sqrt(macro_sen * macro_spe), which generally
+    # disagrees with mean(per_class_gmean) — most visibly on binary
+    # inputs, where the original report observed 0.745 (correct ~0.577)
+    # for y_true=[0,0,1,0,1,1], y_pred=[0,0,0,0,0,1].
+    per_class = geometric_mean_score(y_true, y_pred, average=None)
+    macro = geometric_mean_score(y_true, y_pred, average="macro")
+    assert macro == pytest.approx(np.mean(per_class))
+
+
 def test_iba_geo_mean_binary():
     y_true, y_pred, _ = make_prediction(binary=True)