opt out of bottleneck for nanmean (#47716)

sebasv · mroeschke · JMBurley · web-flow · commit cf4758f71e2b · 2022-07-18T12:14:17.000-07:00
* opt out of bottleneck for nanmean

* remove trailing whitespace

* make error bound explicit

* unittest only _bn_ok_dtype

* link issue to test function

* Update doc/source/whatsnew/v1.5.0.rst

clarify that disabling bottleneck for `mean` may result in a performance decrease

Co-authored-by: Matthew Roeschke <emailformattr@gmail.com>

* extend unit tests with (u)int dtypes

* Update pandas/core/nanops.py

Co-authored-by: JMBurley <JMBurley@users.noreply.github.com>

Co-authored-by: Matthew Roeschke <emailformattr@gmail.com>
Co-authored-by: JMBurley <JMBurley@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -846,7 +846,7 @@ Numeric
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
--
+- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss that grows linearly with the length of the array. ``bottleneck`` has been disabled for :meth:`mean`, reducing the loss to logarithmic growth, but this may result in a performance decrease. (:issue:`42878`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -162,6 +162,10 @@ def f(
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
     # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
     if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
+        # GH 42878
+        # Bottleneck uses naive summation leading to O(n) loss of precision
+        # unlike numpy which implements pairwise summation, which has O(log(n)) loss
+        # crossref: https://github.com/pydata/bottleneck/issues/379
 
         # GH 15507
         # bottleneck does not properly upcast during the sum
@@ -171,7 +175,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
         # further we also want to preserve NaN when all elements
         # are NaN, unlike bottleneck/numpy which consider this
         # to be 0
-        return name not in ["nansum", "nanprod"]
+        return name not in ["nansum", "nanprod", "nanmean"]
     return False
 
 
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -1120,3 +1120,25 @@ def test_check_below_min_count__large_shape(min_count, expected_result):
     shape = (2244367, 1253)
     result = nanops.check_below_min_count(shape, mask=None, min_count=min_count)
     assert result == expected_result
+
+
+@pytest.mark.parametrize("func", ["nanmean", "nansum"])
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.float16,
+        np.float32,
+        np.float64,
+    ],
+)
+def test_check_bottleneck_disallow(dtype, func):
+    # GH 42878: bottleneck can be unreliable for mean and sum, so it must be disallowed
+    assert not nanops._bn_ok_dtype(dtype, func)