pandas-dev · mroeschke · Jul 18, 2022 · Jul 13, 2022 · Jul 14, 2022 · Jul 14, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -844,7 +844,7 @@ Numeric
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
--
+- Bug in :meth:`mean` where the optional dependency ``bottleneck`` caused a loss of precision that grew linearly with the length of the array. ``bottleneck`` is no longer used for this reduction; with the numpy fallback the loss grows only logarithmically (:issue:`42878`)
 
 Conversion
 ^^^^^^^^^^

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -162,6 +162,9 @@ def f(
 def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
     # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
     if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
+        # GH 42878
+        # Bottleneck uses naive summation, whose rounding error grows as O(n);
+        # numpy uses pairwise summation, whose error grows only as O(log(n))
 
         # GH 15507
         # bottleneck does not properly upcast during the sum
@@ -171,7 +174,7 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
         # further we also want to preserve NaN when all elements
         # are NaN, unlike bottleneck/numpy which consider this
         # to be 0
-        return name not in ["nansum", "nanprod"]
+        return name not in ["nansum", "nanprod", "nanmean"]
     return False
 
 

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
@@ -1534,3 +1534,30 @@ def test_multimode_complex(self, array, expected, dtype):
         # Complex numbers are sorted by their magnitude
         result = Series(array, dtype=dtype).mode()
         tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["float32", "float64"])
+def test_numerical_precision_mean(dtype):
+    np_dtype = np.dtype(dtype)
+    eps = np.finfo(np_dtype).eps
+    answer = 0.1
+    n = 1_000_000
+    max_error = answer * eps * np.log2(n)
+
+    series = Series(np.full(n, fill_value=answer, dtype=np_dtype))
+    assert series.dtype == np_dtype
+    assert np.abs(series.mean() - answer) < max_error
+
+
+@pytest.mark.parametrize("dtype", ["float32", "float64"])
+def test_numerical_precision_sum(dtype):
+    np_dtype = np.dtype(dtype)
+    eps = np.finfo(np_dtype).eps
+    value = 0.1
+    n = 1_000_000
+    answer = value * n
+    max_error = answer * eps * np.log2(n)
+
+    series = Series(np.full(n, fill_value=value, dtype=np_dtype))
+    assert series.dtype == np_dtype
+    assert np.abs(series.sum() - answer) < max_error