pandas-dev · scott-routledge2 · Dec 30, 2024 · Dec 30, 2024 · Dec 30, 2024 · Dec 30, 2024
@@ -3,6 +3,7 @@
 name: pandas-dev
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.10
 
@@ -37,6 +38,7 @@ dependencies:
   - lxml=4.9.2
   - matplotlib=3.6.3
   - numba=0.56.4
+  - bodo=2024.12.3
   - numexpr=2.8.4
   - odfpy=1.4.1
   - qtpy=2.3.0

@@ -1,6 +1,7 @@
 name: pandas-dev
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.10
 
@@ -35,6 +36,7 @@ dependencies:
   - lxml>=4.9.2
   - matplotlib>=3.6.3
   - numba>=0.56.4
+  - bodo>=2024.12.3
   - numexpr>=2.8.4
   - odfpy>=1.4.1
   - qtpy>=2.3.0

@@ -1,6 +1,7 @@
 name: pandas-dev
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.11
 
@@ -35,6 +36,7 @@ dependencies:
   - lxml>=4.9.2
   - matplotlib>=3.6.3
   - numba>=0.56.4
+  - bodo>=2024.12.3
   - numexpr>=2.8.4
   - odfpy>=1.4.1
   - qtpy>=2.3.0

@@ -1,6 +1,7 @@
 name: pandas-dev-312
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.12
 
@@ -35,6 +36,7 @@ dependencies:
   - lxml>=4.9.2
   - matplotlib>=3.6.3
   - numba>=0.56.4
+  - bodo>=2024.12.3
   - numexpr>=2.8.4
   - odfpy>=1.4.1
   - qtpy>=2.3.0

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -186,6 +186,7 @@ Dependency                                            Minimum Version    pip ext
 `numexpr <https://github.com/pydata/numexpr>`__       2.8.4              performance        Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups
 `bottleneck <https://github.com/pydata/bottleneck>`__ 1.3.6              performance        Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup.
 `numba <https://github.com/numba/numba>`__            0.56.4             performance        Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler.
+`bodo <https://github.com/bodo-ai/Bodo>`__            2024.12.3          performance        Alternative execution engine for operations that accept ``engine="bodo"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler and automatically parallelizes them using MPI.
 ===================================================== ================== ================== ===================================================================================================================================================================================
 
 Visualization

diff --git a/environment.yml b/environment.yml
@@ -2,6 +2,7 @@
 name: pandas-dev
 channels:
   - conda-forge
+  - bodo.ai
 dependencies:
   - python=3.10
   - pip
@@ -40,6 +41,7 @@ dependencies:
   - lxml>=4.9.2
   - matplotlib>=3.6.3
   - numba>=0.56.4
+  - bodo>=2024.12.3
   - numexpr>=2.8.4
   - openpyxl>=3.1.0
   - odfpy>=1.4.1

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -57,6 +57,7 @@
     "tzdata": "2022.7",
     "qtpy": "2.3.0",
     "pyqt5": "5.15.9",
+    "bodo": "2024.12.3",
 }
 
 # A mapping from import name to package name (on PyPI) for packages where

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -598,9 +598,9 @@ def apply_list_or_dict_like(self) -> DataFrame | Series:
             Result when self.func is a list-like or dict-like, None otherwise.
         """
 
-        if self.engine == "numba":
+        if self.engine in ("numba", "bodo"):
             raise NotImplementedError(
-                "The 'numba' engine doesn't support list-like/"
+                f"The '{self.engine}' engine doesn't support list-like/"
                 "dict likes of callables yet."
             )
 
@@ -853,9 +853,9 @@ def apply(self) -> DataFrame | Series:
 
         # dispatch to handle list-like or dict-like
         if is_list_like(self.func):
-            if self.engine == "numba":
+            if self.engine in ("numba", "bodo"):
                 raise NotImplementedError(
-                    "the 'numba' engine doesn't support lists of callables yet"
+                    f"the '{self.engine}' engine doesn't support lists of callables yet"
                 )
             return self.apply_list_or_dict_like()
 
@@ -870,13 +870,16 @@ def apply(self) -> DataFrame | Series:
                     "the 'numba' engine doesn't support using "
                     "a string as the callable function"
                 )
+            elif self.engine == "bodo":
+                return self.apply_series_bodo()
+
             return self.apply_str()
 
         # ufunc
         elif isinstance(self.func, np.ufunc):
-            if self.engine == "numba":
+            if self.engine in ("numba", "bodo"):
                 raise NotImplementedError(
-                    "the 'numba' engine doesn't support "
+                    f"the '{self.engine}' engine doesn't support "
                     "using a numpy ufunc as the callable function"
                 )
             with np.errstate(all="ignore"):
@@ -886,9 +889,10 @@ def apply(self) -> DataFrame | Series:
 
         # broadcasting
         if self.result_type == "broadcast":
-            if self.engine == "numba":
+            if self.engine in ("numba", "bodo"):
                 raise NotImplementedError(
-                    "the 'numba' engine doesn't support result_type='broadcast'"
+                    f"the '{self.engine}' engine doesn't support "
+                    "result_type='broadcast'"
                 )
             return self.apply_broadcast(self.obj)
 
@@ -1007,6 +1011,8 @@ def wrapper(*args, **kwargs):
             result = nb_looper(self.values, self.axis, *args)
             # If we made the result 2-D, squeeze it back to 1-D
             result = np.squeeze(result)
+        elif self.engine == "bodo":
+            raise NotImplementedError("the 'bodo' engine does not support raw=True.")
         else:
             result = np.apply_along_axis(
                 wrap_function(self.func),
@@ -1051,10 +1057,17 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame:
         return result
 
     def apply_standard(self):
-        if self.engine == "python":
+        if self.engine == "numba":
+            results, res_index = self.apply_series_numba()
+        elif self.engine == "bodo":
+            return self.apply_series_bodo()
+        elif self.engine == "python":
             results, res_index = self.apply_series_generator()
         else:
-            results, res_index = self.apply_series_numba()
+            raise ValueError(
+                "invalid value for engine, must be one "
+                "of {'python', 'numba', 'bodo'}"
+            )
 
         # wrap results
         return self.wrap_results(results, res_index)
@@ -1089,6 +1102,26 @@ def apply_series_numba(self):
         results = self.apply_with_numba()
         return results, self.result_index
 
+    def apply_series_bodo(self) -> DataFrame | Series:
+        """Apply ``self.func`` to ``self.obj`` inside a ``bodo.jit`` function."""
+        # Import first so a missing bodo always surfaces as ImportError.
+        bodo = import_optional_dependency("bodo")
+
+        if self.result_type is not None:
+            raise NotImplementedError(
+                "the 'bodo' engine does not support result_type yet."
+            )
+        if self.axis != 1 and not isinstance(self.func, str):
+            raise NotImplementedError(
+                "the 'bodo' engine only supports axis=1 for user-defined functions."
+            )
+
+        @bodo.jit
+        def do_apply(obj, func, axis):
+            return obj.apply(func, axis)
+
+        return do_apply(self.obj, self.func, self.axis)
+
     def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series:
         from pandas import Series
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -10227,7 +10227,7 @@ def apply(
         result_type: Literal["expand", "reduce", "broadcast"] | None = None,
         args=(),
         by_row: Literal[False, "compat"] = "compat",
-        engine: Literal["python", "numba"] = "python",
+        engine: Literal["python", "numba", "bodo"] = "python",
         engine_kwargs: dict[str, bool] | None = None,
         **kwargs,
     ):
@@ -10289,7 +10289,7 @@ def apply(
 
             .. versionadded:: 2.1.0
 
-        engine : {'python', 'numba'}, default 'python'
+        engine : {'python', 'numba', 'bodo'}, default 'python'
             Choose between the python (default) engine or the numba engine in apply.
 
             The numba engine will attempt to JIT compile the passed function,
@@ -10312,6 +10312,8 @@ def apply(
             <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
             in numba to learn what you can or cannot use in the passed function.
 
+            The bodo engine runs the apply inside ``bodo.jit`` (UDFs need ``axis=1``).
+
             .. versionadded:: 2.2.0
 
         engine_kwargs : dict

diff --git a/pandas/tests/apply/test_bodo.py b/pandas/tests/apply/test_bodo.py
@@ -0,0 +1,107 @@
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas._testing as tm
+
+pytestmark = [pytest.mark.single_cpu, td.skip_if_no("bodo")]
+
+
+def test_bodo_vs_python_indexing():
+    """Row-wise selection of one column must agree across both engines."""
+    data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}
+    df = pd.DataFrame(data)
+    select_c = lambda row: row["c"]
+
+    bodo_out = df.apply(select_c, engine="bodo", axis=1)
+    python_out = df.apply(select_c, engine="python", axis=1)
+    tm.assert_series_equal(bodo_out, python_out, check_series_type=False)
+
+
+@pytest.mark.parametrize(
+    "reduction",
+    [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()],
+)
+def test_bodo_vs_python_reductions(reduction):
+    """Row-wise reductions must agree between the bodo and python engines."""
+    ones = pd.DataFrame(np.ones((4, 4), dtype=np.float64))
+    run = lambda engine: ones.apply(reduction, engine=engine, axis=1)
+    tm.assert_series_equal(run("bodo"), run("python"), check_series_type=False)
+
+
+def test_bodo_vs_python_df_output():
+    """A UDF returning a Series per row gives matching frames (dtypes ignored)."""
+    frame = pd.DataFrame({"A": np.arange(20), "B": ["hi", "there"] * 10})
+    swap_cols = lambda row: pd.Series([row["B"], row["A"]])
+
+    got = frame.apply(swap_cols, engine="bodo", axis=1)
+    want = frame.apply(swap_cols, engine="python", axis=1)
+    tm.assert_frame_equal(got, want, check_frame_type=False, check_dtype=False)
+
+
+@pytest.mark.skip(reason="TODO: pass args/kwargs to bodo jitted function")
+def test_bodo_vs_python_args_kwargs():
+    # Skipped for now; records the intended args/kwargs forwarding behaviour.
+    def f(x, y, z=3):
+        return x.A == y + z
+
+    frame = pd.DataFrame({"A": np.arange(20)})
+    got = frame.apply(f, z=2, engine="bodo", axis=1, args=(2,))
+    want = frame.apply(f, z=2, axis=1, args=(2,))
+    tm.assert_series_equal(got, want, check_series_type=False)
+
+
+@pytest.mark.parametrize("axis", [0, 1])
+def test_bodo_vs_python_str_apply(axis):
+    """String callables are compared against the python engine on both axes."""
+    df = pd.DataFrame({"A": np.arange(20)})
+    func = "mean"
+
+    # Use the parametrized axis as-is; reassigning it would test axis=1 twice.
+    result = df.apply(func, axis, engine="bodo")
+    expected = df.apply(func, axis)
+    tm.assert_series_equal(result, expected, check_series_type=False)
+
+
+def test_bodo_unsupported_axis():
+    """Tests that NotImplementedError is raised when applying a UDF column-wise"""
+    frame = pd.DataFrame(
+        {"a": [1, 2, 3]},
+    )
+    f = lambda x: 1
+
+    with pytest.raises(
+        NotImplementedError,
+        match=r"the 'bodo' engine only supports axis=1 for user-defined functions",
+    ):
+        frame.apply(f, engine="bodo", axis=0)
+
+
+def test_bodo_raw_unsupported():
+    """raw=True combined with engine="bodo" is rejected with NotImplementedError."""
+    df = pd.DataFrame({"a": [1, 2, 3]})
+
+    # The UDF body is irrelevant here: the raw path raises for the bodo
+    # engine before the function is ever invoked.
+    constant = lambda a: 1
+    msg = "the 'bodo' engine does not support raw=True."
+
+    with pytest.raises(NotImplementedError, match=msg):
+        df.apply(constant, engine="bodo", raw=True, axis=1)
+
+
+def test_bodo_result_type_unsupported():
+    """result_type="reduce" with engine="bodo" raises NotImplementedError."""
+    df = pd.DataFrame({"a": [1, 2, 3]})
+
+    def f(a):
+        return 1
+
+    # Only "reduce" is exercised here; "broadcast" is rejected earlier in
+    # apply() with its own message.
+    msg = "the 'bodo' engine does not support result_type yet."
+
+    with pytest.raises(NotImplementedError, match=msg):
+        df.apply(f, engine="bodo", axis=1, result_type="reduce")
diff --git a/pandas/tests/util/test_bodo.py b/pandas/tests/util/test_bodo.py
@@ -0,0 +1,18 @@
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import DataFrame
+
+
+@td.skip_if_installed("bodo")
+def test_bodo_not_installed_df_apply():
+    """Test that engine="bodo" raises ImportError when bodo is not installed."""
+    df = DataFrame({"A": [1, 2, 3, 4, 5]})
+
+    def f(x):
+        return 1
+
+    # axis=1 is the supported UDF path, so the optional import is what fails.
+    with pytest.raises(ImportError, match="Missing optional"):
+        df.apply(f, engine="bodo", axis=1)
diff --git a/pyproject.toml b/pyproject.toml
@@ -60,7 +60,7 @@ matplotlib = "pandas:plotting._matplotlib"
 [project.optional-dependencies]
 test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0']
 pyarrow = ['pyarrow>=10.0.1']
-performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4']
+performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4', 'bodo>=2024.12.3']
 computation = ['scipy>=1.10.0', 'xarray>=2022.12.0']
 fss = ['fsspec>=2022.11.0']
 aws = ['s3fs>=2022.11.0']
@@ -97,6 +97,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
        'lxml>=4.9.2',
        'matplotlib>=3.6.3',
        'numba>=0.56.4',
+       'bodo>=2024.12.3',
        'numexpr>=2.8.4',
        'odfpy>=1.4.1',
        'openpyxl>=3.1.0',

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -29,6 +29,7 @@ jinja2>=3.1.2
 lxml>=4.9.2
 matplotlib>=3.6.3
 numba>=0.56.4
+bodo>=2024.12.3
 numexpr>=2.8.4
 openpyxl>=3.1.0
 odfpy>=1.4.1