-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] df.apply: add support for engine='bodo'
#60622
base: main
Are you sure you want to change the base?
Changes from all commits
1e62d38
7e2c2c3
27fbc0a
4c2e94a
4349d61
0872285
cd94be9
9a90fa0
dcdd00e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10227,7 +10227,7 @@ def apply( | |
result_type: Literal["expand", "reduce", "broadcast"] | None = None, | ||
args=(), | ||
by_row: Literal[False, "compat"] = "compat", | ||
engine: Literal["python", "numba"] = "python", | ||
engine: Literal["python", "numba", "bodo"] = "python", | ||
engine_kwargs: dict[str, bool] | None = None, | ||
**kwargs, | ||
): | ||
|
@@ -10289,7 +10289,7 @@ def apply( | |
|
||
.. versionadded:: 2.1.0 | ||
|
||
engine : {'python', 'numba'}, default 'python' | ||
engine : {'python', 'numba', 'bodo'}, default 'python' | ||
Choose between the python (default) engine, the numba engine, or the bodo engine in apply. | ||
|
||
The numba engine will attempt to JIT compile the passed function, | ||
|
@@ -10312,6 +10312,8 @@ def apply( | |
<https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_ | ||
in numba to learn what you can or cannot use in the passed function. | ||
|
||
TODO: describe bodo | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Adding this comment to the review, so the TODO is not forgotten. |
||
|
||
.. versionadded:: 2.2.0 | ||
|
||
engine_kwargs : dict | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import numpy as np | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Rather than creating separate tests for bodo, is there a way that we can create a fixture for the three different engines? Structuring the tests that way would be very helpful to ensure result consistency. |
||
import pytest | ||
|
||
import pandas.util._test_decorators as td | ||
|
||
import pandas as pd | ||
import pandas._testing as tm | ||
|
||
pytestmark = [pytest.mark.single_cpu, td.skip_if_no("bodo")] | ||
|
||
|
||
def test_bodo_vs_python_indexing():
    """Row-wise column indexing must match between the bodo and python engines."""
    frame = pd.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]},
    )

    # A named function rather than a lambda bound to a variable (PEP 8, E731).
    def f(x):
        return x["c"]

    result = frame.apply(f, engine="bodo", axis=1)
    expected = frame.apply(f, engine="python", axis=1)

    # The series type is deliberately not compared; only values/index/dtype are.
    tm.assert_series_equal(result, expected, check_series_type=False)
|
||
|
||
@pytest.mark.parametrize(
    "reduction",
    [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()],
)
def test_bodo_vs_python_reductions(reduction):
    """Row-wise reductions must give the same Series under bodo and python."""
    ones = pd.DataFrame(np.ones((4, 4), dtype=np.float64))

    expected = ones.apply(reduction, engine="python", axis=1)
    result = ones.apply(reduction, engine="bodo", axis=1)

    tm.assert_series_equal(result, expected, check_series_type=False)
|
||
|
||
def test_bodo_vs_python_df_output():
    """UDFs returning a Series per row must yield matching DataFrames."""
    df = pd.DataFrame({"A": np.arange(20), "B": ["hi", "there"] * 10})

    # A named function rather than a lambda bound to a variable (PEP 8, E731).
    def f(a):
        return pd.Series([a["B"], a["A"]])

    result = df.apply(f, engine="bodo", axis=1)
    expected = df.apply(f, engine="python", axis=1)

    # Frame type and dtypes are deliberately excluded from the comparison.
    tm.assert_frame_equal(result, expected, check_frame_type=False, check_dtype=False)
|
||
|
||
@pytest.mark.skip(reason="TODO: pass args/kwargs to bodo jitted function")
def test_bodo_vs_python_args_kwargs():
    """Compare bodo and default-engine results with ``args`` and a keyword passed.

    Currently skipped; see the skip reason on the decorator.
    """

    def f(x, y, z=3):
        return x.A == y + z

    frame = pd.DataFrame({"A": np.arange(20)})
    positional = (2,)

    result = frame.apply(f, axis=1, args=positional, engine="bodo", z=2)
    expected = frame.apply(f, axis=1, args=positional, z=2)
    tm.assert_series_equal(result, expected, check_series_type=False)
|
||
|
||
@pytest.mark.parametrize("axis", [0, 1])
def test_bodo_vs_python_str_apply(axis):
    """String function names must give matching results along the given axis."""
    df = pd.DataFrame({"A": np.arange(20)})

    func = "mean"
    # Use the parametrized ``axis`` as-is: reassigning it inside the test would
    # make both parametrized cases exercise the same axis.
    result = df.apply(func, axis, engine="bodo")
    expected = df.apply(func, axis)

    tm.assert_series_equal(result, expected, check_series_type=False)
|
||
|
||
def test_bodo_unsupported_axis():
    """Tests that NotImplementedError is raised when applying a UDF column-wise."""
    frame = pd.DataFrame(
        {"a": [1, 2, 3]},
    )

    # A named function rather than a lambda bound to a variable (PEP 8, E731).
    def f(x):
        return 1

    with pytest.raises(
        NotImplementedError,
        match=r"the 'bodo' engine only supports axis=1 for user-defined functions",
    ):
        frame.apply(f, engine="bodo", axis=0)
|
||
|
||
def test_bodo_raw_unsupported():
    """Tests that error gets raised when using raw=True"""
    frame = pd.DataFrame(
        {"a": [1, 2, 3]},
    )

    # A named function rather than a lambda bound to a variable (PEP 8, E731).
    def f(a):
        return 1

    with pytest.raises(
        NotImplementedError, match="the 'bodo' engine does not support raw=True."
    ):
        frame.apply(f, engine="bodo", raw=True, axis=1)
|
||
|
||
def test_bodo_result_type_unsupported():
    """The bodo engine must reject an explicit ``result_type``."""

    def f(a):
        return 1

    frame = pd.DataFrame({"a": [1, 2, 3]})
    msg = "the 'bodo' engine does not support result_type yet."

    with pytest.raises(NotImplementedError, match=msg):
        frame.apply(f, engine="bodo", axis=1, result_type="reduce")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import pytest | ||
|
||
import pandas.util._test_decorators as td | ||
|
||
from pandas import DataFrame | ||
|
||
|
||
@td.skip_if_installed("bodo")
def test_bodo_not_installed_df_apply():
    """Requesting the bodo engine without bodo installed must raise ImportError."""

    def f(x):
        return 1

    frame = DataFrame({"A": [1, 2, 3, 4, 5]})

    with pytest.raises(ImportError, match="Missing optional"):
        frame.apply(f, engine="bodo")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you have plans to package Bodo for conda-forge? Is there a reason not to? I think it'd be better for users and our CI if we could simply use conda-forge.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, we currently have an in progress PR here: conda-forge/staged-recipes#28648