Skip to content

Commit 85d9356

Browse files
jorisvandenbossche authored and mroeschke committed
Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0 (pandas-dev#60716)
* Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0
* update pypi index
* extra filterwarnings
* more test updates
* temp enable infer_string option
* Adapt test_get_handle_pyarrow_compat for pyarrow 19
* Use pa_version_under19p0 in test_get_handle_pyarrow_compat
* Adjust test_string_inference for using_infer_string
* Fix test_string_inference for feather

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 6441747 commit 85d9356

File tree

9 files changed

+96
-31
lines changed

9 files changed

+96
-31
lines changed

.github/workflows/unit-tests.yml

+1
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ jobs:
8484
- name: "Pyarrow Nightly"
8585
env_file: actions-311-pyarrownightly.yaml
8686
pattern: "not slow and not network and not single_cpu"
87+
pandas_future_infer_string: "1"
8788
platform: ubuntu-22.04
8889
fail-fast: false
8990
name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }}

ci/deps/actions-311-pyarrownightly.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ dependencies:
2323

2424
- pip:
2525
- "tzdata>=2022.7"
26-
- "--extra-index-url https://pypi.fury.io/arrow-nightlies/"
26+
- "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
2727
- "--prefer-binary"
2828
- "--pre"
2929
- "pyarrow"

pandas/compat/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
pa_version_under16p0,
3535
pa_version_under17p0,
3636
pa_version_under18p0,
37+
pa_version_under19p0,
3738
)
3839

3940
if TYPE_CHECKING:
@@ -166,4 +167,5 @@ def is_ci_environment() -> bool:
166167
"pa_version_under16p0",
167168
"pa_version_under17p0",
168169
"pa_version_under18p0",
170+
"pa_version_under19p0",
169171
]

pandas/compat/pyarrow.py

+2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
pa_version_under16p0 = _palv < Version("16.0.0")
1919
pa_version_under17p0 = _palv < Version("17.0.0")
2020
pa_version_under18p0 = _palv < Version("18.0.0")
21+
pa_version_under19p0 = _palv < Version("19.0.0")
2122
HAS_PYARROW = True
2223
except ImportError:
2324
pa_version_under10p1 = True
@@ -30,4 +31,5 @@
3031
pa_version_under16p0 = True
3132
pa_version_under17p0 = True
3233
pa_version_under18p0 = True
34+
pa_version_under19p0 = True
3335
HAS_PYARROW = False

pandas/io/_util.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010
from pandas._config import using_string_dtype
1111

1212
from pandas._libs import lib
13-
from pandas.compat import pa_version_under18p0
13+
from pandas.compat import (
14+
pa_version_under18p0,
15+
pa_version_under19p0,
16+
)
1417
from pandas.compat._optional import import_optional_dependency
1518

1619
import pandas as pd
@@ -77,7 +80,10 @@ def arrow_table_to_pandas(
7780
elif dtype_backend == "pyarrow":
7881
types_mapper = pd.ArrowDtype
7982
elif using_string_dtype():
80-
types_mapper = _arrow_string_types_mapper()
83+
if pa_version_under19p0:
84+
types_mapper = _arrow_string_types_mapper()
85+
else:
86+
types_mapper = None
8187
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
8288
types_mapper = None
8389
else:

pandas/tests/arrays/string_/test_string.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010

1111
from pandas._config import using_string_dtype
1212

13-
from pandas.compat.pyarrow import pa_version_under12p0
13+
from pandas.compat.pyarrow import (
14+
pa_version_under12p0,
15+
pa_version_under19p0,
16+
)
1417

1518
from pandas.core.dtypes.common import is_dtype_equal
1619

@@ -539,7 +542,7 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
539542
assert table.field("a").type == "large_string"
540543
with pd.option_context("string_storage", string_storage):
541544
result = table.to_pandas()
542-
if dtype.na_value is np.nan and not using_string_dtype():
545+
if dtype.na_value is np.nan and not using_infer_string:
543546
assert result["a"].dtype == "object"
544547
else:
545548
assert isinstance(result["a"].dtype, pd.StringDtype)
@@ -553,6 +556,21 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
553556
assert result.loc[2, "a"] is result["a"].dtype.na_value
554557

555558

559+
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
560+
def test_arrow_from_string(using_infer_string):
561+
# not roundtrip, but starting with pyarrow table without pandas metadata
562+
pa = pytest.importorskip("pyarrow")
563+
table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())})
564+
565+
result = table.to_pandas()
566+
567+
if using_infer_string and not pa_version_under19p0:
568+
expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
569+
else:
570+
expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
571+
tm.assert_frame_equal(result, expected)
572+
573+
556574
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
557575
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
558576
# GH-41040

pandas/tests/io/test_common.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
WASM,
2424
is_platform_windows,
2525
)
26+
from pandas.compat.pyarrow import pa_version_under19p0
2627
import pandas.util._test_decorators as td
2728

2829
import pandas as pd
@@ -152,8 +153,8 @@ def test_get_handle_pyarrow_compat(self):
152153
s = StringIO(data)
153154
with icom.get_handle(s, "rb", is_text=False) as handles:
154155
df = pa_csv.read_csv(handles.handle).to_pandas()
155-
# TODO will have to update this when pyarrow' to_pandas() is fixed
156-
expected = expected.astype("object")
156+
if pa_version_under19p0:
157+
expected = expected.astype("object")
157158
tm.assert_frame_equal(df, expected)
158159
assert not s.closed
159160

pandas/tests/io/test_feather.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66
import numpy as np
77
import pytest
88

9-
from pandas.compat.pyarrow import pa_version_under18p0
9+
from pandas.compat.pyarrow import (
10+
pa_version_under18p0,
11+
pa_version_under19p0,
12+
)
1013

1114
import pandas as pd
1215
import pandas._testing as tm
@@ -239,16 +242,27 @@ def test_invalid_dtype_backend(self):
239242
with pytest.raises(ValueError, match=msg):
240243
read_feather(path, dtype_backend="numpy")
241244

242-
def test_string_inference(self, tmp_path):
245+
def test_string_inference(self, tmp_path, using_infer_string):
243246
# GH#54431
244247
path = tmp_path / "test_string_inference.p"
245248
df = pd.DataFrame(data={"a": ["x", "y"]})
246249
df.to_feather(path)
247250
with pd.option_context("future.infer_string", True):
248251
result = read_feather(path)
252+
dtype = pd.StringDtype(na_value=np.nan)
249253
expected = pd.DataFrame(
250254
data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
251255
)
256+
expected = pd.DataFrame(
257+
data={"a": ["x", "y"]},
258+
dtype=dtype,
259+
columns=pd.Index(
260+
["a"],
261+
dtype=object
262+
if pa_version_under19p0 and not using_infer_string
263+
else dtype,
264+
),
265+
)
252266
tm.assert_frame_equal(result, expected)
253267

254268
@pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")

pandas/tests/io/test_parquet.py

+43-22
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
pa_version_under13p0,
1818
pa_version_under15p0,
1919
pa_version_under17p0,
20+
pa_version_under19p0,
2021
)
2122

2223
import pandas as pd
@@ -254,8 +255,10 @@ def test_invalid_engine(df_compat):
254255
check_round_trip(df_compat, "foo", "bar")
255256

256257

257-
def test_options_py(df_compat, pa):
258+
def test_options_py(df_compat, pa, using_infer_string):
258259
# use the set option
260+
if using_infer_string and not pa_version_under19p0:
261+
df_compat.columns = df_compat.columns.astype("str")
259262

260263
with pd.option_context("io.parquet.engine", "pyarrow"):
261264
check_round_trip(df_compat)
@@ -784,18 +787,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):
784787

785788
def test_categorical(self, pa):
786789
# supported in >= 0.7.0
787-
df = pd.DataFrame()
788-
df["a"] = pd.Categorical(list("abcdef"))
789-
790-
# test for null, out-of-order values, and unobserved category
791-
df["b"] = pd.Categorical(
792-
["bar", "foo", "foo", "bar", None, "bar"],
793-
dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
794-
)
795-
796-
# test for ordered flag
797-
df["c"] = pd.Categorical(
798-
["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True
790+
df = pd.DataFrame(
791+
{
792+
"a": pd.Categorical(list("abcdef")),
793+
# test for null, out-of-order values, and unobserved category
794+
"b": pd.Categorical(
795+
["bar", "foo", "foo", "bar", None, "bar"],
796+
dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
797+
),
798+
# test for ordered flag
799+
"c": pd.Categorical(
800+
["a", "b", "c", "a", "c", "b"],
801+
categories=["b", "c", "d"],
802+
ordered=True,
803+
),
804+
}
799805
)
800806

801807
check_round_trip(df, pa)
@@ -858,11 +864,13 @@ def test_s3_roundtrip_for_dir(
858864
repeat=1,
859865
)
860866

861-
def test_read_file_like_obj_support(self, df_compat):
867+
def test_read_file_like_obj_support(self, df_compat, using_infer_string):
862868
pytest.importorskip("pyarrow")
863869
buffer = BytesIO()
864870
df_compat.to_parquet(buffer)
865871
df_from_buf = read_parquet(buffer)
872+
if using_infer_string and not pa_version_under19p0:
873+
df_compat.columns = df_compat.columns.astype("str")
866874
tm.assert_frame_equal(df_compat, df_from_buf)
867875

868876
def test_expand_user(self, df_compat, monkeypatch):
@@ -929,7 +937,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
929937
"c": pd.Series(["a", None, "c"], dtype="string"),
930938
}
931939
)
932-
if using_infer_string:
940+
if using_infer_string and pa_version_under19p0:
933941
check_round_trip(df, pa, expected=df.astype({"c": "str"}))
934942
else:
935943
check_round_trip(df, pa)
@@ -943,7 +951,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin
943951
df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
944952
with pd.option_context("string_storage", string_storage):
945953
if using_infer_string:
946-
expected = df.astype("str")
954+
if pa_version_under19p0:
955+
expected = df.astype("str")
956+
else:
957+
expected = df.astype(f"string[{string_storage}]")
947958
expected.columns = expected.columns.astype("str")
948959
else:
949960
expected = df.astype(f"string[{string_storage}]")
@@ -1099,17 +1110,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
10991110
new_df = read_parquet(path, engine=pa)
11001111
assert new_df.attrs == df.attrs
11011112

1102-
def test_string_inference(self, tmp_path, pa):
1113+
def test_string_inference(self, tmp_path, pa, using_infer_string):
11031114
# GH#54431
11041115
path = tmp_path / "test_string_inference.p"
11051116
df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
1106-
df.to_parquet(path, engine="pyarrow")
1117+
df.to_parquet(path, engine=pa)
11071118
with pd.option_context("future.infer_string", True):
1108-
result = read_parquet(path, engine="pyarrow")
1119+
result = read_parquet(path, engine=pa)
1120+
dtype = pd.StringDtype(na_value=np.nan)
11091121
expected = pd.DataFrame(
11101122
data={"a": ["x", "y"]},
1111-
dtype=pd.StringDtype(na_value=np.nan),
1112-
index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
1123+
dtype=dtype,
1124+
index=pd.Index(["a", "b"], dtype=dtype),
1125+
columns=pd.Index(
1126+
["a"],
1127+
dtype=object
1128+
if pa_version_under19p0 and not using_infer_string
1129+
else dtype,
1130+
),
11131131
)
11141132
tm.assert_frame_equal(result, expected)
11151133

@@ -1122,7 +1140,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
11221140
df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]")
11231141
df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))]))
11241142
result = read_parquet(path)
1125-
expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
1143+
if pa_version_under19p0:
1144+
expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
1145+
else:
1146+
expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object")
11261147
tm.assert_frame_equal(result, expected)
11271148

11281149
def test_infer_string_large_string_type(self, tmp_path, pa):

0 commit comments

Comments
 (0)