17
17
pa_version_under13p0 ,
18
18
pa_version_under15p0 ,
19
19
pa_version_under17p0 ,
20
+ pa_version_under19p0 ,
20
21
)
21
22
22
23
import pandas as pd
@@ -254,8 +255,10 @@ def test_invalid_engine(df_compat):
254
255
check_round_trip (df_compat , "foo" , "bar" )
255
256
256
257
257
- def test_options_py (df_compat , pa ):
258
+ def test_options_py (df_compat , pa , using_infer_string ):
258
259
# use the set option
260
+ if using_infer_string and not pa_version_under19p0 :
261
+ df_compat .columns = df_compat .columns .astype ("str" )
259
262
260
263
with pd .option_context ("io.parquet.engine" , "pyarrow" ):
261
264
check_round_trip (df_compat )
@@ -784,18 +787,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):
784
787
785
788
def test_categorical (self , pa ):
786
789
# supported in >= 0.7.0
787
- df = pd .DataFrame ()
788
- df ["a" ] = pd .Categorical (list ("abcdef" ))
789
-
790
- # test for null, out-of-order values, and unobserved category
791
- df ["b" ] = pd .Categorical (
792
- ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
793
- dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
794
- )
795
-
796
- # test for ordered flag
797
- df ["c" ] = pd .Categorical (
798
- ["a" , "b" , "c" , "a" , "c" , "b" ], categories = ["b" , "c" , "d" ], ordered = True
790
+ df = pd .DataFrame (
791
+ {
792
+ "a" : pd .Categorical (list ("abcdef" )),
793
+ # test for null, out-of-order values, and unobserved category
794
+ "b" : pd .Categorical (
795
+ ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
796
+ dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
797
+ ),
798
+ # test for ordered flag
799
+ "c" : pd .Categorical (
800
+ ["a" , "b" , "c" , "a" , "c" , "b" ],
801
+ categories = ["b" , "c" , "d" ],
802
+ ordered = True ,
803
+ ),
804
+ }
799
805
)
800
806
801
807
check_round_trip (df , pa )
@@ -858,11 +864,13 @@ def test_s3_roundtrip_for_dir(
858
864
repeat = 1 ,
859
865
)
860
866
861
- def test_read_file_like_obj_support (self , df_compat ):
867
+ def test_read_file_like_obj_support (self , df_compat , using_infer_string ):
862
868
pytest .importorskip ("pyarrow" )
863
869
buffer = BytesIO ()
864
870
df_compat .to_parquet (buffer )
865
871
df_from_buf = read_parquet (buffer )
872
+ if using_infer_string and not pa_version_under19p0 :
873
+ df_compat .columns = df_compat .columns .astype ("str" )
866
874
tm .assert_frame_equal (df_compat , df_from_buf )
867
875
868
876
def test_expand_user (self , df_compat , monkeypatch ):
@@ -929,7 +937,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
929
937
"c" : pd .Series (["a" , None , "c" ], dtype = "string" ),
930
938
}
931
939
)
932
- if using_infer_string :
940
+ if using_infer_string and pa_version_under19p0 :
933
941
check_round_trip (df , pa , expected = df .astype ({"c" : "str" }))
934
942
else :
935
943
check_round_trip (df , pa )
@@ -943,7 +951,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin
943
951
df = pd .DataFrame ({"a" : pd .Series (["a" , None , "c" ], dtype = "string[pyarrow]" )})
944
952
with pd .option_context ("string_storage" , string_storage ):
945
953
if using_infer_string :
946
- expected = df .astype ("str" )
954
+ if pa_version_under19p0 :
955
+ expected = df .astype ("str" )
956
+ else :
957
+ expected = df .astype (f"string[{ string_storage } ]" )
947
958
expected .columns = expected .columns .astype ("str" )
948
959
else :
949
960
expected = df .astype (f"string[{ string_storage } ]" )
@@ -1099,17 +1110,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
1099
1110
new_df = read_parquet (path , engine = pa )
1100
1111
assert new_df .attrs == df .attrs
1101
1112
1102
- def test_string_inference (self , tmp_path , pa ):
1113
+ def test_string_inference (self , tmp_path , pa , using_infer_string ):
1103
1114
# GH#54431
1104
1115
path = tmp_path / "test_string_inference.p"
1105
1116
df = pd .DataFrame (data = {"a" : ["x" , "y" ]}, index = ["a" , "b" ])
1106
- df .to_parquet (path , engine = "pyarrow" )
1117
+ df .to_parquet (path , engine = pa )
1107
1118
with pd .option_context ("future.infer_string" , True ):
1108
- result = read_parquet (path , engine = "pyarrow" )
1119
+ result = read_parquet (path , engine = pa )
1120
+ dtype = pd .StringDtype (na_value = np .nan )
1109
1121
expected = pd .DataFrame (
1110
1122
data = {"a" : ["x" , "y" ]},
1111
- dtype = pd .StringDtype (na_value = np .nan ),
1112
- index = pd .Index (["a" , "b" ], dtype = pd .StringDtype (na_value = np .nan )),
1123
+ dtype = dtype ,
1124
+ index = pd .Index (["a" , "b" ], dtype = dtype ),
1125
+ columns = pd .Index (
1126
+ ["a" ],
1127
+ dtype = object
1128
+ if pa_version_under19p0 and not using_infer_string
1129
+ else dtype ,
1130
+ ),
1113
1131
)
1114
1132
tm .assert_frame_equal (result , expected )
1115
1133
@@ -1122,7 +1140,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
1122
1140
df = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "string[pyarrow]" )
1123
1141
df .to_parquet (path , schema = pa .schema ([("a" , pa .decimal128 (5 ))]))
1124
1142
result = read_parquet (path )
1125
- expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1143
+ if pa_version_under19p0 :
1144
+ expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1145
+ else :
1146
+ expected = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "object" )
1126
1147
tm .assert_frame_equal (result , expected )
1127
1148
1128
1149
def test_infer_string_large_string_type (self , tmp_path , pa ):
0 commit comments