diff --git a/doc/source/whatsnew/v2.3.1.rst b/doc/source/whatsnew/v2.3.1.rst index 64e5c1510e1da..eb3ad72f6a59f 100644 --- a/doc/source/whatsnew/v2.3.1.rst +++ b/doc/source/whatsnew/v2.3.1.rst @@ -59,6 +59,7 @@ Bug fixes - Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`) - Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`) - Fixed bug in :meth:`DataFrame.explode` and :meth:`Series.explode` where methods would fail with ``dtype="str"`` (:issue:`61623`) +- Fixed bug in unpickling objects pickled in pandas versions pre-2.3.0 that used :class:`StringDtype` (:issue:`61763`) .. _whatsnew_231.regressions: @@ -72,7 +73,6 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- .. --------------------------------------------------------------------------- ..
_whatsnew_231.other: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0cd989ab9464c..9c8dc2054106a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -69,6 +69,8 @@ from pandas.io.formats import printing if TYPE_CHECKING: + from collections.abc import MutableMapping + import pyarrow from pandas._typing import ( @@ -213,6 +215,11 @@ def __eq__(self, other: object) -> bool: return self.storage == other.storage and self.na_value is other.na_value return False + def __setstate__(self, state: MutableMapping[str, Any]) -> None: + # back-compat for pandas < 2.3, where na_value did not yet exist + self.storage = state.pop("storage", "python") + self._na_value = state.pop("_na_value", libmissing.NA) + def __hash__(self) -> int: # need to override __hash__ as well because of overriding __eq__ return super().__hash__() diff --git a/pandas/tests/io/data/legacy_pickle/1.5.3/1.5.3_x86_64_win_3.11.13.pickle b/pandas/tests/io/data/legacy_pickle/1.5.3/1.5.3_x86_64_win_3.11.13.pickle new file mode 100644 index 0000000000000..d12fc5929ea5b Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/1.5.3/1.5.3_x86_64_win_3.11.13.pickle differ diff --git a/pandas/tests/io/data/legacy_pickle/2.0.3/2.0.3_AMD64_windows_3.11.12.pickle b/pandas/tests/io/data/legacy_pickle/2.0.3/2.0.3_AMD64_windows_3.11.12.pickle new file mode 100644 index 0000000000000..f508272d058f9 Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/2.0.3/2.0.3_AMD64_windows_3.11.12.pickle differ diff --git a/pandas/tests/io/data/legacy_pickle/2.1.4/2.1.4_AMD64_windows_3.11.12.pickle b/pandas/tests/io/data/legacy_pickle/2.1.4/2.1.4_AMD64_windows_3.11.12.pickle new file mode 100644 index 0000000000000..6f838839c2937 Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/2.1.4/2.1.4_AMD64_windows_3.11.12.pickle differ diff --git a/pandas/tests/io/data/legacy_pickle/2.2.3/2.2.3_AMD64_windows_3.11.12.pickle 
b/pandas/tests/io/data/legacy_pickle/2.2.3/2.2.3_AMD64_windows_3.11.12.pickle new file mode 100644 index 0000000000000..f98766fd4e05d Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/2.2.3/2.2.3_AMD64_windows_3.11.12.pickle differ diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 9bfd8eb9d51d5..9cb50b03e223a 100644 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -147,6 +147,7 @@ def create_pickle_data(): "float": Index(np.arange(10, dtype=np.float64)), "uint": Index(np.arange(10, dtype=np.uint64)), "timedelta": timedelta_range("00:00:00", freq="30min", periods=10), + "string": Index(["foo", "bar", "baz", "qux", "quux"], dtype="string"), } index["range"] = RangeIndex(10) @@ -185,6 +186,7 @@ def create_pickle_data(): "dt": Series(date_range("20130101", periods=5)), "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")), "period": Series([Period("2000Q1")] * 5), + "string": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"), } mixed_dup_df = DataFrame(data) @@ -233,6 +235,12 @@ def create_pickle_data(): }, index=range(5), ), + "string": DataFrame( + { + "A": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"), + "B": Series(["one", "two", "one", "two", "three"], dtype="string"), + } + ), } cat = { diff --git a/pyproject.toml b/pyproject.toml index 899bcfca35939..085d5c3bce07a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -526,6 +526,8 @@ filterwarnings = [ "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec", # Can be removed once https://github.com/numpy/numpy/pull/24794 is merged "ignore:.*In the future `np.long` will be defined as.*:FutureWarning", + # https://github.com/numpy/numpy/pull/29301 + "ignore:.*align should be passed:", ] junit_family = "xunit2" markers = [