Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 128 additions & 5 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,105 @@
from pandas._typing import InterpolateOptions, Self
from pandas.api.extensions import no_default
from pandas.core.arrays import ArrowExtensionArray, ExtensionArray
from pandas.core.dtypes.common import is_float_dtype
from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses, validate_indices
from pandas.io.formats.format import format_array

from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.utils import enumerate_chunks, is_pa_type_a_list

__all__ = ["NestedExtensionArray"]


BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = True
"""Use a trick to by-pass pandas limitations on extension array formatting

Pandas array formatting works in such a way that Pandas objects are always
formatted with `str()`; see the _GenericArrayFormatter._format_strings()
method:
https://github.com/pandas-dev/pandas/blob/0d85d57b18b18e6b216ff081eac0952cb27d0e13/pandas/io/formats/format.py#L1219

_GenericArrayFormatter is used after _ExtensionArrayFormatter was called
initially and extracted values from the extension array with
np.asarray(values, dtype=object):
https://github.com/pandas-dev/pandas/blob/0d85d57b18b18e6b216ff081eac0952cb27d0e13/pandas/io/formats/format.py#L1516

Since our implementation of numpy conversion would return an object array
of data-frames, these data-frames would always be converted using `str()`,
which produces ugly and unreadable output. That's why when `__array__` is
called we check if it was actually called by _ExtensionArrayFormatter and
instead of returning a numpy array of data-frames, we return an array of
`_DataFrameWrapperForRepresentation` objects. That class is used for that
purpose only and should never be used for anything else.
"""
try:
from pandas.io.formats.format import _ExtensionArrayFormatter
except ImportError:
BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = False

Check warning on line 83 in src/nested_pandas/series/ext_array.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/series/ext_array.py#L82-L83

Added lines #L82 - L83 were not covered by tests

NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW = 1
"""Maximum number of nested data-frame's rows to show inside a parent object"""


def _is_called_from_func(func: Callable) -> bool:
"""Check if the given function appears in the call stack by matching its code object.

Parameters
----------
func
Function to check

Returns
-------
bool
"""
from inspect import currentframe

frame = currentframe()
while frame:
if frame.f_code is func.__code__:
return True

Check warning on line 106 in src/nested_pandas/series/ext_array.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/series/ext_array.py#L106

Added line #L106 was not covered by tests
frame = frame.f_back # Move up the call stack
return False


def _is_called_from_ext_array_fmter_fmt_strings():
    """Tell whether _ExtensionArrayFormatter._format_strings is on the call stack

    Returns
    -------
    bool

    Raises
    ------
    RuntimeError
        If BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK is falsy.
    """
    if BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK:
        formatter_method = _ExtensionArrayFormatter._format_strings
        return _is_called_from_func(formatter_method)
    raise RuntimeError("Set BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK to True")


class _DataFrameWrapperForRepresentation:
"""A class used to store nested data-frames for the formatting purposes

It encapsulates the input data-frame and gives access to all its attributes

Parameters
----------
df : pd.DataFrame
Data

Notes
-----
Do not use it out of the formatting code
"""

def __init__(self, df):
self.__internal_nested_df = df

Check warning on line 139 in src/nested_pandas/series/ext_array.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/series/ext_array.py#L139

Added line #L139 was not covered by tests

def __getattr__(self, item):
return getattr(self.__internal_nested_df, item)

Check warning on line 142 in src/nested_pandas/series/ext_array.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/series/ext_array.py#L142

Added line #L142 was not covered by tests

def __len__(self):
return len(self.__internal_nested_df)

Check warning on line 145 in src/nested_pandas/series/ext_array.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/series/ext_array.py#L145

Added line #L145 was not covered by tests


def to_pyarrow_dtype(dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None) -> pa.DataType | None:
"""Convert the dtype to pyarrow.DataType"""
if isinstance(dtype, NestedDtype):
Expand Down Expand Up @@ -390,15 +481,31 @@
return type(self)(self._chunked_array, validate=False)

def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
    """Return the function used to turn a single array element into a string.

    Parameters
    ----------
    boxed : bool, default False
        If False, `repr` is returned. If True, the returned formatter
        renders a nested data-frame as a short one-line preview: at most
        NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW rows are shown
        and, when rows were left out, the total row count is appended, e.g.
        ``[{a: 1, b: -4.0}; …] (3 rows)``. `pd.NA` is rendered as
        ``str(pd.NA)``.

    Returns
    -------
    Callable[[Any], str | None]
        Formatter for a single element.
    """
    if not boxed:
        return repr

    def format_series(series):
        """Format every value of a column as a string, keeping its index"""
        if is_float_dtype(series.dtype):
            # Format with the default Pandas formatter and strip white-spaces it adds.
            # The original index is passed explicitly: DataFrame.apply aligns the
            # returned columns on their index, so a fresh RangeIndex here would
            # misalign float columns with the others for a non-default index.
            formatted = format_array(series.to_numpy(), None)
            return pd.Series(formatted, index=series.index).str.strip()
        # Convert to string, add extra quotes for strings
        return series.apply(repr)

    def format_row(row):
        """Join the already formatted cells of one row as "name: value" pairs"""
        return ", ".join(f"{name}: {cell}" for name, cell in zip(row.index, row))

    def box_formatter(value):
        if value is pd.NA:
            return str(pd.NA)
        # Select first few rows
        head = value.iloc[:NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW]
        # Format series to strings using Pandas default formatters
        head = head.apply(format_series, axis=0)
        str_rows = "; ".join(f"{{{format_row(row)}}}" for _index, row in head.iterrows())
        if len(value) <= NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW:
            return f"[{str_rows}]"
        # Some rows were not shown: mark the truncation and report the full size
        return f"[{str_rows}; …] ({len(value)} rows)"

    return box_formatter
Expand Down Expand Up @@ -446,7 +553,23 @@

def __array__(self, dtype=None):
    """Convert the extension array to a numpy array.

    Parameters
    ----------
    dtype : optional
        Requested dtype, forwarded to `to_numpy`.

    Returns
    -------
    Array produced by ``self.to_numpy(dtype=dtype, copy=False)``. When
    BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK is enabled, `dtype` is
    ``np.object_`` and the call comes from
    _ExtensionArrayFormatter._format_strings, every data-frame element is
    replaced with a `_DataFrameWrapperForRepresentation` object.
    """
    array = self.to_numpy(dtype=dtype, copy=False)

    # Check if called inside _ExtensionArrayFormatter._format_strings
    # If yes, repack nested data-frames into a wrapper object, so
    # Pandas would call our _formatter method on them.
    if (
        BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK
        and dtype == np.object_
        and _is_called_from_ext_array_fmter_fmt_strings()
    ):
        # NOTE(review): elements are replaced in place although the array was
        # requested with copy=False; this assumes to_numpy() hands back an
        # array that is not shared with other users - TODO confirm.
        for i, df in enumerate(array):
            # Could be data-frame or NA
            if isinstance(df, pd.DataFrame):
                array[i] = _DataFrameWrapperForRepresentation(df)

    return array

# Adopted from ArrowExtensionArray
def __getstate__(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/nested_pandas/series/test_ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -893,7 +893,7 @@ def test__formatter_boxed():
)._formatter(boxed=True)
d = {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}
df = pd.DataFrame(d)
assert formatter(df) == str(d)
assert formatter(df) == "[{a: 1, b: -4.0}; …] (3 rows)"


def test__formetter_boxed_na():
Expand Down
Loading