|  | 
| 46 | 46 | from pandas._typing import InterpolateOptions, Self | 
| 47 | 47 | from pandas.api.extensions import no_default | 
| 48 | 48 | from pandas.core.arrays import ArrowExtensionArray, ExtensionArray | 
|  | 49 | +from pandas.core.dtypes.common import is_float_dtype | 
| 49 | 50 | from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses, validate_indices | 
|  | 51 | +from pandas.io.formats.format import format_array | 
| 50 | 52 | 
 | 
| 51 | 53 | from nested_pandas.series.dtype import NestedDtype | 
| 52 | 54 | from nested_pandas.series.utils import enumerate_chunks, is_pa_type_a_list | 
| 53 | 55 | 
 | 
| 54 | 56 | __all__ = ["NestedExtensionArray"] | 
| 55 | 57 | 
 | 
| 56 | 58 | 
 | 
BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = True
"""Use a trick to bypass pandas limitations on extension array formatting

Pandas array formatting always formats Pandas objects with `str()`,
see the _GenericArrayFormatter._format_strings() method:
https://github.com/pandas-dev/pandas/blob/0d85d57b18b18e6b216ff081eac0952cb27d0e13/pandas/io/formats/format.py#L1219

_GenericArrayFormatter is used after _ExtensionArrayFormatter has been
called and has extracted values from the extension array with
np.asarray(values, dtype=object):
https://github.com/pandas-dev/pandas/blob/0d85d57b18b18e6b216ff081eac0952cb27d0e13/pandas/io/formats/format.py#L1516

Since our implementation of numpy conversion would return an object array
of data-frames, these data-frames would always be converted using `str()`,
which produces ugly and unreadable output. That's why, when `__array__` is
called, we check whether it was actually called by _ExtensionArrayFormatter
and, instead of returning a numpy array of data-frames, we return an array
of `_DataFrameWrapperForRepresentation` objects. That class is used for
this purpose only and should never be used for anything else.
"""
# _ExtensionArrayFormatter is an underscore-prefixed (private) pandas name;
# if it cannot be imported, the trick is switched off instead of failing
# at import time.
try:
    from pandas.io.formats.format import _ExtensionArrayFormatter
except ImportError:
    BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK = False

NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW = 1
"""Maximum number of nested data-frame's rows to show inside a parent object"""
|  | 87 | + | 
|  | 88 | + | 
def _is_called_from_func(func: Callable) -> bool:
    """Check if the given function appears in the call stack by matching its code object.

    Parameters
    ----------
    func
        Function to check. Callables that expose no ``__code__`` attribute
        (e.g. built-in functions) have no code object to match against,
        so the result for them is always False.

    Returns
    -------
    bool
        True if some frame on the current call stack is executing the code
        object of `func`, False otherwise.
    """
    from inspect import currentframe

    code = getattr(func, "__code__", None)
    if code is None:
        # Nothing to compare frames with: report "not on the stack" instead
        # of failing with AttributeError.
        return False

    frame = currentframe()
    try:
        while frame is not None:
            if frame.f_code is code:
                return True
            frame = frame.f_back  # Move up the call stack
        return False
    finally:
        # Release the frame reference explicitly, as recommended by the
        # `inspect` documentation, so no reference cycle outlives the call.
        del frame
|  | 109 | + | 
|  | 110 | + | 
def _is_called_from_ext_array_fmter_fmt_strings():
    """Tell whether _ExtensionArrayFormatter._format_strings is on the call stack

    Returns
    -------
    bool

    Raises
    ------
    RuntimeError
        If BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK is false
    """
    if BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK:
        target = _ExtensionArrayFormatter._format_strings
        return _is_called_from_func(target)
    raise RuntimeError("Set BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK to True")
|  | 121 | + | 
|  | 122 | + | 
class _DataFrameWrapperForRepresentation:
    """A class used to store nested data-frames for the formatting purposes

    It encapsulates the input data-frame and gives access to all its
    attributes: any attribute not found on the wrapper itself is looked up
    on the wrapped object.

    Parameters
    ----------
    df : pd.DataFrame
        Data

    Notes
    -----
    Do not use it out of the formatting code
    """

    def __init__(self, df):
        self.__internal_nested_df = df

    def __getattr__(self, item):
        # __getattr__ runs only when normal lookup fails. If the lookup that
        # failed is for the wrapped object itself, the instance has not been
        # initialised (e.g. it was created with __new__ by copy/pickle), and
        # delegating would call __getattr__ again forever. Raise the regular
        # AttributeError so hasattr()/getattr(default) behave as expected.
        if item == "_DataFrameWrapperForRepresentation__internal_nested_df":
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {item!r}")
        return getattr(self.__internal_nested_df, item)

    def __len__(self):
        return len(self.__internal_nested_df)
|  | 146 | + | 
|  | 147 | + | 
| 57 | 148 | def to_pyarrow_dtype(dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None) -> pa.DataType | None: | 
| 58 | 149 |     """Convert the dtype to pyarrow.DataType""" | 
| 59 | 150 |     if isinstance(dtype, NestedDtype): | 
| @@ -390,15 +481,31 @@ def copy(self) -> Self:  # type: ignore[name-defined] # noqa: F821 | 
| 390 | 481 |         return type(self)(self._chunked_array, validate=False) | 
| 391 | 482 | 
 | 
| 392 | 483 |     def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: | 
| 393 |  | -        # TODO: make formatted strings more pretty | 
| 394 |  | -        # https://github.com/lincc-frameworks/nested-pandas/issues/50 | 
| 395 | 484 |         if boxed: | 
| 396 | 485 | 
 | 
| 397 | 486 |             def box_formatter(value): | 
| 398 | 487 |                 if value is pd.NA: | 
| 399 | 488 |                     return str(pd.NA) | 
| 400 |  | -                scalar = convert_df_to_pa_scalar(value, pa_type=self._pyarrow_dtype) | 
| 401 |  | -                return str(scalar.as_py()) | 
|  | 489 | +                # Select first few rows | 
|  | 490 | +                df = value.iloc[:NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW] | 
|  | 491 | +                # Format to strings using Pandas default formatters | 
|  | 492 | + | 
|  | 493 | +                def format_series(series): | 
|  | 494 | +                    if is_float_dtype(series.dtype): | 
|  | 495 | +                        # Format with the default Pandas formatter and strip white-spaces it adds | 
|  | 496 | +                        return pd.Series(format_array(series.to_numpy(), None)).str.strip() | 
|  | 497 | +                    # Convert to string, add extra quotes for strings | 
|  | 498 | +                    return series.apply(repr) | 
|  | 499 | + | 
|  | 500 | +                def format_row(row): | 
|  | 501 | +                    return ", ".join(f"{name}: {value}" for name, value in zip(row.index, row)) | 
|  | 502 | + | 
|  | 503 | +                # Format series to strings | 
|  | 504 | +                df = df.apply(format_series, axis=0) | 
|  | 505 | +                str_rows = "; ".join(f"{{{format_row(row)}}}" for _index, row in df.iterrows()) | 
|  | 506 | +                if len(value) <= NESTED_EXTENSION_ARRAY_FORMATTING_MAX_ITEMS_TO_SHOW: | 
|  | 507 | +                    return f"[{str_rows}]" | 
|  | 508 | +                return f"[{str_rows}; …] ({len(value)} rows)" | 
| 402 | 509 | 
 | 
| 403 | 510 |             return box_formatter | 
| 404 | 511 |         return repr | 
| @@ -446,7 +553,23 @@ def __arrow_array__(self, type=None): | 
| 446 | 553 | 
 | 
| 447 | 554 |     def __array__(self, dtype=None): | 
| 448 | 555 |         """Convert the extension array to a numpy array.""" | 
| 449 |  | -        return self.to_numpy(dtype=dtype, copy=False) | 
|  | 556 | + | 
|  | 557 | +        array = self.to_numpy(dtype=dtype, copy=False) | 
|  | 558 | + | 
|  | 559 | +        # Check if called inside _ExtensionArrayFormatter._format_strings | 
|  | 560 | +        # If yes, repack nested data-frames into a wrapper object, so | 
|  | 561 | +        # Pandas would call our _formatter method on them. | 
|  | 562 | +        if ( | 
|  | 563 | +            BOXED_NESTED_EXTENSION_ARRAY_FORMAT_TRICK | 
|  | 564 | +            and dtype == np.object_ | 
|  | 565 | +            and _is_called_from_ext_array_fmter_fmt_strings() | 
|  | 566 | +        ): | 
|  | 567 | +            for i, df in enumerate(array): | 
|  | 568 | +                # Could be data-frame or NA | 
|  | 569 | +                if isinstance(df, pd.DataFrame): | 
|  | 570 | +                    array[i] = _DataFrameWrapperForRepresentation(df) | 
|  | 571 | + | 
|  | 572 | +        return array | 
| 450 | 573 | 
 | 
| 451 | 574 |     # Adopted from ArrowExtensionArray | 
| 452 | 575 |     def __getstate__(self): | 
|  | 
0 commit comments