5151from pandas .io .formats .format import format_array
5252
5353from nested_pandas .series .dtype import NestedDtype
54- from nested_pandas .series .utils import enumerate_chunks , is_pa_type_a_list
54+ from nested_pandas .series .utils import (
55+ enumerate_chunks ,
56+ is_pa_type_a_list ,
57+ transpose_list_struct_array ,
58+ transpose_struct_list_array ,
59+ transpose_struct_list_type ,
60+ validate_struct_list_array_for_equal_lengths ,
61+ )
5562
5663__all__ = ["NestedExtensionArray" ]
5764
def __arrow_array__(self, type=None):
    """Convert the extension array to a PyArrow array.

    Parameters
    ----------
    type : pa.DataType, optional
        Requested PyArrow type. When omitted, the stored chunked array is
        returned as is. When it is a ``pa.ListType``, the list-struct
        representation is cast to it; any other type is applied as a cast of
        the stored chunked array.
    """
    if type is None:
        return self._chunked_array
    # Pick the representation that matches the requested type family, then
    # cast once.
    source = self._list_array if isinstance(type, pa.ListType) else self._chunked_array
    return source.cast(type)
553562
554563 def __array__ (self , dtype = None ):
@@ -650,12 +659,27 @@ def __init__(self, values: pa.Array | pa.ChunkedArray, *, validate: bool = True)
650659 if isinstance (values , pa .Array ):
651660 values = pa .chunked_array ([values ])
652661
653- if validate :
662+ # Convert list-struct array to struct-list array
663+ if is_pa_type_a_list (values .type ):
664+ struct_chunks = []
665+ for list_chunk in values .iterchunks ():
666+ struct_chunks .append (transpose_list_struct_array (list_chunk ))
667+ values = pa .chunked_array (struct_chunks )
668+ # Validate struct-array with list fields
669+ elif validate :
654670 self ._validate (values )
655671
656672 self ._chunked_array = values
657673 self ._dtype = NestedDtype (values .type )
658674
@property
def _list_array(self) -> pa.ChunkedArray:
    """Pyarrow chunked list-struct array representation.

    Every chunk of the stored chunked array is passed through
    ``transpose_struct_list_array`` with ``validate=False`` and the results
    are gathered into a new chunked array. The value is rebuilt on each
    access.
    """
    return pa.chunked_array(
        [
            transpose_struct_list_array(chunk, validate=False)
            for chunk in self._chunked_array.iterchunks()
        ]
    )
682+
659683 @classmethod
660684 def from_sequence (cls , scalars , * , dtype : NestedDtype | pd .ArrowDtype | pa .DataType = None ) -> Self : # type: ignore[name-defined] # noqa: F821
661685 """Construct a NestedExtensionArray from a sequence of items
@@ -677,53 +701,65 @@ def from_sequence(cls, scalars, *, dtype: NestedDtype | pd.ArrowDtype | pa.DataT
677701 return cls ._from_sequence (scalars , dtype = dtype )
678702
@property
def _pyarrow_dtype(self) -> pa.StructType:
    """PyArrow data type of the extension array.

    Read from the ``pyarrow_dtype`` attribute of the array's dtype object.
    """
    dtype = self._dtype
    return dtype.pyarrow_dtype
683707
@property
def _pyarrow_list_struct_dtype(self) -> pa.ListType:
    """PyArrow data type of the list-struct view over the extension array.

    Obtained by applying ``transpose_struct_list_type`` to the array's own
    PyArrow type.
    """
    struct_list_type = self._pyarrow_dtype
    return transpose_struct_list_type(struct_list_type)
712+
@property
def chunked_array(self) -> pa.ChunkedArray:
    """The underlying PyArrow ChunkedArray.

    The stored object itself is returned; no copy is made here.
    """
    underlying = self._chunked_array
    return underlying
688717
@property
def chunked_list_struct_array(self) -> pa.ChunkedArray:
    """Chunked list-struct view over the extension array.

    Public accessor for the ``_list_array`` property.
    """
    list_struct = self._list_array
    return list_struct
722+
@staticmethod
def _validate(array: pa.ChunkedArray) -> None:
    """Raises ValueError if the input array is not a struct array with all fields being
    list arrays of the same lengths.

    Parameters
    ----------
    array : pa.ChunkedArray
        The array to validate.

    Raises
    ------
    ValueError
        If any chunk fails the check described above. The check itself is
        delegated to ``validate_struct_list_array_for_equal_lengths``.
    """
    # Chunks are checked independently, one at a time.
    for chunk in array.iterchunks():
        validate_struct_list_array_for_equal_lengths(chunk)
712739
@classmethod
def from_arrow_ext_array(cls, array: ArrowExtensionArray) -> Self:  # type: ignore[name-defined] # noqa: F821
    """Create a NestedExtensionArray from pandas' ArrowExtensionArray.

    The PyArrow data held in the ``_pa_array`` attribute of the input is
    handed to the class constructor.
    """
    pa_data = array._pa_array
    return cls(pa_data)
717744
def to_arrow_ext_array(self, list_struct: bool = False) -> ArrowExtensionArray:
    """Convert the extension array to pandas' ArrowExtensionArray

    Parameters
    ----------
    list_struct : bool, optional
        If False (default), return struct-list array, otherwise return
        list-struct array.

    Returns
    -------
    ArrowExtensionArray
        Wrapper around the selected PyArrow chunked array.
    """
    pa_array = self._list_array if list_struct else self._chunked_array
    return ArrowExtensionArray(pa_array)
721757
def _replace_chunked_array(self, pa_array: pa.ChunkedArray, *, validate: bool) -> None:
    """Swap in a new underlying chunked array and refresh the dtype.

    Parameters
    ----------
    pa_array : pa.ChunkedArray
        The replacement data.
    validate : bool
        If True, run ``self._validate`` on ``pa_array`` before storing it.
    """
    if validate:
        self._validate(pa_array)
    self._chunked_array = pa_array
    # The type is read from the chunked array itself, not from its first
    # chunk, so no chunk has to be indexed here.
    self._dtype = NestedDtype(pa_array.type)
727763
728764 @property
729765 def list_offsets (self ) -> pa .Array :
@@ -737,48 +773,32 @@ def list_offsets(self) -> pa.Array:
737773 pa.ChunkedArray
738774 The list offsets of the field arrays.
739775 """
740- # Quick and cheap path for a single chunk
776+ # Cheap path for a single chunk
741777 if self ._chunked_array .num_chunks == 1 :
742778 struct_array = cast (pa .StructArray , self ._chunked_array .chunk (0 ))
743779 return cast (pa .ListArray , struct_array .field (0 )).offsets
744780
745- chunks = []
746- # The offset of the current chunk in the flat array.
747- # Offset arrays use int32 type, so we cast to it
748- chunk_offset = pa .scalar (0 , type = pa .int32 ())
749- for chunk in self ._chunked_array .iterchunks ():
750- list_array = cast (pa .ListArray , chunk .field (0 ))
751- if chunk_offset .equals (pa .scalar (0 , type = pa .int32 ())):
752- offsets = list_array .offsets
753- else :
754- offsets = pa .compute .add (list_array .offsets [1 :], chunk_offset )
755- chunks .append (offsets )
756- chunk_offset = offsets [- 1 ]
757- return pa .concat_arrays (chunks )
781+ zero_and_lengths = pa .chunked_array (
782+ [pa .array ([0 ], type = pa .int32 ()), pa .array (self .list_lengths , type = pa .int32 ())]
783+ )
784+ offsets = pa .compute .cumulative_sum (zero_and_lengths )
785+ return offsets .chunk (0 ) if offsets .num_chunks == 1 else offsets .combine_chunks ()
758786
@property
def field_names(self) -> list[str]:
    """Names of the nested columns.

    The names are read from the type of the chunked array as a whole, so no
    individual chunk is indexed and an array with zero chunks is handled the
    same way as any other.
    """
    return [field.name for field in self._chunked_array.type]
763791
764- def _iter_list_lengths (self ) -> Generator [int , None , None ]:
765- """Iterate over the lengths of the list arrays"""
766- for chunk in self ._chunked_array .iterchunks ():
767- for length in chunk .field (0 ).value_lengths ():
768- if length .is_valid :
769- yield length .as_py ()
770- else :
771- yield 0
772-
@property
def list_lengths(self) -> np.ndarray:
    """Lengths of the list arrays.

    Computed with ``pa.compute.list_value_length`` over the list-struct
    representation and converted to a NumPy array.
    """
    return np.asarray(pa.compute.list_value_length(self._list_array))
777797
@property
def flat_length(self) -> int:
    """Length of the flat arrays.

    This is the sum of ``list_lengths``; it is 0 when there are no lists.
    """
    # Summing with NumPy keeps the empty case at 0. pa.compute.sum returns a
    # null scalar for empty input by default, which would surface as None.
    # An explicit 64-bit accumulator avoids depending on the platform default.
    return int(np.sum(self.list_lengths, dtype=np.int64))
782802
783803 @property
784804 def num_chunks (self ) -> int :
@@ -790,8 +810,8 @@ def get_list_index(self) -> np.ndarray:
790810 if len (self ) == 0 :
791811 # Since we have no list offsets, return an empty array
792812 return np .array ([], dtype = int )
793- list_index = np .arange (len (self ))
794- return np .repeat (list_index , np . diff ( self .list_offsets ) )
813+ list_index = np .arange (len (self ), dtype = int )
814+ return np .repeat (list_index , self .list_lengths )
795815
796816 def iter_field_lists (self , field : str ) -> Generator [np .ndarray , None , None ]:
797817 """Iterate over single field nested lists, as numpy arrays
@@ -813,7 +833,7 @@ def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
813833 yield np .asarray (list_scalar .values )
814834
815835 def view_fields (self , fields : str | list [str ]) -> Self : # type: ignore[name-defined] # noqa: F821
816- """Get a view of the extension array with only the specified fields
836+ """Get a view of the extension array with the specified fields only
817837
818838 Parameters
819839 ----------
@@ -842,7 +862,7 @@ def view_fields(self, fields: str | list[str]) -> Self: # type: ignore[name-def
842862 chunks .append (struct_array )
843863 pa_array = pa .chunked_array (chunks )
844864
845- return self . __class__ (pa_array , validate = False )
865+ return type ( self ) (pa_array , validate = False )
846866
847867 def set_flat_field (self , field : str , value : ArrayLike , * , keep_dtype : bool = False ) -> None :
848868 """Set the field from flat-array of values
0 commit comments