@@ -21,6 +21,7 @@ def read_parquet(
2121 pack_columns : dict | None = None ,
2222 dtype_backend : DtypeBackend | lib .NoDefault = lib .no_default ,
2323 reject_nesting : list [str ] | str | None = None ,
24+ infer_nesting : bool = True ,
2425 ** kwargs ,
2526) -> NestedFrame :
2627 """
@@ -61,16 +62,69 @@ def read_parquet(
6162 Returns
6263 -------
6364 NestedFrame
64- """
6565
66- df = NestedFrame (pd .read_parquet (data , engine = "pyarrow" , columns = columns , dtype_backend = "pyarrow" , ** kwargs ))
66+ Notes
67+ -----
68+ pyarrow supports partial loading of nested structures from parquet, for
69+ example ``pd.read_parquet("data.parquet", columns=["nested.a"])`` will
70+ load only the "a" sub-column of the "nested" column. Standard pandas/pyarrow
71+ behavior will return "a" as a list-array base column with name "a". In
72+ Nested-Pandas, this behavior is changed to load the column as a sub-column
73+ of a nested column called "nested". Be aware that this prevents calls
74+ like ``pd.read_parquet("data.parquet", columns=["nested.a", "nested"])``
75+ from working, as they request both a full and a partial load of "nested".
76+ """
6777
6878 # Type convergence for reject_nesting
6979 if reject_nesting is None :
7080 reject_nesting = []
7181 elif isinstance (reject_nesting , str ):
7282 reject_nesting = [reject_nesting ]
7383
84+ # First load through pyarrow
85+ table = pa .parquet .read_pandas (
86+ data ,
87+ columns = columns )
88+
89+ # Resolve partial loading of nested structures
90+ # Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux")
91+ # Use input column names and the table column names to determine if a column
92+ # was from a nested column.
93+ nested_structures = {}
94+ for col_in , col_pa in zip (columns , table .column_names ):
95+ # if the column name is not the same, it was a partial load
96+ if col_in != col_pa :
97+ # get the top-level column name
98+ nested_col = col_in .split ("." )[0 ]
99+ if nested_col not in reject_nesting :
100+ if nested_col not in nested_structures .keys ():
101+ nested_structures [nested_col ] = [table .column_names .index (col_pa )]
102+ else :
103+ nested_structures [nested_col ].append (table .column_names .index (col_pa ))
104+
105+ # TODO: Catch and disallow partial loading + full loading (e.g. "nested" and "nested.a")
106+ # TODO: Fix multi-column partial loading (e.g. "nested.a" and "nested.b" fails)
107+
108+ # Build structs and replace columns in table
109+ for col , indices in nested_structures .items ():
110+ # Build a struct column from the columns
111+ field_names = [table .column_names [i ] for i in indices ]
112+ struct = pa .StructArray .from_arrays ([table .column (i ).chunk (0 ) for i in indices ], field_names )
113+ # Replace the columns with the struct column
114+ for i in indices :
115+ # Remove the column from the table
116+ table = table .remove_column (i )
117+ table = table .append_column (col , struct )
118+
119+
120+ # Convert to NestedFrame
121+ # TODO: zero_copy_only=True fails on the call below, so the conversion may copy data; assess the cost.
122+ df = NestedFrame (table .to_pandas (types_mapper = lambda ty : pd .ArrowDtype (ty ), zero_copy_only = False ))
123+
124+
125+ #df = NestedFrame(pd.read_parquet(data, engine="pyarrow", columns=columns, dtype_backend="pyarrow", **kwargs))
126+
127+
74128 # Attempt to cast struct columns to NestedDTypes
75129 df = _cast_struct_cols_to_nested (df , reject_nesting )
76130
0 commit comments