Skip to content

Commit fc30f75

Browse files
committed
Super WIP: read through pyarrow
1 parent 4977506 commit fc30f75

File tree

1 file changed

+56
-2
lines changed
  • src/nested_pandas/nestedframe

1 file changed

+56
-2
lines changed

src/nested_pandas/nestedframe/io.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def read_parquet(
2121
pack_columns: dict | None = None,
2222
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
2323
reject_nesting: list[str] | str | None = None,
24+
infer_nesting: bool = True,
2425
**kwargs,
2526
) -> NestedFrame:
2627
"""
@@ -61,16 +62,69 @@ def read_parquet(
6162
Returns
6263
-------
6364
NestedFrame
64-
"""
6565
66-
df = NestedFrame(pd.read_parquet(data, engine="pyarrow", columns=columns, dtype_backend="pyarrow", **kwargs))
66+
Notes
67+
-----
68+
pyarrow supports partial loading of nested structures from parquet, for
69+
example ``pd.read_parquet("data.parquet", columns=["nested.a"])`` will
70+
load the "a" column of the "nested" column. Standard pandas/pyarrow
71+
behavior will return "a" as a list-array base column with name "a". In
72+
Nested-Pandas, this behavior is changed to load the column as a sub-column
73+
of a nested column called "nested". Be aware that this will prohibit calls
74+
like ``pd.read_parquet("data.parquet", columns=["nested.a", "nested"])``
75+
from working, as this implies both full and partial load of "nested".
76+
"""
6777

6878
# Normalize reject_nesting to a list of column names
6979
if reject_nesting is None:
7080
reject_nesting = []
7181
elif isinstance(reject_nesting, str):
7282
reject_nesting = [reject_nesting]
7383

84+
# First load through pyarrow
85+
table = pa.parquet.read_pandas(
86+
data,
87+
columns=columns)
88+
89+
# Resolve partial loading of nested structures
90+
# Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux")
91+
# Use input column names and the table column names to determine if a column
92+
# was from a nested column.
93+
nested_structures = {}
94+
for col_in, col_pa in zip(columns, table.column_names):
95+
# if the column name is not the same, it was a partial load
96+
if col_in != col_pa:
97+
# get the top-level column name
98+
nested_col = col_in.split(".")[0]
99+
if nested_col not in reject_nesting:
100+
if nested_col not in nested_structures.keys():
101+
nested_structures[nested_col] = [table.column_names.index(col_pa)]
102+
else:
103+
nested_structures[nested_col].append(table.column_names.index(col_pa))
104+
105+
# TODO: Catch and disallow partial loading + full loading (e.g. "nested" and "nested.a")
106+
# TODO: Fix multi-column partial loading (e.g. requesting both "nested.a" and "nested.b" currently fails)
107+
108+
# Build structs and replace columns in table
109+
for col, indices in nested_structures.items():
110+
# Build a struct column from the columns
111+
field_names = [table.column_names[i] for i in indices]
112+
struct = pa.StructArray.from_arrays([table.column(i).chunk(0) for i in indices], field_names)
113+
# Replace the columns with the struct column
114+
for i in indices:
115+
# Remove the column from the table
116+
table = table.remove_column(i)
117+
table = table.append_column(col, struct)
118+
119+
120+
# Convert to NestedFrame
121+
# TODO: Assess the cost of this conversion not being zero-copy; passing zero_copy_only=True below fails
122+
df = NestedFrame(table.to_pandas(types_mapper=lambda ty: pd.ArrowDtype(ty), zero_copy_only=False))
123+
124+
125+
#df = NestedFrame(pd.read_parquet(data, engine="pyarrow", columns=columns, dtype_backend="pyarrow", **kwargs))
126+
127+
74128
# Attempt to cast struct columns to NestedDTypes
75129
df = _cast_struct_cols_to_nested(df, reject_nesting)
76130

0 commit comments

Comments
 (0)