1010from pandas ._libs import lib
1111from pandas ._typing import Any , AnyAll , Axis , IndexLabel
1212from pandas .api .extensions import no_default
13+ from pandas .core .computation import ops
14+ from pandas .core .computation .eval import Expr , ensure_scope
1315from pandas .core .computation .expr import PARSERS , PandasExprVisitor
16+ from pandas .core .computation .parsing import clean_column_name
1417
15- from nested_pandas .nestedframe .utils import extract_nest_names
1618from nested_pandas .series .dtype import NestedDtype
1719from nested_pandas .series .packer import pack , pack_lists , pack_sorted_df_into_struct
1820
@@ -79,6 +81,22 @@ class _NestResolver(dict):
def __init__(self, outer: NestedFrame):
    """
    Bind this resolver to ``outer`` and register a field resolver for each
    nested column that ``outer`` reports at construction time.
    """
    super().__init__()
    self._outer = outer
    # Eagerly set up every nested column that is known right now.
    for nested_name in outer.nested_columns:
        self._initialize_field_resolver(nested_name, outer)
87+
def _initialize_field_resolver(self, column: str, outer: NestedFrame):
    """
    Register a field resolver for the nested column ``column``.

    The resolver is always stored under the column's own name. If
    ``clean_column_name`` yields a different spelling (column names with
    spaces, or names that are otherwise not identifier-like), a second
    resolver for the same column is stored under that cleaned spelling, so
    that references coming from the Pandas evaluator are captured too.
    """
    store = super().__setitem__
    store(column, _NestedFieldResolver(column, outer))
    alias = clean_column_name(column)
    if alias == column:
        # Cleaning left the name unchanged; no alias entry is needed.
        return
    store(alias, _NestedFieldResolver(column, outer))
82100
83101 def __contains__ (self , item ):
84102 top_nest = item if "." not in item else item .split ("." )[0 ].strip ()
@@ -89,7 +107,7 @@ def __getitem__(self, item):
89107 if not super ().__contains__ (top_nest ):
90108 if top_nest not in self ._outer .nested_columns :
91109 raise KeyError (f"Unknown nest { top_nest } " )
92- super (). __setitem__ (top_nest , _NestedFieldResolver ( top_nest , self ._outer ) )
110+ self . _initialize_field_resolver (top_nest , self ._outer )
93111 return super ().__getitem__ (top_nest )
94112
95113 def __setitem__ (self , item , _ ):
@@ -133,6 +151,48 @@ def __getattr__(self, item_name: str):
133151 raise AttributeError (f"No attribute { item_name } " )
134152
135153
154+ def _subexprs_by_nest (parents : list , node ) -> dict [str , list ]:
155+ """
156+ Given an expression which contains references to both base and nested
157+ columns, return a dictionary of the sub-expressions that should be
158+ evaluated independently, keyed by nesting context.
159+
160+ The key of the dictionary is the name of the nested column, and will
161+ be a blank string in the case of base columns. The value is a list
162+ of the parent nodes that lead to sub-expressions that can be evaluated
163+ successfully.
164+
165+ While this is not in use today for automatically splitting expressions,
166+ it can be used to detect whether an expression is suitably structured
167+ for evaluation: the returned dictionary should have a single key.
168+ """
169+ if isinstance (node , ops .Term ) and not isinstance (node , ops .Constant ):
170+ if isinstance (node .value , _SeriesFromNest ):
171+ return {node .value .nest_name : parents }
172+ return {getattr (node , "upper_name" , "" ): parents }
173+ if not isinstance (node , ops .Op ):
174+ return {}
175+ sources = [getattr (node , "lhs" , None ), getattr (node , "rhs" , None )]
176+ result : dict [str , list ] = {}
177+ for source in sources :
178+ child = _subexprs_by_nest (parents , source )
179+ for k , v in child .items ():
180+ result .setdefault (k , []).append (v )
181+ # After a complete traversal across sources, check for any necessary splits.
182+ # If it's homogenous, move the split-node up the tree.
183+ if len (result ) == 1 :
184+ # Let the record of each parent node drift up the tree,
185+ # and merge the subtrees into a single node, since by definition,
186+ # this node is homogeneous over all of its children, and can
187+ # be evaluated in a single step.
188+ result = {k : [node ] for k in result }
189+ # If the result is either empty or has more than one key, leave the result
190+ # alone. Each key represents a different nest (with a blank string for the base),
191+ # and the value is the highest point in the expression tree where the expression
192+ # was still within a single nest.
193+ return result
194+
195+
136196class NestedFrame (pd .DataFrame ):
137197 """A Pandas Dataframe extension with support for nested structure.
138198
@@ -457,6 +517,39 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
457517 kwargs ["parser" ] = "nested-pandas"
458518 return super ().eval (expr , ** kwargs )
459519
def extract_nest_names(
    self,
    expr: str,
    local_dict=None,
    global_dict=None,
    resolvers=(),
    level: int = 0,
    target=None,
    **kwargs,
) -> set[str]:
    """
    Given a string expression, parse it and visit the resulting expression tree,
    surfacing the nesting types. The purpose is to identify expressions that attempt
    to mix base and nested columns, or columns from two different nests.

    Parameters
    ----------
    expr : str
        The expression to inspect.
    local_dict, global_dict, target
        Forwarded unchanged to ``ensure_scope``.
    resolvers : iterable of mappings, optional
        Extra resolvers consulted ahead of the frame's own. Any iterable
        (tuple or list) is accepted, and ``None`` is treated as empty.
    level : int, default 0
        Forwarded to ``ensure_scope`` as ``level + 1``.
    **kwargs
        Accepted so that callers can forward the same keyword arguments they
        give to ``eval``; they are not used here.

    Returns
    -------
    set[str]
        The keys produced by ``_subexprs_by_nest`` for the parsed expression.
    """
    index_resolvers = self._get_index_resolvers()
    column_resolvers = self._get_cleaned_column_resolvers()
    # Normalise to a tuple first: a list (or None) would otherwise fail on
    # concatenation with the tuple of built-in resolvers.
    resolvers = tuple(resolvers or ()) + (
        _NestResolver(self),
        column_resolvers,
        index_resolvers,
    )
    # Parser needs to be the "nested-pandas" parser.
    # We also need the same variable context that eval() will have, so that
    # backtick-quoted names are substituted as expected.
    # NOTE(review): ``level`` is not adjusted for intermediate callers of
    # this method — confirm frame depth when this is reached via query().
    env = ensure_scope(
        level + 1,
        global_dict=global_dict,
        local_dict=local_dict,
        resolvers=resolvers,
        target=target,
    )
    parsed_expr = Expr(expr, parser="nested-pandas", env=env)
    separable = _subexprs_by_nest([], parsed_expr.terms)
    return set(separable)
552+
460553 def query (self , expr : str , * , inplace : bool = False , ** kwargs ) -> NestedFrame | None :
461554 """
462555 Query the columns of a NestedFrame with a boolean expression. Specified
@@ -514,7 +607,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
514607 # At present, the query expression must be either entirely within a
515608 # single nest, or have nothing but base columns. Mixed structures are not
516609 # supported, so preflight the expression.
517- nest_names = extract_nest_names (expr )
610+ nest_names = self . extract_nest_names (expr , ** kwargs )
518611 if len (nest_names ) > 1 :
519612 raise ValueError ("Queries cannot target multiple structs/layers, write a separate query for each" )
520613 result = self .eval (expr , ** kwargs )
0 commit comments