From 850bd034352188ab802ed06981102ede96331816 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Wed, 9 Oct 2024 08:38:25 -0700 Subject: [PATCH 01/11] Use custom resolver for query and eval with nested frames. Verify preflighting of nested expressions using AST visitation. Remove logic for splitting queries by string. Now the evaluation is handled by a nested column resolver, and the mixed-mode expressions are preflighted by examining the parsed abstract syntax tree for the query expression. --- .gitignore | 3 + docs/tutorials/data_loading_notebook.ipynb | 16 +- docs/tutorials/data_manipulation.ipynb | 4 +- docs/tutorials/low_level.ipynb | 18 +- docs/tutorials/nested_spectra.ipynb | 10 +- src/nested_pandas/nestedframe/core.py | 237 ++++++++++++++---- src/nested_pandas/nestedframe/utils.py | 39 +++ .../nestedframe/test_nestedframe.py | 42 ++++ .../nestedframe/test_nestedframe_utils.py | 17 -- tests/nested_pandas/utils/test_utils.py | 27 ++ 10 files changed, 320 insertions(+), 93 deletions(-) delete mode 100644 tests/nested_pandas/nestedframe/test_nestedframe_utils.py diff --git a/.gitignore b/.gitignore index 50990fec..dfe2e99a 100644 --- a/.gitignore +++ b/.gitignore @@ -133,6 +133,9 @@ dmypy.json # vscode .vscode/ +# PyCharm +.idea/ + # dask dask-worker-space/ diff --git a/docs/tutorials/data_loading_notebook.ipynb b/docs/tutorials/data_loading_notebook.ipynb index 6b54f862..8aa5e628 100644 --- a/docs/tutorials/data_loading_notebook.ipynb +++ b/docs/tutorials/data_loading_notebook.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "With a valid Python environment, nested-pandas and it's dependencies are easy to install using the `pip` package manager. The following command can be used to install it:" + "With a valid Python environment, nested-pandas and its dependencies are easy to install using the `pip` package manager. 
The following command can be used to install it:" ] }, { @@ -47,7 +47,7 @@ "\n", "We can use the `NestedFrame` constructor to create our base frame from a dictionary of our columns.\n", "\n", - "We can then create an addtional pandas dataframes and pack them into our `NestedFrame` with `NestedFrame.add_nested`" + "We can then create additional pandas dataframes and pack them into our `NestedFrame` with `NestedFrame.add_nested`." ] }, { @@ -97,7 +97,7 @@ "# Note: that we use the `tempfile` module to create and then cleanup a temporary directory.\n", "# You can of course remove this and use your own directory and real files on your system.\n", "with tempfile.TemporaryDirectory() as temp_path:\n", - " # Generates parquet files with random data within our temporary directorye.\n", + " # Generates parquet files with random data within our temporary directory.\n", " generate_parquet_file(10, {\"nested1\": 100, \"nested2\": 10}, temp_path, file_per_layer=True)\n", "\n", " # Read each individual parquet file into its own dataframe.\n", @@ -148,7 +148,7 @@ "source": [ "So inspect `nf`, a `NestedFrame` we created from our call to `read_parquet` with the `to_pack` argument, we're able to pack nested parquet files according to the shared index values with the index in `base.parquet`.\n", "\n", - "The resulting `NestedFrame` having the same number of rows as `base.parquet` and with `nested1.parquet` and `nested2.parquet` packed into the 'nested1' and 'nested2' columns respectively." + "The resulting `NestedFrame` has the same number of rows as `base.parquet`, with `nested1.parquet` and `nested2.parquet` packed into the `nested1` and `nested2` columns respectively." 
] }, { @@ -164,7 +164,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Since we loaded each individual parquet file into its own dataframe, we can also verify that using `read_parquet` with the `to_pack` argument is equivalent to the following method of packing the dataframes directly with `NestedFrame.add_nested`" + "Since we loaded each individual parquet file into its own dataframe, we can also verify that using `read_parquet` with the `to_pack` argument is equivalent to the following method of packing the dataframes directly with `NestedFrame.add_nested`." ] }, { @@ -189,11 +189,11 @@ "source": [ "# Saving NestedFrames to Parquet Files\n", "\n", - "Additionally we can save an existing `NestedFrame` as one of more parquet files using `NestedFrame.to_parquet``\n", + "Additionally we can save an existing `NestedFrame` as one or more parquet files using `NestedFrame.to_parquet`.\n", "\n", "When `by_layer=True` we save each individual layer of the NestedFrame into its own parquet file in a specified output directory.\n", "\n", - "The base layer will be outputted to \"base.parquet\", and each nested layer will be written to a file based on its column name. So the nested layer in column `nested1` will be written to \"nested1.parquet\"." + "The base layer will be outputted to `base.parquet`, and each nested layer will be written to a file based on its column name. So the nested layer in column `nested1` will be written to `nested1.parquet`." 
] }, { @@ -233,7 +233,7 @@ "source": [ "We also support saving a `NestedFrame` as a single parquet file where the packed layers are still packed in their respective columns.\n", "\n", - "Here we provide `NestedFrame.to_parquet` with the desired path of the *single* output file (rather than the path of a directory to store *multiple* output files) and use `per_layer=False'\n", + "Here we provide `NestedFrame.to_parquet` with the desired path of the *single* output file (rather than the path of a directory to store *multiple* output files) and use `by_layer=False`.\n", "\n", "Our `read_parquet` function can load a `NestedFrame` saved in this single file parquet without requiring any additional arguments. " ] diff --git a/docs/tutorials/data_manipulation.ipynb b/docs/tutorials/data_manipulation.ipynb index 941de7a6..a0aa822d 100644 --- a/docs/tutorials/data_manipulation.ipynb +++ b/docs/tutorials/data_manipulation.ipynb @@ -49,7 +49,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, we can directly fetch a column from our nested column (aptly called \"nested\"). For example, below we can fetch the time column, \"t\", by specifying `\"nested.t\"` as the column to retrieve. This returns a \"flat\" view of the nested t column, where all rows from all dataframes are present in one dataframe." + "First, we can directly fetch a column from our nested column (aptly called \"nested\"). For example, below we can fetch the time column, \"t\", by specifying `\"nested.t\"` as the column to retrieve. This returns a \"flat\" view of the nested `t` column, where all rows from all dataframes are present in one dataframe." 
] }, { @@ -170,7 +170,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This is functionally equivalent to using `add_nested`" + "This is functionally equivalent to using `add_nested`:" ] }, { diff --git a/docs/tutorials/low_level.ipynb b/docs/tutorials/low_level.ipynb index 307366c7..02f815ba 100644 --- a/docs/tutorials/low_level.ipynb +++ b/docs/tutorials/low_level.ipynb @@ -8,7 +8,7 @@ "# Lower-level interface for performance and flexibility\n", "## Reveal the hidden power of nested Series\n", "\n", - "This section is for users looking to optimize the performance, both computationally and in memory-usage, of their workflows. This section also details a broader suite of data representations usable within `nested-pandas`.\n", + "This section is for users looking to optimize both the compute and memory performance of their workflows. This section also details a broader suite of data representations usable within `nested-pandas`.\n", "It shows how to deal with individual nested columns: add, remove, and modify data using both \"flat-array\" and \"list-array\" representations.\n", "It also demonstrates how to convert nested Series to and from different data types, like `pd.ArrowDtype`d Series, flat dataframes, list-array dataframes, and collections of nested elements." ] @@ -36,7 +36,7 @@ "source": [ "## Generate some data and get a Series of `NestedDtype` type\n", "\n", - "We are going to use built-in data generator to get a `NestedFrame` with a \"nested\" column being a `Series` of `NestedDtype` type.\n", + "We are going to use the built-in data generator to get a `NestedFrame` with a \"nested\" column being a `Series` of `NestedDtype` type.\n", "This column would represent [light curves](https://en.wikipedia.org/wiki/Light_curve) of some astronomical objects. 
" ] }, @@ -94,7 +94,7 @@ "id": "33d8caacf0bf042e", "metadata": {}, "source": [ - "You can also get a list of fields with `.fields` attribute" + "You can also get a list of fields with `.fields` attribute:" ] }, { @@ -130,7 +130,7 @@ "id": "7167f5a9c947d96f", "metadata": {}, "source": [ - "You can also get a subset of nested columns as a new nested Series" + "You can also get a subset of nested columns as a new nested Series:" ] }, { @@ -479,7 +479,7 @@ "source": [ "#### pd.Series from an array\n", "\n", - "Construction with `pyarrow` struct arrays is the cheapest way to create a nested Series. It is very semilliar to initialisation of a `pd.Series` of `pd.ArrowDtype` type." + "Construction with `pyarrow` struct arrays is the cheapest way to create a nested Series. It is very similar to the initialization of a `pd.Series` of `pd.ArrowDtype` type." ] }, { @@ -611,21 +611,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.12.6" } }, "nbformat": 4, diff --git a/docs/tutorials/nested_spectra.ipynb b/docs/tutorials/nested_spectra.ipynb index d311655d..8d2fffd4 100644 --- a/docs/tutorials/nested_spectra.ipynb +++ b/docs/tutorials/nested_spectra.ipynb @@ -79,7 +79,7 @@ "flux = np.array([])\n", "err = np.array([])\n", "index = np.array([])\n", - "# Loop over each spectrum, adding it's data to the arrays\n", + "# Loop over each spectrum, adding its data to the arrays\n", "for i, hdu in enumerate(sp):\n", " wave = np.append(wave, 10 ** hdu[\"COADD\"].data.loglam) # * u.angstrom\n", " flux = np.append(flux, hdu[\"COADD\"].data.flux * 1e-17) # * 
u.erg/u.second/u.centimeter**2/u.angstrom\n", @@ -115,7 +115,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "And we can see that each object now has the \"coadd_spectrum\" nested column with the full spectrum available." + "And we can see that each object now has the `coadd_spectrum` nested column with the full spectrum available." ] }, { @@ -161,7 +161,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -175,9 +175,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.12.6" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index c244c0aa..ff05ff4a 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -13,7 +13,26 @@ from nested_pandas.series import packer from nested_pandas.series.dtype import NestedDtype -from .utils import _ensure_spacing +from ..series.packer import pack_sorted_df_into_struct +from .utils import NestingType, check_expr_nesting + + +class NestedSeries(pd.Series): + """ + Series that were unpacked from a nest. 
+ """ + + _metadata = ["nest_name", "flat_nest"] + + @property + def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 + return NestedSeries + + @property + def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: F821 + return NestedSeries + + __pandas_priority__ = 3500 class NestedFrame(pd.DataFrame): @@ -22,8 +41,7 @@ class NestedFrame(pd.DataFrame): See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures """ - # normal properties - _metadata = ["added_property"] + __pandas_priority__ = 4500 @property def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 @@ -71,7 +89,7 @@ def __getitem__(self, item): """Adds custom __getitem__ behavior for nested columns""" if isinstance(item, str): - # Pre-empt the nested check if the item is a base column + # Preempt the nested check if the item is a base column if item in self.columns: return super().__getitem__(item) # If a nested column name is passed, return a flat series for that column @@ -289,38 +307,110 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"): else: return NestedFrame(packed_df.to_frame()) - def _split_query(self, expr) -> dict: - """Splits a pandas query into multiple subqueries for nested and base layers""" - # Ensure query has needed spacing for upcoming split - expr = _ensure_spacing(expr) - nest_exprs = {col: [] for col in self.nested_columns + ["base"]} # type: dict - split_expr = expr.split(" ") - - i = 0 - current_focus = "base" - while i < len(split_expr): - expr_slice = split_expr[i].strip("()") - # Check if it's a nested column - if self._is_known_hierarchical_column(expr_slice): - nested, colname = split_expr[i].split(".") - current_focus = nested.strip("()") - # account for parentheses - j = 0 - while j < len(nested): - if nested[j] == "(": - nest_exprs[current_focus].append("(") - j += 1 - nest_exprs[current_focus].append(colname) - # or if it's a top-level column 
- elif expr_slice in self.columns: - current_focus = "base" - nest_exprs[current_focus].append(split_expr[i]) - else: - nest_exprs[current_focus].append(split_expr[i]) - i += 1 - return {expr: " ".join(nest_exprs[expr]) for expr in nest_exprs if len(nest_exprs[expr]) > 0} + def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: + """ + + Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. + + Works the same way as `pd.DataFrame.eval`, except that this method + will also automatically unpack nested columns into NestedSeries, + and the resulting expression will have the dimensions of the unpacked + series. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ndarray, scalar, pandas object, or None + The result of the evaluation or None if ``inplace=True``. + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.assign : Can evaluate an expression or function to create new + values for a column. + eval : Evaluate a Python expression as a string using various + backends. - def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821 + Notes + ----- + For more details see the API documentation for :func:`~eval`. + For detailed examples see :ref:`enhancing performance with eval + `. 
+ + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + nested_resolvers = self._get_nested_column_resolvers() + kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (nested_resolvers,) + kwargs["inplace"] = inplace + return super().eval(expr, **kwargs) + + def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None: """ Query the columns of a NestedFrame with a boolean expression. Specified queries can target nested columns in addition to the typical column set @@ -348,6 +438,12 @@ def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821 For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. + inplace : bool + Whether to modify the DataFrame rather than creating a new one. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. 
+ Returns ------- DataFrame @@ -363,25 +459,62 @@ def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821 >>> df.query("mynested.a > 2") """ - - # Rebuild queries for each specified nested/base layer - exprs_to_use = self._split_query(expr) - - # For now (simplicity), limit query to only operating on one layer - if len(exprs_to_use.keys()) != 1: + if not isinstance(expr, str): + msg = f"expr must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + kwargs["level"] = kwargs.pop("level", 0) + 1 + kwargs["target"] = None + # At present, the query expression must be either entirely within the + # nested namespace or the base namespace. Mixed structures are not + # supported, so preflight the expression. + nesting_types = check_expr_nesting(expr) + if NestingType.NESTED in nesting_types and NestingType.BASE in nesting_types: raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each") - - # Send queries to layers - # We'll only execute 1 per the Error above, but the loop will be useful - # for when/if we allow multi-layer queries - result = self.copy() - for expr in exprs_to_use: - if expr == "base": - result = super().query(exprs_to_use["base"], inplace=False) + result = self.eval(expr, **kwargs) + # If the result is a NestedSeries, then the evaluation has caused unpacking, + # which means that a nested attribute was referenced. Apply this result + # to the nest and repack. Otherwise, apply it to this instance as usual, + # since it operated on the base attributes. 
+ try: + if isinstance(result, NestedSeries): + nest_name, flat_nest = result.nest_name, result.flat_nest + new_flat_nest = flat_nest.loc[result] + result = self.copy() + result[nest_name] = pack_sorted_df_into_struct(new_flat_nest) else: - # TODO: does not work with queries that empty the dataframe - result[expr] = result[expr].nest.query_flat(exprs_to_use[expr]) - return result + result = self.loc[result] + except ValueError: + # when res is multi-dimensional loc raises, but this is sometimes a + # valid query + result = self[result] + + if inplace: + self._update_inplace(result) + return None + else: + return result + + def _get_nested_column_resolvers(self): + class NestResolver: + def __init__(self, nest_name: str, outer): + self._nest_name = nest_name + # Save the outer frame with an eye toward repacking. + self._outer = outer + # Flattened only once for every access of this particular nest + # within the expression. + self._flat_nest = outer[nest_name].nest.to_flat() + + def __getattr__(self, item_name: str): + if item_name in self._flat_nest: + result = NestedSeries(self._flat_nest[item_name]) + # Assigning these properties directly in order to avoid any complication + # or interference with the inherited pd.Series constructor. 
+ result.nest_name = self._nest_name + result.flat_nest = self._flat_nest + return result + raise AttributeError(f"No attribute {item_name}") + + return {name: NestResolver(name, self) for name in self.nested_columns} def _resolve_dropna_target(self, on_nested, subset): """resolves the target layer for a given set of dropna kwargs""" diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py index 765ae73d..1ca1e0a2 100644 --- a/src/nested_pandas/nestedframe/utils.py +++ b/src/nested_pandas/nestedframe/utils.py @@ -1,3 +1,7 @@ +import ast +from enum import Enum + + def _ensure_spacing(expr) -> str: """Ensure that an eval string has spacing""" single_val_operators = {"+", "-", "*", "/", "%", ">", "<", "|", "&", "~", "="} # omit "(" and ")" @@ -33,3 +37,38 @@ def _ensure_spacing(expr) -> str: spaced_expr += " " i += 1 return spaced_expr + + +class NestingType(Enum): + """Types of sub-expressions possible in a NestedFrame string expression.""" + + BASE = "base" + NESTED = "nested" + + +def _expr_nesting_type(node: ast.expr | None) -> set[NestingType]: + if not isinstance(node, ast.expr): + return set() + if isinstance(node, ast.Name): + return {NestingType.BASE} + if isinstance(node, ast.Attribute): + return {NestingType.NESTED} + sources = ( + [getattr(node, "left", None), getattr(node, "right", None)] + + getattr(node, "values", []) + + getattr(node, "comparators", []) + ) + result: set[NestingType] = set() + for s in sources: + result.update(_expr_nesting_type(s)) + return result + + +def check_expr_nesting(expr: str) -> set[NestingType]: + """ + Given a string expression, parse it and visit the resulting AST, surfacing + the nesting types. The purpose is to identify expressions that attempt + to mix base and nested columns, which will need to be handled specially. 
+ """ + expr_tree = ast.parse(expr, mode="eval").body + return set(_expr_nesting_type(expr_tree)) diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index b29881d2..1f506a8a 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -4,6 +4,7 @@ import pytest from nested_pandas import NestedFrame from nested_pandas.datasets import generate_data +from nested_pandas.nestedframe.core import NestedSeries from pandas.testing import assert_frame_equal @@ -689,3 +690,44 @@ def cols_allclose(col1, col2): assert_frame_equal( result, pd.DataFrame({"allclose": [True, True, True]}, index=pd.Index([0, 1, 2], name="idx")) ) + + +def test_scientific_notation(): + """ + Test that NestedFrame.query handles constants that are written in scientific notation. + """ + # https://github.com/lincc-frameworks/nested-pandas/issues/59 + base = NestedFrame({"a": [1, 1e-2, 3]}, index=[0, 1, 2]) + selected = base.query("a > 1e-1") + assert list(selected.index) == [0, 2] + + +def test_eval(): + """ + Test basic behavior of NestedFrame.eval, and that it can handle nested references + the same as the nest accessor. 
+ """ + nf = NestedFrame( + data={"a": [1, 2, 3], "b": [2, 4, 6]}, + index=pd.Index([0, 1, 2], name="idx"), + ) + + to_pack = pd.DataFrame( + data={ + "time": [1, 2, 3, 1, 2, 4, 2, 1, 4], + "c": [0, 2, 4, 10, 4, 3, 1, 4, 1], + "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], + }, + index=pd.Index([0, 0, 0, 1, 1, 1, 2, 2, 2], name="idx"), + ) + + nf = nf.add_nested(to_pack, "packed") + p5 = nf.eval("packed.d > 5") + assert isinstance(p5, NestedSeries) + assert p5.any() + assert not p5.all() + assert list(p5.loc[p5].index) == [0, 2] + + r1 = nf.eval("packed.c + packed.d") + r2 = nf["packed"].nest["c"] + nf["packed"].nest["d"] + assert (r1 == r2).all() diff --git a/tests/nested_pandas/nestedframe/test_nestedframe_utils.py b/tests/nested_pandas/nestedframe/test_nestedframe_utils.py deleted file mode 100644 index 3908cb80..00000000 --- a/tests/nested_pandas/nestedframe/test_nestedframe_utils.py +++ /dev/null @@ -1,17 +0,0 @@ -import pytest -from nested_pandas.nestedframe import utils - - -@pytest.mark.parametrize( - "in_out", - [ - ("a>3", "a > 3"), - ("test.a>5&b==2", "test.a > 5 & b == 2"), - ("b > 3", "b > 3"), - ("(a.b > 3)&(a.c == 'f')", "(a.b > 3) & (a.c == 'f')"), - ], -) -def test_ensure_spacing(in_out): - """test a set of input queries to make sure spacing is done correctly""" - expr, output = in_out - assert utils._ensure_spacing(expr) == output diff --git a/tests/nested_pandas/utils/test_utils.py b/tests/nested_pandas/utils/test_utils.py index 5397d757..d76d5928 100644 --- a/tests/nested_pandas/utils/test_utils.py +++ b/tests/nested_pandas/utils/test_utils.py @@ -2,6 +2,7 @@ import pandas as pd import pytest from nested_pandas import NestedFrame +from nested_pandas.nestedframe.utils import NestingType, check_expr_nesting from nested_pandas.utils import count_nested @@ -43,3 +44,29 @@ def test_count_nested(join): else: assert total_counts.columns.tolist() == ["n_nested"] assert label_counts.columns.tolist() == ["n_nested_a", "n_nested_b"] + + +def test_check_expr_nesting(): 
+ """ + Test the correctness of the evaluation expression pre-flight checks, which are + used to ensure that an expression-based query does not try to combine base and nested + sub-expressions. + """ + assert check_expr_nesting("a > 2 & nested.c > 1") == {NestingType.NESTED, NestingType.BASE} + assert check_expr_nesting("(nested.c > 1) and (nested.d>2)") == {NestingType.NESTED} + assert check_expr_nesting("-1.52e-5 < abc < 35.2e2") == {NestingType.BASE} + assert check_expr_nesting("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == { + NestingType.NESTED, + NestingType.BASE, + } + + # NOTE: this correctly captures the desired behavior here, but suggests that the two nests + # are interoperable, which is too strong a claim. + assert check_expr_nesting("a.b > 2 & c.d < 5") == {NestingType.NESTED} + + assert check_expr_nesting("a>3") == {NestingType.BASE} + assert check_expr_nesting("a > 3") == {NestingType.BASE} + assert check_expr_nesting("test.a>5&b==2") == {NestingType.NESTED, NestingType.BASE} + assert check_expr_nesting("test.a > 5 & b == 2") == {NestingType.NESTED, NestingType.BASE} + assert check_expr_nesting("(a.b > 3)&(a.c == 'f')") == {NestingType.NESTED} + assert check_expr_nesting("(a.b > 3) & (a.c == 'f')") == {NestingType.NESTED} From ee57673d70e066ce918f79ba0d137d07a6933b09 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Wed, 9 Oct 2024 10:17:57 -0700 Subject: [PATCH 02/11] Import type hint syntax from the future. Also remove `_ensure_spacing`, no longer used. 
--- src/nested_pandas/nestedframe/utils.py | 40 ++------------------------ 1 file changed, 3 insertions(+), 37 deletions(-) diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py index 1ca1e0a2..0a2767ee 100644 --- a/src/nested_pandas/nestedframe/utils.py +++ b/src/nested_pandas/nestedframe/utils.py @@ -1,44 +1,10 @@ +# typing.Self and "|" union syntax don't exist in Python 3.9 +from __future__ import annotations + import ast from enum import Enum -def _ensure_spacing(expr) -> str: - """Ensure that an eval string has spacing""" - single_val_operators = {"+", "-", "*", "/", "%", ">", "<", "|", "&", "~", "="} # omit "(" and ")" - check_for_doubles = {"=", "/", "*", ">", "<"} - double_val_operators = {"==", "//", "**", ">=", "<="} - expr_list = expr - - i = 0 - spaced_expr = "" - while i < len(expr_list): - if expr_list[i] not in single_val_operators: - spaced_expr += expr_list[i] - else: - if expr_list[i] in check_for_doubles: - if "".join(expr_list[i : i + 2]) in double_val_operators: - if spaced_expr[-1] != " ": - spaced_expr += " " - spaced_expr += expr_list[i : i + 2] - if expr_list[i + 2] != " ": - spaced_expr += " " - i += 1 # skip ahead an extra time - else: - if spaced_expr[-1] != " ": - spaced_expr += " " - spaced_expr += expr_list[i] - if expr_list[i + 1] != " ": - spaced_expr += " " - else: - if spaced_expr[-1] != " ": - spaced_expr += " " - spaced_expr += expr_list[i] - if expr_list[i + 1] != " ": - spaced_expr += " " - i += 1 - return spaced_expr - - class NestingType(Enum): """Types of sub-expressions possible in a NestedFrame string expression.""" From 838ae68fe7f09f2d8f0ae492fa4dff10596fc74b Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Wed, 9 Oct 2024 12:41:48 -0700 Subject: [PATCH 03/11] Improve test coverage of new methods. 
--- src/nested_pandas/nestedframe/core.py | 2 +- .../nestedframe/test_nestedframe.py | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index ff05ff4a..1ad9e749 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -30,7 +30,7 @@ def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 @property def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: F821 - return NestedSeries + return NestedFrame __pandas_priority__ = 3500 diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 1f506a8a..77980cbf 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -15,6 +15,25 @@ def test_nestedframe_construction(): assert isinstance(base, NestedFrame) +def test_nestedseries_construction(): + """Test NestedSeries construction""" + series = NestedSeries([1, 2, 3], index=[0, 2, 4]) + + assert isinstance(series, NestedSeries) + assert series[4] == 3 + + # Exercise the constructor used during promoting operations + combine_left = NestedSeries([1, 2, 3], index=[0, 2, 4]) + pd.Series([1, 2, 3], index=[0, 2, 4]) + assert isinstance(combine_left, NestedSeries) + combine_right = pd.Series([1, 2, 3], index=[0, 2, 4]) + NestedSeries([1, 2, 3], index=[0, 2, 4]) + assert isinstance(combine_right, NestedSeries) + + # Exercising the expanddim constructor + frame = series.to_frame() + assert isinstance(frame, NestedFrame) + assert (frame[0] == [1, 2, 3]).all() + + def test_all_columns(): """Test the all_columns function""" @@ -436,6 +455,31 @@ def test_query(): nest_queried = base.query("(nested.c > 1) and (nested.d>2)") assert len(nest_queried.nested.nest.to_flat()) == 4 + # Check edge conditions + with pytest.raises(ValueError): + # Expression must be a string 
+ base.query(3 + 4) + + # Verify that inplace queries will change the shape of the instance. + base.query("(a % 2) == 1", inplace=True) + assert base.shape == (2, 3) + # A chunk of the nested rows will be gone, too. + assert base["nested.c"].shape == (6,) + assert base["nested.d"].shape == (6,) + + # Now query into the nest, throwing away most rows. First, check that + # without inplace=True, the original is not affected. + assert base.query("nested.c + nested.d > 9")["nested.c"].shape == (2,) + assert base.query("nested.c + nested.d > 9")["nested.d"].shape == (2,) + # and verify the original: + assert base["nested.c"].shape == (6,) + assert base["nested.d"].shape == (6,) + + # Then, with inplace=True, 'base' should be changed in-place. + base.query("nested.c + nested.d > 9", inplace=True) + assert base["nested.c"].shape == (2,) + assert base["nested.d"].shape == (2,) + def test_dropna(): """Test that dropna works on all layers""" From 9ad43139b1ae6d9cf62fc57b92d117954cdd20de Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Wed, 9 Oct 2024 13:01:34 -0700 Subject: [PATCH 04/11] Remove the multi-dimensional error fallback from NestedFrame.query. This logic was copied from pd.DataFrame.query, and the accompanying comment said that it was to handle an occasional case where `self.loc[b]` would raise an error on a multi-dimensional `b`, but `self[b]` would succeed. I can't cause this error with `.loc` anymore, so the code coverage complains about the unexercised exception clause. Removing it since the limitation that inspired it seems to be gone now. 
--- src/nested_pandas/nestedframe/core.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 1ad9e749..35038b94 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -475,18 +475,13 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | # which means that a nested attribute was referenced. Apply this result # to the nest and repack. Otherwise, apply it to this instance as usual, # since it operated on the base attributes. - try: - if isinstance(result, NestedSeries): - nest_name, flat_nest = result.nest_name, result.flat_nest - new_flat_nest = flat_nest.loc[result] - result = self.copy() - result[nest_name] = pack_sorted_df_into_struct(new_flat_nest) - else: - result = self.loc[result] - except ValueError: - # when res is multi-dimensional loc raises, but this is sometimes a - # valid query - result = self[result] + if isinstance(result, NestedSeries): + nest_name, flat_nest = result.nest_name, result.flat_nest + new_flat_nest = flat_nest.loc[result] + result = self.copy() + result[nest_name] = pack_sorted_df_into_struct(new_flat_nest) + else: + result = self.loc[result] if inplace: self._update_inplace(result) From bc83087e96724fb8e34db5aebccb7a1aeb332258 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Wed, 9 Oct 2024 15:33:21 -0700 Subject: [PATCH 05/11] Raise NestResolver to module context, address other feedback. 
--- .gitignore | 3 -- src/nested_pandas/nestedframe/core.py | 45 +++++++++++++++----------- src/nested_pandas/nestedframe/utils.py | 2 +- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index dfe2e99a..50990fec 100644 --- a/.gitignore +++ b/.gitignore @@ -133,9 +133,6 @@ dmypy.json # vscode .vscode/ -# PyCharm -.idea/ - # dask dask-worker-space/ diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 35038b94..a728472e 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -35,6 +35,32 @@ def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: __pandas_priority__ = 3500 +class NestResolver: + """ + Used by NestedFrame.eval to resolve the names of nested columns when + encountered in expressions, interpreting __getattr__ in terms of a + specific nest context. + """ + + def __init__(self, nest_name: str, outer: NestedFrame): + self._nest_name = nest_name + # Save the outer frame with an eye toward repacking. + self._outer = outer + # Flattened only once for every access of this particular nest + # within the expression. + self._flat_nest = outer[nest_name].nest.to_flat() + + def __getattr__(self, item_name: str): + if item_name in self._flat_nest: + result = NestedSeries(self._flat_nest[item_name]) + # Assigning these properties directly in order to avoid any complication + # or interference with the inherited pd.Series constructor. + result.nest_name = self._nest_name + result.flat_nest = self._flat_nest + return result + raise AttributeError(f"No attribute {item_name}") + + class NestedFrame(pd.DataFrame): """A Pandas Dataframe extension with support for nested structure. 
@@ -490,25 +516,6 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | return result def _get_nested_column_resolvers(self): - class NestResolver: - def __init__(self, nest_name: str, outer): - self._nest_name = nest_name - # Save the outer frame with an eye toward repacking. - self._outer = outer - # Flattened only once for every access of this particular nest - # within the expression. - self._flat_nest = outer[nest_name].nest.to_flat() - - def __getattr__(self, item_name: str): - if item_name in self._flat_nest: - result = NestedSeries(self._flat_nest[item_name]) - # Assigning these properties directly in order to avoid any complication - # or interference with the inherited pd.Series constructor. - result.nest_name = self._nest_name - result.flat_nest = self._flat_nest - return result - raise AttributeError(f"No attribute {item_name}") - return {name: NestResolver(name, self) for name in self.nested_columns} def _resolve_dropna_target(self, on_nested, subset): diff --git a/src/nested_pandas/nestedframe/utils.py b/src/nested_pandas/nestedframe/utils.py index 0a2767ee..b219cea1 100644 --- a/src/nested_pandas/nestedframe/utils.py +++ b/src/nested_pandas/nestedframe/utils.py @@ -37,4 +37,4 @@ def check_expr_nesting(expr: str) -> set[NestingType]: to mix base and nested columns, which will need to be handled specially. """ expr_tree = ast.parse(expr, mode="eval").body - return set(_expr_nesting_type(expr_tree)) + return _expr_nesting_type(expr_tree) From d996fd70dc701f9eb44c0c177b1bfb5829b77a7d Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Thu, 10 Oct 2024 10:11:11 -0700 Subject: [PATCH 06/11] Rename `NestedSeries` to `_SeriesFromNest`. Prevent users from depending on the public interface, and also from assuming that a `NestedSeries` has a nest within it, since that is the existing meaning of that prefix. 
--- src/nested_pandas/nestedframe/core.py | 8 ++++---- .../nestedframe/test_nestedframe.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index a728472e..fa3844c3 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -17,7 +17,7 @@ from .utils import NestingType, check_expr_nesting -class NestedSeries(pd.Series): +class _SeriesFromNest(pd.Series): """ Series that were unpacked from a nest. """ @@ -26,7 +26,7 @@ class NestedSeries(pd.Series): @property def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 - return NestedSeries + return _SeriesFromNest @property def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: F821 @@ -52,7 +52,7 @@ def __init__(self, nest_name: str, outer: NestedFrame): def __getattr__(self, item_name: str): if item_name in self._flat_nest: - result = NestedSeries(self._flat_nest[item_name]) + result = _SeriesFromNest(self._flat_nest[item_name]) # Assigning these properties directly in order to avoid any complication # or interference with the inherited pd.Series constructor. result.nest_name = self._nest_name @@ -501,7 +501,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | # which means that a nested attribute was referenced. Apply this result # to the nest and repack. Otherwise, apply it to this instance as usual, # since it operated on the base attributes. 
- if isinstance(result, NestedSeries): + if isinstance(result, _SeriesFromNest): nest_name, flat_nest = result.nest_name, result.flat_nest new_flat_nest = flat_nest.loc[result] result = self.copy() diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 77980cbf..54896833 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -4,7 +4,7 @@ import pytest from nested_pandas import NestedFrame from nested_pandas.datasets import generate_data -from nested_pandas.nestedframe.core import NestedSeries +from nested_pandas.nestedframe.core import _SeriesFromNest from pandas.testing import assert_frame_equal @@ -17,16 +17,16 @@ def test_nestedframe_construction(): def test_nestedseries_construction(): """Test NestedSeries construction""" - series = NestedSeries([1, 2, 3], index=[0, 2, 4]) + series = _SeriesFromNest([1, 2, 3], index=[0, 2, 4]) - assert isinstance(series, NestedSeries) + assert isinstance(series, _SeriesFromNest) assert series[4] == 3 # Exercise the constructor used during promoting operations - combine_left = NestedSeries([1, 2, 3], index=[0, 2, 4]) + pd.Series([1, 2, 3], index=[0, 2, 4]) - assert isinstance(combine_left, NestedSeries) - combine_right = pd.Series([1, 2, 3], index=[0, 2, 4]) + NestedSeries([1, 2, 3], index=[0, 2, 4]) - assert isinstance(combine_right, NestedSeries) + combine_left = _SeriesFromNest([1, 2, 3], index=[0, 2, 4]) + pd.Series([1, 2, 3], index=[0, 2, 4]) + assert isinstance(combine_left, _SeriesFromNest) + combine_right = pd.Series([1, 2, 3], index=[0, 2, 4]) + _SeriesFromNest([1, 2, 3], index=[0, 2, 4]) + assert isinstance(combine_right, _SeriesFromNest) # Exercising the expanddim constructor frame = series.to_frame() @@ -767,7 +767,7 @@ def test_eval(): nf = nf.add_nested(to_pack, "packed") p5 = nf.eval("packed.d > 5") - assert isinstance(p5, NestedSeries) + assert isinstance(p5, 
_SeriesFromNest) assert p5.any() assert not p5.all() assert list(p5.loc[p5].index) == [0, 2] From bdce04236d439a1355d9ef4f2af414c9d36478d5 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Thu, 10 Oct 2024 15:20:50 -0700 Subject: [PATCH 07/11] Add more tests of NestedFrame.eval --- src/nested_pandas/nestedframe/core.py | 4 +- .../nestedframe/test_nestedframe.py | 44 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index fa3844c3..d936d7c3 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -148,7 +148,7 @@ def __setitem__(self, key, value): # Adding a new nested structure from a column # Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5 - elif "." in key: + if "." in key: new_nested, col = key.split(".") if isinstance(value, pd.Series): value.name = col @@ -497,7 +497,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | if NestingType.NESTED in nesting_types and NestingType.BASE in nesting_types: raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each") result = self.eval(expr, **kwargs) - # If the result is a NestedSeries, then the evaluation has caused unpacking, + # If the result is a _SeriesFromNest, then the evaluation has caused unpacking, # which means that a nested attribute was referenced. Apply this result # to the nest and repack. Otherwise, apply it to this instance as usual, # since it operated on the base attributes. 
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 54896833..10fa65bf 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -774,4 +774,48 @@ def test_eval(): r1 = nf.eval("packed.c + packed.d") r2 = nf["packed"].nest["c"] + nf["packed"].nest["d"] + r3 = nf["packed.c"] + nf["packed.d"] assert (r1 == r2).all() + assert (r2 == r3).all() + + +def test_eval_funcs(): + """ + Test the ability to use expected methods and functions within eval(), + on nested columns. + """ + # Verifies https://github.com/lincc-frameworks/nested-pandas/issues/146 + nf = NestedFrame.from_flat(NestedFrame({"a": [1, 2], "b": [3, None]}, index=[1, 1]), base_columns=[]) + assert nf["nested.b"].shape == (2,) + assert nf.query("nested.b.isna()")["nested.b"].shape == (1,) + + assert nf["nested.a"].max() == nf.eval("nested.a.max()") == 2 + assert nf["nested.a"].min() == nf.eval("nested.a.min()") == 1 + + +def test_mixed_eval_funcs(): + """ + Test operations across base and nested. Whether these evaluations + work is data-dependent, since the dimensions of the base and + nested columns are not guaranteed to be compatible, but when they + are, it should work as expected. 
+ """ + nf = NestedFrame( + data={"a": [1, 2, 3], "b": [2, 4, 6]}, + index=pd.Index([0, 1, 2], name="idx"), + ) + + to_pack = pd.DataFrame( + data={ + "time": [1, 2, 3, 1, 2, 4, 2, 1, 4], + "c": [0, 2, 4, 10, 4, 3, 1, 4, 1], + "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], + }, + index=pd.Index([0, 0, 0, 1, 1, 1, 2, 2, 2], name="idx"), + ) + # Reduction + nf = nf.add_nested(to_pack, "packed") + assert (nf.eval("a + packed.c.median()") == pd.Series([4, 5, 6])).all() + + # Across the nest: each base column element applies to each of its indexes + assert (nf.eval("a + packed.c") == nf["a"] + nf["packed.c"]).all() From 67c49ffeb5e8a925b316b2c1f9367da48b29f31c Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Fri, 11 Oct 2024 13:46:33 -0700 Subject: [PATCH 08/11] Improve documentation. --- src/nested_pandas/nestedframe/core.py | 86 +++++---------------------- 1 file changed, 16 insertions(+), 70 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index d936d7c3..c991863b 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -32,6 +32,10 @@ def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: F821 return NestedFrame + # https://pandas.pydata.org/docs/development/extending.html#arithmetic-with-3rd-party-types + # The __pandas_priority__ of Series is 3000, so give _SeriesFromNest a + # higher priority, so that binary operations involving this class and + # Series produce instances of this class, preserving the type and origin. 
__pandas_priority__ = 3500 @@ -67,6 +71,10 @@ class NestedFrame(pd.DataFrame): See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures """ + # https://pandas.pydata.org/docs/development/extending.html#arithmetic-with-3rd-party-types + # The __pandas_priority__ of DataFrame is 4000, so give NestedFrame a + # higher priority, so that binary operations involving this class and + # DataFrame produce instances of this class, preserving the type and origin. __pandas_priority__ = 4500 @property @@ -336,7 +344,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"): def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: """ - Evaluate a string describing operations on DataFrame columns. + Evaluate a string describing operations on NestedFrame columns. Operates on columns only, not specific rows or elements. This allows `eval` to run arbitrary code, which can make you vulnerable to code @@ -353,83 +361,21 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: The expression string to evaluate. inplace : bool, default False If the expression contains an assignment, whether to perform the - operation inplace and mutate the existing DataFrame. Otherwise, - a new DataFrame is returned. + operation inplace and mutate the existing NestedFrame. Otherwise, + a new NestedFrame is returned. **kwargs See the documentation for :func:`eval` for complete details on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. + :meth:`~pandas.DataFrame.eval`. Returns ------- - ndarray, scalar, pandas object, or None + ndarray, scalar, pandas object, nested-pandas object, or None The result of the evaluation or None if ``inplace=True``. See Also -------- - DataFrame.query : Evaluates a boolean expression to query the columns - of a frame. - DataFrame.assign : Can evaluate an expression or function to create new - values for a column.
- eval : Evaluate a Python expression as a string using various - backends. - - Notes - ----- - For more details see the API documentation for :func:`~eval`. - For detailed examples see :ref:`enhancing performance with eval - `. - - Examples - -------- - >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) - >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 - >>> df.eval('A + B') - 0 11 - 1 10 - 2 9 - 3 8 - 4 7 - dtype: int64 - - Assignment is allowed though by default the original DataFrame is not - modified. - - >>> df.eval('C = A + B') - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 - >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 - - Multiple columns can be assigned to using multi-line expressions: - - >>> df.eval( - ... ''' - ... C = A + B - ... D = A - B - ... ''' - ... ) - A B C D - 0 1 10 11 -9 - 1 2 8 10 -6 - 2 3 6 9 -3 - 3 4 4 8 0 - 4 5 2 7 3 + https://pandas.pydata.org/docs/reference/api/pandas.eval.html """ nested_resolvers = self._get_nested_column_resolvers() kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (nested_resolvers,) @@ -472,8 +418,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | Returns ------- - DataFrame - DataFrame resulting from the provided query expression. + NestedFrame + NestedFrame resulting from the provided query expression. Notes ----- From d37abd1ada79ad4b0091f036c2be7c00a9c65d17 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Mon, 14 Oct 2024 09:55:49 -0700 Subject: [PATCH 09/11] Extend PandasExprVisitor to validate and accept nested assignment. Permits the creation of new nests and new nested columns within the `.eval()` method. 
--- src/nested_pandas/nestedframe/core.py | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index c991863b..fca71dcb 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -1,6 +1,7 @@ # typing.Self and "|" union syntax don't exist in Python 3.9 from __future__ import annotations +import ast import os import numpy as np @@ -9,6 +10,7 @@ from pandas._libs import lib from pandas._typing import Any, AnyAll, Axis, IndexLabel from pandas.api.extensions import no_default +from pandas.core.computation.expr import PARSERS, PandasExprVisitor from nested_pandas.series import packer from nested_pandas.series.dtype import NestedDtype @@ -17,6 +19,34 @@ from .utils import NestingType, check_expr_nesting +class NestedPandasExprVisitor(PandasExprVisitor): + """ + Custom expression visitor for NestedFrame evaluations, which may assign to + nested columns. + """ + + def visit_Assign(self, node, **kwargs): # noqa: N802 + """ + Visit an assignment node, which may assign to a nested column. + """ + if not isinstance(node.targets[0], ast.Attribute): + # If the target is not an attribute, then it's a simple assignment as usual + return super().visit_Assign(node) + target = node.targets[0] + if not isinstance(target.value, ast.Name): + raise ValueError("Assignments to nested columns must be of the form `nested.col = ...`") + # target.value.id will be the name of the nest, target.attr is the column name. + # Describing the proper target for the assigner is enough for both overwrite and + # creation of new columns. The assigner will be a string like "nested.col". + # This works both for the creation of new nest members and new nests. + self.assigner = f"{target.value.id}.{target.attr}" + # Continue visiting. 
+ return self.visit(node.value, **kwargs) + + +PARSERS["nested-pandas"] = NestedPandasExprVisitor + + class _SeriesFromNest(pd.Series): """ Series that were unpacked from a nest. @@ -380,6 +410,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: nested_resolvers = self._get_nested_column_resolvers() kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (nested_resolvers,) kwargs["inplace"] = inplace + kwargs["parser"] = "nested-pandas" return super().eval(expr, **kwargs) def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None: From 2b338c6a2661784449f04bbb2b6ce8f8d6f82153 Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Mon, 14 Oct 2024 16:56:30 -0700 Subject: [PATCH 10/11] Support multi-line evaluation with nest assignment. Include more unit tests of `NestedFrame.eval`. --- src/nested_pandas/nestedframe/core.py | 53 +++++++++++--- .../nestedframe/test_nestedframe.py | 72 +++++++++++++++++++ 2 files changed, 117 insertions(+), 8 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index fca71dcb..d772b1cb 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -69,11 +69,52 @@ def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: __pandas_priority__ = 3500 -class NestResolver: +class NestResolver(dict): """ - Used by NestedFrame.eval to resolve the names of nested columns when + Used by NestedFrame.eval to resolve the names of nests at the top level. + While the resolver is normally a dictionary, with values that are fixed + upon entering evaluation, this object needs to be dynamic so that it can + support multi-line expressions, where new nests may be created during + evaluation. + """ + + def __init__(self, outer: NestedFrame): + self._outer = outer + super().__init__() + + def __contains__(self, item): + if not isinstance(item, str): + return False + top_nest = item if "." 
not in item else item.split(".")[0].strip() + return top_nest in self._outer.nested_columns + + def __len__(self): + return len(self._outer.nested_columns) + + def __getitem__(self, item): + if not isinstance(item, str): + raise KeyError(f"Unknown nest {item}") + top_nest = item if "." not in item else item.split(".")[0].strip() + if not super().__contains__(top_nest): + if top_nest not in self._outer.nested_columns: + raise KeyError(f"Unknown nest {top_nest}") + super().__setitem__(top_nest, NestedFieldResolver(top_nest, self._outer)) + return super().__getitem__(top_nest) + + def __setitem__(self, key, value): + # Called to update the resolver with intermediate values. + # The important point is to intercept the call so that the evaluator + # does not create any new resolvers on the fly. Storing the value + # is not important, since that will have been done already in + # the NestedFrame. + pass + + +class NestedFieldResolver: + """ + Used by NestedFrame.eval to resolve the names of fields in nested columns when encountered in expressions, interpreting __getattr__ in terms of a - specific nest context. + specific nest. 
""" def __init__(self, nest_name: str, outer: NestedFrame): @@ -407,8 +448,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: -------- https://pandas.pydata.org/docs/reference/api/pandas.eval.html """ - nested_resolvers = self._get_nested_column_resolvers() - kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (nested_resolvers,) + kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (NestResolver(self),) kwargs["inplace"] = inplace kwargs["parser"] = "nested-pandas" return super().eval(expr, **kwargs) @@ -492,9 +532,6 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | else: return result - def _get_nested_column_resolvers(self): - return {name: NestResolver(name, self) for name in self.nested_columns} - def _resolve_dropna_target(self, on_nested, subset): """resolves the target layer for a given set of dropna kwargs""" diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 10fa65bf..1590529b 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -819,3 +819,75 @@ def test_mixed_eval_funcs(): # Across the nest: each base column element applies to each of its indexes assert (nf.eval("a + packed.c") == nf["a"] + nf["packed.c"]).all() + + +def test_eval_assignment(): + """ + Test eval strings that perform assignment, within base columns, nested columns, + and across base and nested. 
+ """ + nf = NestedFrame( + data={"a": [1, 2, 3], "b": [2, 4, 6]}, + index=pd.Index([0, 1, 2], name="idx"), + ) + to_pack = pd.DataFrame( + data={ + "time": [1, 2, 3, 1, 2, 4, 2, 1, 4], + "c": [0, 2, 4, 10, 4, 3, 1, 4, 1], + "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], + }, + index=pd.Index([0, 0, 0, 1, 1, 1, 2, 2, 2], name="idx"), + ) + nf = nf.add_nested(to_pack, "packed") + # Assigning to new base columns from old base columns + nf_b = nf.eval("c = a + 1") + assert len(nf_b.columns) == len(nf.columns) + 1 + assert (nf_b["c"] == nf["a"] + 1).all() + + # Assigning to new nested columns from old nested columns + nf_nc = nf.eval("packed.e = packed.c + 1") + assert len(nf_nc.packed.nest.fields) == len(nf["packed"].nest.fields) + 1 + assert (nf_nc["packed.e"] == nf["packed.c"] + 1).all() + + # Verify that overwriting a nested column works + nf_nc_2 = nf_nc.eval("packed.e = packed.c * 2") + assert len(nf_nc_2.packed.nest.fields) == len(nf_nc["packed"].nest.fields) + assert (nf_nc_2["packed.e"] == nf["packed.c"] * 2).all() + + # Assigning to new nested columns from a combo of base and nested + nf_nx = nf.eval("packed.f = a + packed.c") + assert len(nf_nx.packed.nest.fields) == len(nf["packed"].nest.fields) + 1 + assert (nf_nx["packed.f"] == nf["a"] + nf["packed.c"]).all() + assert (nf_nx["packed.f"] == pd.Series([1, 3, 5, 12, 6, 5, 4, 7, 4], index=to_pack.index)).all() + + # Assigning to new base columns from nested columns. This can't be done because + # it would attempt to create base column values that were "between indexes", or as + # Pandas puts, duplicate index labels. 
+ with pytest.raises(ValueError): + nf.eval("g = packed.c * 2") + + # Create new nests via eval() + nf_n2 = nf.eval("p2.c2 = packed.c * 2") + assert len(nf_n2.p2.nest.fields) == 1 + assert (nf_n2["p2.c2"] == nf["packed.c"] * 2).all() + assert (nf_n2["p2.c2"] == pd.Series([0, 4, 8, 20, 8, 6, 2, 8, 2], index=to_pack.index)).all() + assert len(nf_n2.columns) == len(nf.columns) + 1 # new packed column + assert len(nf_n2.p2.nest.fields) == 1 + + # Assigning to new columns across two different nests + nf_n3 = nf_n2.eval("p2.d = p2.c2 + packed.d * 2 + b") + assert len(nf_n3.p2.nest.fields) == 2 + assert (nf_n3["p2.d"] == nf_n2["p2.c2"] + nf["packed.d"] * 2 + nf["b"]).all() + + # Now test multiline and inplace=True + nf.eval( + """ + c = a + b + p2.e = packed.d * 2 + c + p2.f = p2.e + b + """, + inplace=True, + ) + assert len(nf.p2.nest.fields) == 2 + assert (nf["p2.e"] == nf["packed.d"] * 2 + nf.c).all() + assert (nf["p2.f"] == nf["p2.e"] + nf.b).all() From 437cca7b76930630a704d32a531a83a889cafdfc Mon Sep 17 00:00:00 2001 From: Derek Jones Date: Tue, 15 Oct 2024 10:06:30 -0700 Subject: [PATCH 11/11] Make helper classes private, address code coverage issues. Some of the dict-like overloads were an overachievement. --- src/nested_pandas/nestedframe/core.py | 15 ++++----------- .../nested_pandas/nestedframe/test_nestedframe.py | 4 ++++ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index d772b1cb..bf2ea6fb 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -69,7 +69,7 @@ def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: __pandas_priority__ = 3500 -class NestResolver(dict): +class _NestResolver(dict): """ Used by NestedFrame.eval to resolve the names of nests at the top level. 
While the resolver is normally a dictionary, with values that are fixed @@ -83,22 +83,15 @@ def __init__(self, outer: NestedFrame): super().__init__() def __contains__(self, item): - if not isinstance(item, str): - return False top_nest = item if "." not in item else item.split(".")[0].strip() return top_nest in self._outer.nested_columns - def __len__(self): - return len(self._outer.nested_columns) - def __getitem__(self, item): - if not isinstance(item, str): - raise KeyError(f"Unknown nest {item}") top_nest = item if "." not in item else item.split(".")[0].strip() if not super().__contains__(top_nest): if top_nest not in self._outer.nested_columns: raise KeyError(f"Unknown nest {top_nest}") - super().__setitem__(top_nest, NestedFieldResolver(top_nest, self._outer)) + super().__setitem__(top_nest, _NestedFieldResolver(top_nest, self._outer)) return super().__getitem__(top_nest) def __setitem__(self, key, value): @@ -110,7 +103,7 @@ def __setitem__(self, key, value): pass -class NestedFieldResolver: +class _NestedFieldResolver: """ Used by NestedFrame.eval to resolve the names of fields in nested columns when encountered in expressions, interpreting __getattr__ in terms of a @@ -448,7 +441,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: -------- https://pandas.pydata.org/docs/reference/api/pandas.eval.html """ - kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (NestResolver(self),) + kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (_NestResolver(self),) kwargs["inplace"] = inplace kwargs["parser"] = "nested-pandas" return super().eval(expr, **kwargs) diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 1590529b..eb7c3f3d 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -860,6 +860,10 @@ def test_eval_assignment(): assert (nf_nx["packed.f"] == nf["a"] + 
nf["packed.c"]).all() assert (nf_nx["packed.f"] == pd.Series([1, 3, 5, 12, 6, 5, 4, 7, 4], index=to_pack.index)).all() + # Only supporting one level of nesting at present. + with pytest.raises(ValueError): + nf.eval("packed.c.inner = packed.c * 2 + packed.d") + # Assigning to new base columns from nested columns. This can't be done because # it would attempt to create base column values that were "between indexes", or as # Pandas puts, duplicate index labels.