From 0bc249f43576b047d0df6f500e0419d8fdd92162 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Tue, 22 Apr 2025 13:38:46 -0400 Subject: [PATCH 1/2] PPT 2.0.6 --- .copier-answers.yml | 2 +- .pre-commit-config.yaml | 2 +- .setup_dev.sh | 15 ++++++++++++--- pyproject.toml | 3 +++ 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.copier-answers.yml b/.copier-answers.yml index 86868879..9ab11c2c 100644 --- a/.copier-answers.yml +++ b/.copier-answers.yml @@ -1,5 +1,5 @@ # Changes here will be overwritten by Copier -_commit: v2.0.5 +_commit: v2.0.6 _src_path: gh:lincc-frameworks/python-project-template author_email: brantd@uw.edu author_name: LINCC Frameworks diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 34410d40..c619f918 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,7 +97,7 @@ repos: "-d", # Flag for cached environment and doctrees "./docs/_build/doctrees", # Directory "-D", # Flag to override settings in conf.py - "exclude_patterns=notebooks/*", # Exclude our notebooks from pre-commit + "exclude_patterns=notebooks/*,_build", # Exclude notebooks and build dir from pre-commit ] # Run unit tests, verify that they pass. Note that coverage is run against # the ./src directory here because that is what will be committed. 
diff --git a/.setup_dev.sh b/.setup_dev.sh index d8cd955c..5286e41e 100644 --- a/.setup_dev.sh +++ b/.setup_dev.sh @@ -1,10 +1,19 @@ #!/usr/bin/env bash +# Bash Unofficial strict mode (http://redsymbol.net/articles/unofficial-bash-strict-mode/) +# and (https://disconnected.systems/blog/another-bash-strict-mode/) +set -o nounset # Any uninitialized variable is an error +set -o errexit # Exit the script on the failure of any command to execute without error +set -o pipefail # Fail command pipelines on the failure of any individual step +IFS=$'\n\t' #set internal field separator to avoid iteration errors +# Trap all exits and output something helpful +trap 's=$?; echo "$0: Error on line "$LINENO": $BASH_COMMAND"; exit $s' ERR + # This script should be run by new developers to install this package in # editable mode and configure their local environment echo "Checking virtual environment" -if [ -z "${VIRTUAL_ENV}" ] && [ -z "${CONDA_PREFIX}" ]; then +if [ "${VIRTUAL_ENV:-missing}" = "missing" ] && [ "${CONDA_PREFIX:-missing}" = "missing" ]; then echo 'No virtual environment detected: none of $VIRTUAL_ENV or $CONDA_PREFIX is set.' echo echo "=== This script is going to install the project in the system python environment ===" @@ -20,7 +29,7 @@ fi echo "Checking pip version" MINIMUM_PIP_VERSION=22 -pipversion=( $(python -m pip --version | awk '{print $2}' | sed 's/\./ /g') ) +pipversion=( $(python -m pip --version | awk '{print $2}' | sed 's/\./\n\t/g') ) if let "${pipversion[0]}<${MINIMUM_PIP_VERSION}"; then echo "Insufficient version of pip found. Requires at least version ${MINIMUM_PIP_VERSION}." echo "See https://lincc-ppt.readthedocs.io/ for details." @@ -32,7 +41,7 @@ python -m pip install -e . 
> /dev/null echo "Installing developer dependencies in local environment" python -m pip install -e .'[dev]' > /dev/null -if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt; fi +if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi echo "Installing pre-commit" pre-commit install > /dev/null diff --git a/pyproject.toml b/pyproject.toml index 18a86cac..59359078 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,10 @@ write_to = "src/nested_pandas/_version.py" [tool.pytest.ini_options] testpaths = [ "tests", + "src", + "docs", ] +addopts = "--doctest-modules --doctest-glob=*.rst" [tool.black] line-length = 110 From 1417e5f824b1aa5ce7d332b9fcea3cd10ababce6 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Tue, 22 Apr 2025 14:38:09 -0400 Subject: [PATCH 2/2] Fix doc-strings and NestedDtype.__repr__ --- src/nested_pandas/datasets/generation.py | 5 ++- src/nested_pandas/nestedframe/core.py | 48 +++++++++++++++--------- src/nested_pandas/nestedframe/io.py | 4 +- src/nested_pandas/series/accessor.py | 2 +- src/nested_pandas/series/dtype.py | 3 ++ src/nested_pandas/utils/utils.py | 6 ++- 6 files changed, 45 insertions(+), 23 deletions(-) diff --git a/src/nested_pandas/datasets/generation.py b/src/nested_pandas/datasets/generation.py index a37ee919..b8f263ec 100644 --- a/src/nested_pandas/datasets/generation.py +++ b/src/nested_pandas/datasets/generation.py @@ -24,8 +24,9 @@ def generate_data(n_base, n_layer, seed=None) -> NestedFrame: Examples -------- - >>> nested_pandas.datasets.generate_data(10,100) - >>> nested_pandas.datasets.generate_data(10, {"nested_a": 100, "nested_b": 200}) + >>> from nested_pandas.datasets import generate_data + >>> nf1 = generate_data(10,100) + >>> nf2 = generate_data(10, {"nested_a": 100, "nested_b": 200}) """ # use provided seed, "None" acts as if no seed is provided randomstate = np.random.RandomState(seed=seed) diff --git 
a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 4005ff9e..36fed5f7 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -278,7 +278,7 @@ def add_nested( Examples -------- - + >>> import nested_pandas as npd >>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, ... index=[0,1,2]) >>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]}, @@ -320,11 +320,12 @@ def nest_lists(self, name: str, columns: list[str]) -> NestedFrame: Examples -------- + >>> import nested_pandas as npd >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6], ... "e":[[1,2,3], [4,5,6], [7,8,9]]}, ... index=[0,1,2]) - >>> nf.nest_lists(columns=["c","d"], name="nested") + >>> nf.nest_lists(columns=["e"], name="nested") c d nested 0 1 2 [{e: 1}; …] (3 rows) 1 2 4 [{e: 4}; …] (3 rows) @@ -367,6 +368,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, Examples -------- + >>> import nested_pandas as npd >>> nf = npd.NestedFrame({"a":[1,1,1,2,2], "b":[2,2,2,4,4], ... "c":[1,2,3,4,5], "d":[2,4,6,8,10]}, ... index=[0,0,0,1,1]) @@ -424,6 +426,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"): Examples -------- + >>> import nested_pandas as npd >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6], ... "e":[[1,2,3], [4,5,6], [7,8,9]]}, ... index=[0,1,2]) @@ -605,7 +608,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | >>> nf = nf.query("nested.t > 10") >>> nf - a b nested + a b nested 0 0.417022 0.184677 [{t: 13.40935, flux: 98.886109, band: 'g'}; …]... 1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, band: 'g'}; …]... 2 0.000114 0.691121 [{t: 11.173797, flux: 28.044399, band: 'r'}; …... @@ -619,9 +622,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | with rows of that particular nested structure filtered. 
For example, querying the NestedFrame "df" with nested structure "my_nested" as below will return all rows of df, but with mynested filtered by the - condition: - - >>> df.query("mynested.a > 2") + condition: `nf.query("mynested.a > 2")` """ if not isinstance(expr, str): msg = f"expr must be a string to be evaluated, {type(expr)} given" @@ -786,7 +787,7 @@ def dropna( >>> # this query empties several of the nested dataframes >>> nf = nf.query("nested.t > 19") >>> nf - a b nested + a b nested 0 0.417022 0.184677 None 1 0.720324 0.372520 [{t: 19.365232, flux: 90.85955, band: 'r'}] 2 0.000114 0.691121 [{t: 19.157791, flux: 14.672857, band: 'r'}] @@ -796,7 +797,7 @@ def dropna( >>> # dropna removes rows with those emptied dataframes >>> nf.dropna(subset="nested") - a b nested + a b nested 1 0.720324 0.372520 [{t: 19.365232, flux: 90.85955, band: 'r'}] 2 0.000114 0.691121 [{t: 19.157791, flux: 14.672857, band: 'r'}] @@ -806,9 +807,20 @@ def dropna( >>> nf = generate_data(5,5, seed=1) >>> # Either on the whole dataframe >>> nf.dropna(on_nested="nested") + a b nested + 0 0.417022 0.184677 [{t: 8.38389, flux: 31.551563, band: 'r'}; …] ... + 1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, band: 'g'}; …]... + 2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, band: 'g'}; …]... + 3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, band: 'g'}; …]... + 4 0.146756 1.077633 [{t: 0.547752, flux: 75.014431, band: 'g'}; …]... >>> # or on a specific nested column >>> nf.dropna(subset="nested.t") - + a b nested + 0 0.417022 0.184677 [{t: 8.38389, flux: 31.551563, band: 'r'}; …] ... + 1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, band: 'g'}; …]... + 2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, band: 'g'}; …]... + 3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, band: 'g'}; …]... + 4 0.146756 1.077633 [{t: 0.547752, flux: 75.014431, band: 'g'}; …]... 
Notes ----- @@ -909,7 +921,7 @@ def sort_values( >>> # Sort nested values >>> nf.sort_values(by="nested.band") - a b nested + a b nested 0 0.417022 0.184677 [{t: 13.40935, flux: 98.886109, band: 'g'}; …]... 1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, band: 'g'}; …]... 2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, band: 'g'}; …]... @@ -1017,13 +1029,15 @@ def reduce(self, func, *args, infer_nesting=True, **kwargs) -> NestedFrame: # t >>> from nested_pandas.datasets.generation import generate_data >>> import numpy as np >>> nf = generate_data(5,5, seed=1) - + >>> >>> # define a custom user function + >>> # reduce will return a NestedFrame with two columns >>> def example_func(base_col, nested_col): - >>> '''reduce will return a NestedFrame with two columns''' - >>> return {"mean": np.mean(nested_col), - ... "mean_minus_base": np.mean(nested_col) - base_col} - + ... return { + ... "mean": np.mean(nested_col), + ... "mean_minus_base": np.mean(nested_col) - base_col, + ... } + >>> >>> # apply the function >>> nf.reduce(example_func, "a", "nested.t") mean mean_minus_base @@ -1038,8 +1052,8 @@ def reduce(self, func, *args, infer_nesting=True, **kwargs) -> NestedFrame: # t >>> # define a custom user function that returns nested structure >>> def example_func(base_col1, base_col2, nested_col): - >>> '''reduce will return a NestedFrame with nested structure''' - >>> return {"offsets.t_a": nested_col - base_col1, + ... '''reduce will return a NestedFrame with nested structure''' + ... return {"offsets.t_a": nested_col - base_col1, ... 
"offsets.t_b": nested_col - base_col2} By giving both output columns the prefix "offsets.", we signal diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index 1b3d0296..4b817706 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -64,12 +64,12 @@ def read_parquet( Simple loading example: >>> import nested_pandas as npd - >>> nf = npd.read_parquet("path/to/file.parquet") + >>> nf = npd.read_parquet("path/to/file.parquet") # doctest: +SKIP Partial loading: >>> #Load only the "flux" sub-column of the "nested" column - >>> nf = npd.read_parquet("path/to/file.parquet", columns=["a", "nested.flux"]) + >>> nf = npd.read_parquet("path/to/file.parquet", columns=["a", "nested.flux"]) # doctest: +SKIP """ # Type convergence for reject_nesting diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index 0757d149..550c6b88 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -258,7 +258,7 @@ def with_list_field(self, field: str, value: ArrayLike) -> pd.Series: ... [["g","g"], ... 
["r","r"]]) >>> # Look at one row of the series - >>> nested_with_avg[0] + >>> nf_new_band[0] t flux band new_band 0 2.935118 39.676747 g g 1 3.725204 41.919451 r g diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py index c6231b51..4ae9ea5b 100644 --- a/src/nested_pandas/series/dtype.py +++ b/src/nested_pandas/series/dtype.py @@ -56,6 +56,9 @@ def name(self) -> str: fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()]) return f"nested<{fields}>" + def __repr__(self) -> str: + return self.name + @classmethod def construct_array_type(cls) -> Type[ExtensionArray]: """Corresponded array type, always NestedExtensionArray""" diff --git a/src/nested_pandas/utils/utils.py b/src/nested_pandas/utils/utils.py index f8cf625d..dceee867 100644 --- a/src/nested_pandas/utils/utils.py +++ b/src/nested_pandas/utils/utils.py @@ -27,8 +27,12 @@ def count_nested(df, nested, by=None, join=True) -> NestedFrame: Examples -------- + >>> import pandas as pd + >>> # Show all columns + >>> pd.set_option("display.width", 200) + >>> pd.set_option("display.max_columns", None) >>> from nested_pandas.datasets.generation import generate_data - >>> nf = generate_data(5,10,seed=1) + >>> nf = generate_data(5, 10, seed=1) >>> from nested_pandas.utils import count_nested >>> count_nested(nf, "nested")