.copier-answers.yml (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
# Changes here will be overwritten by Copier
_commit: v2.0.5
_commit: v2.0.6
_src_path: gh:lincc-frameworks/python-project-template
author_email: [email protected]
author_name: LINCC Frameworks
.pre-commit-config.yaml (2 changes: 1 addition & 1 deletion)
@@ -97,7 +97,7 @@ repos:
"-d", # Flag for cached environment and doctrees
"./docs/_build/doctrees", # Directory
"-D", # Flag to override settings in conf.py
"exclude_patterns=notebooks/*", # Exclude our notebooks from pre-commit
"exclude_patterns=notebooks/*,_build", # Exclude notebooks and build dir from pre-commit
]
# Run unit tests, verify that they pass. Note that coverage is run against
# the ./src directory here because that is what will be committed. In the
.setup_dev.sh (15 changes: 12 additions & 3 deletions)
@@ -1,10 +1,19 @@
#!/usr/bin/env bash

# Bash Unofficial strict mode (http://redsymbol.net/articles/unofficial-bash-strict-mode/)
# and (https://disconnected.systems/blog/another-bash-strict-mode/)
set -o nounset # Any uninitialized variable is an error
set -o errexit # Exit the script on the failure of any command to execute without error
set -o pipefail # Fail command pipelines on the failure of any individual step
IFS=$'\n\t' #set internal field separator to avoid iteration errors
# Trap all exits and output something helpful
trap 's=$?; echo "$0: Error on line "$LINENO": $BASH_COMMAND"; exit $s' ERR

# This script should be run by new developers to install this package in
# editable mode and configure their local environment

echo "Checking virtual environment"
if [ -z "${VIRTUAL_ENV}" ] && [ -z "${CONDA_PREFIX}" ]; then
if [ "${VIRTUAL_ENV:-missing}" = "missing" ] && [ "${CONDA_PREFIX:-missing}" = "missing" ]; then
echo 'No virtual environment detected: none of $VIRTUAL_ENV or $CONDA_PREFIX is set.'
echo
echo "=== This script is going to install the project in the system python environment ==="
@@ -20,7 +29,7 @@ fi

echo "Checking pip version"
MINIMUM_PIP_VERSION=22
pipversion=( $(python -m pip --version | awk '{print $2}' | sed 's/\./ /g') )
pipversion=( $(python -m pip --version | awk '{print $2}' | sed 's/\./\n\t/g') )
if let "${pipversion[0]}<${MINIMUM_PIP_VERSION}"; then
echo "Insufficient version of pip found. Requires at least version ${MINIMUM_PIP_VERSION}."
echo "See https://lincc-ppt.readthedocs.io/ for details."
@@ -32,7 +41,7 @@ python -m pip install -e . > /dev/null

echo "Installing developer dependencies in local environment"
python -m pip install -e .'[dev]' > /dev/null
if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt; fi
if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi

echo "Installing pre-commit"
pre-commit install > /dev/null
pyproject.toml (3 changes: 3 additions & 0 deletions)
@@ -56,7 +56,10 @@ write_to = "src/nested_pandas/_version.py"
[tool.pytest.ini_options]
testpaths = [
"tests",
"src",
"docs",
]
addopts = "--doctest-modules --doctest-glob=*.rst"

[tool.black]
line-length = 110
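
With "src" and "docs" added to `testpaths` and the new `addopts` line, a plain `pytest` run now collects the docstring examples shown throughout this diff (via `--doctest-modules`) as well as any `>>>` blocks in reStructuredText docs (via `--doctest-glob=*.rst`). A minimal sketch of the kind of docstring doctest this configuration picks up; the module and function below are hypothetical, not part of the PR:

```python
# hypothetical_module.py -- under the addopts above, pytest collects and runs
# the ">>>" example in this docstring as a test.
def double(x):
    """Return twice the input.

    >>> double(4)
    8
    """
    return 2 * x
```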
src/nested_pandas/datasets/generation.py (5 changes: 3 additions & 2 deletions)
@@ -24,8 +24,9 @@ def generate_data(n_base, n_layer, seed=None) -> NestedFrame:

Examples
--------
>>> nested_pandas.datasets.generate_data(10,100)
>>> nested_pandas.datasets.generate_data(10, {"nested_a": 100, "nested_b": 200})
>>> from nested_pandas.datasets import generate_data
>>> nf1 = generate_data(10,100)
>>> nf2 = generate_data(10, {"nested_a": 100, "nested_b": 200})
"""
# use provided seed, "None" acts as if no seed is provided
randomstate = np.random.RandomState(seed=seed)
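
A minimal usage sketch of the updated example; the assumption that the dictionary form creates one nested column per key (as the names "nested_a"/"nested_b" suggest) is mine, not stated in the diff:

```python
# Sketch: exercise both call forms shown in the docstring above.
from nested_pandas.datasets import generate_data

nf1 = generate_data(10, 100, seed=1)                                 # single nested column
nf2 = generate_data(10, {"nested_a": 100, "nested_b": 200}, seed=1)  # one column per key (assumed)
print(len(nf1), len(nf2))   # 10 base rows each
print(nf2.columns)          # expected to include "nested_a" and "nested_b"
```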
src/nested_pandas/nestedframe/core.py (48 changes: 31 additions & 17 deletions)
@@ -278,7 +278,7 @@ def add_nested(
Examples
--------


>>> import nested_pandas as npd
>>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]},
... index=[0,1,2])
>>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]},
@@ -320,11 +320,12 @@ def nest_lists(self, name: str, columns: list[str]) -> NestedFrame:
Examples
--------

>>> import nested_pandas as npd
>>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6],
... "e":[[1,2,3], [4,5,6], [7,8,9]]},
... index=[0,1,2])

>>> nf.nest_lists(columns=["c","d"], name="nested")
>>> nf.nest_lists(columns=["e"], name="nested")
c d nested
0 1 2 [{e: 1}; …] (3 rows)
1 2 4 [{e: 4}; …] (3 rows)
@@ -367,6 +368,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None,
Examples
--------

>>> import nested_pandas as npd
>>> nf = npd.NestedFrame({"a":[1,1,1,2,2], "b":[2,2,2,4,4],
... "c":[1,2,3,4,5], "d":[2,4,6,8,10]},
... index=[0,0,0,1,1])
@@ -424,6 +426,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
Examples
--------

>>> import nested_pandas as npd
>>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6],
... "e":[[1,2,3], [4,5,6], [7,8,9]]},
... index=[0,1,2])
@@ -605,7 +608,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |

>>> nf = nf.query("nested.t > 10")
>>> nf
a b nested
a b nested
0 0.417022 0.184677 [{t: 13.40935, flux: 98.886109, band: 'g'}; …]...
1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, band: 'g'}; …]...
2 0.000114 0.691121 [{t: 11.173797, flux: 28.044399, band: 'r'}; …...
@@ -619,9 +622,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
with rows of that particular nested structure filtered. For example,
querying the NestedFrame "df" with nested structure "my_nested" as
below will return all rows of df, but with mynested filtered by the
condition:

>>> df.query("mynested.a > 2")
condition: `nf.query("mynested.a > 2")`
"""
if not isinstance(expr, str):
msg = f"expr must be a string to be evaluated, {type(expr)} given"
@@ -786,7 +787,7 @@ def dropna(
>>> # this query empties several of the nested dataframes
>>> nf = nf.query("nested.t > 19")
>>> nf
a b nested
a b nested
0 0.417022 0.184677 None
1 0.720324 0.372520 [{t: 19.365232, flux: 90.85955, band: 'r'}]
2 0.000114 0.691121 [{t: 19.157791, flux: 14.672857, band: 'r'}]
@@ -796,7 +797,7 @@ def dropna(

>>> # dropna removes rows with those emptied dataframes
>>> nf.dropna(subset="nested")
a b nested
a b nested
1 0.720324 0.372520 [{t: 19.365232, flux: 90.85955, band: 'r'}]
2 0.000114 0.691121 [{t: 19.157791, flux: 14.672857, band: 'r'}]

@@ -806,9 +807,20 @@ def dropna(
>>> nf = generate_data(5,5, seed=1)
>>> # Either on the whole dataframe
>>> nf.dropna(on_nested="nested")
a b nested
0 0.417022 0.184677 [{t: 8.38389, flux: 31.551563, band: 'r'}; …] ...
1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, band: 'g'}; …]...
2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, band: 'g'}; …]...
3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, band: 'g'}; …]...
4 0.146756 1.077633 [{t: 0.547752, flux: 75.014431, band: 'g'}; …]...
>>> # or on a specific nested column
>>> nf.dropna(subset="nested.t")

a b nested
0 0.417022 0.184677 [{t: 8.38389, flux: 31.551563, band: 'r'}; …] ...
1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, band: 'g'}; …]...
2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, band: 'g'}; …]...
3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, band: 'g'}; …]...
4 0.146756 1.077633 [{t: 0.547752, flux: 75.014431, band: 'g'}; …]...

Notes
-----
@@ -909,7 +921,7 @@ def sort_values(

>>> # Sort nested values
>>> nf.sort_values(by="nested.band")
a b nested
a b nested
0 0.417022 0.184677 [{t: 13.40935, flux: 98.886109, band: 'g'}; …]...
1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, band: 'g'}; …]...
2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, band: 'g'}; …]...
@@ -1017,13 +1029,15 @@ def reduce(self, func, *args, infer_nesting=True, **kwargs) -> NestedFrame: # t
>>> from nested_pandas.datasets.generation import generate_data
>>> import numpy as np
>>> nf = generate_data(5,5, seed=1)

>>>
>>> # define a custom user function
>>> # reduce will return a NestedFrame with two columns
>>> def example_func(base_col, nested_col):
>>> '''reduce will return a NestedFrame with two columns'''
>>> return {"mean": np.mean(nested_col),
... "mean_minus_base": np.mean(nested_col) - base_col}

... return {
... "mean": np.mean(nested_col),
... "mean_minus_base": np.mean(nested_col) - base_col,
... }
>>>
>>> # apply the function
>>> nf.reduce(example_func, "a", "nested.t")
mean mean_minus_base
Expand All @@ -1038,8 +1052,8 @@ def reduce(self, func, *args, infer_nesting=True, **kwargs) -> NestedFrame: # t

>>> # define a custom user function that returns nested structure
>>> def example_func(base_col1, base_col2, nested_col):
>>> '''reduce will return a NestedFrame with nested structure'''
>>> return {"offsets.t_a": nested_col - base_col1,
... '''reduce will return a NestedFrame with nested structure'''
... return {"offsets.t_a": nested_col - base_col1,
... "offsets.t_b": nested_col - base_col2}

By giving both output columns the prefix "offsets.", we signal
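
A short sketch restating the nested-query behaviour described in the `query()` docstring above (base rows are returned intact; only the rows inside each nested frame are filtered). Column names follow `generate_data`: base columns "a"/"b" and a nested column with "t"/"flux"/"band":

```python
# Sketch of nested-column querying, per the documented semantics above.
from nested_pandas.datasets import generate_data

nf = generate_data(5, 5, seed=1)
out = nf.query("nested.t > 10")   # same call as the docstring example
print(len(out))                   # per the docstring, every base row is kept
print(out)                        # each row's nested frame is filtered by the condition
```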
src/nested_pandas/nestedframe/io.py (4 changes: 2 additions & 2 deletions)
@@ -64,12 +64,12 @@ def read_parquet(
Simple loading example:

>>> import nested_pandas as npd
>>> nf = npd.read_parquet("path/to/file.parquet")
>>> nf = npd.read_parquet("path/to/file.parquet") # doctest: +SKIP

Partial loading:

>>> #Load only the "flux" sub-column of the "nested" column
>>> nf = npd.read_parquet("path/to/file.parquet", columns=["a", "nested.flux"])
>>> nf = npd.read_parquet("path/to/file.parquet", columns=["a", "nested.flux"]) # doctest: +SKIP
"""

# Type convergence for reject_nesting
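
The `# doctest: +SKIP` directives keep these illustrative calls out of the new `--doctest-modules` run, since "path/to/file.parquet" is a placeholder rather than a real file. A hypothetical illustration of the directive (not part of the PR):

```python
# The SKIP flag leaves the example visible in rendered docs, but doctest
# never executes the skipped line, so the placeholder path cannot fail a test run.
def load_catalog():
    """Load a catalog from disk.

    >>> import nested_pandas as npd
    >>> nf = npd.read_parquet("path/to/file.parquet")  # doctest: +SKIP
    """
```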
src/nested_pandas/series/accessor.py (2 changes: 1 addition & 1 deletion)
@@ -258,7 +258,7 @@ def with_list_field(self, field: str, value: ArrayLike) -> pd.Series:
... [["g","g"],
... ["r","r"]])
>>> # Look at one row of the series
>>> nested_with_avg[0]
>>> nf_new_band[0]
t flux band new_band
0 2.935118 39.676747 g g
1 3.725204 41.919451 r g
src/nested_pandas/series/dtype.py (3 changes: 3 additions & 0 deletions)
@@ -56,6 +56,9 @@ def name(self) -> str:
fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()])
return f"nested<{fields}>"

def __repr__(self) -> str:
return self.name

@classmethod
def construct_array_type(cls) -> Type[ExtensionArray]:
"""Corresponded array type, always NestedExtensionArray"""
src/nested_pandas/utils/utils.py (6 changes: 5 additions & 1 deletion)
@@ -27,8 +27,12 @@ def count_nested(df, nested, by=None, join=True) -> NestedFrame:
Examples
--------

>>> import pandas as pd
>>> # Show all columns
>>> pd.set_option("display.width", 200)
>>> pd.set_option("display.max_columns", None)
>>> from nested_pandas.datasets.generation import generate_data
>>> nf = generate_data(5,10,seed=1)
>>> nf = generate_data(5, 10, seed=1)

>>> from nested_pandas.utils import count_nested
>>> count_nested(nf, "nested")
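
The two `pd.set_option` calls added above only widen pandas' console output so the doctest table is not truncated; they do not change `count_nested` itself. A small usage sketch; the `by="band"` variant and the names of the generated count columns are assumptions based on the signature shown, not outputs from this PR:

```python
# Usage sketch for count_nested (count-column names and the by= variant are assumed).
from nested_pandas.datasets.generation import generate_data
from nested_pandas.utils import count_nested

nf = generate_data(5, 10, seed=1)
counts = count_nested(nf, "nested")               # per-row count of nested elements, joined to nf
per_band = count_nested(nf, "nested", by="band")  # counts broken out by the nested "band" field (assumed usage)
print(counts.columns)
```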