Merged
Commits (40)
bb5527d
WIP: reorg validation
msschwartz21 Jul 22, 2025
6074605
Specify mandatory int node id dtype in specification docs
msschwartz21 Jul 22, 2025
8bd1096
Validate that node ids are integers
msschwartz21 Jul 22, 2025
2df5987
Update test to reflect node dtype change
msschwartz21 Jul 22, 2025
7b3d01a
Move basic structural validation into a function that can be run on n…
msschwartz21 Jul 22, 2025
0e61b87
Add function for validating zarr structure with regards to axis metadata
msschwartz21 Jul 22, 2025
25ec2fa
Merge remote-tracking branch 'origin/main' into validator-framework
msschwartz21 Jul 23, 2025
b025edd
Mock up data validation function
msschwartz21 Jul 23, 2025
7e7914b
Update mock up for validation data/optional data api
msschwartz21 Jul 23, 2025
ce4c01a
Merge remote-tracking branch 'origin/main' into validator-framework
msschwartz21 Jul 23, 2025
7cd0ffe
Add edge/lineage/tracklet validation options
msschwartz21 Jul 23, 2025
885917e
Add in progress tests for new validate functions
msschwartz21 Jul 23, 2025
39d1427
WIP: Add shapes data validation
bentaculum Jul 23, 2025
7b771d3
Merge remote-tracking branch 'origin/main' into validator-framework
msschwartz21 Aug 13, 2025
b4dc731
Introduce data validate module
msschwartz21 Aug 13, 2025
77ce558
Fix typing problems post merge
msschwartz21 Aug 13, 2025
5adc83f
Fix tests
msschwartz21 Aug 13, 2025
be92065
Refactor shapes validation functions into stand alones
msschwartz21 Aug 13, 2025
7ddcc30
Merge remote-tracking branch 'origin/main' into validator-framework
msschwartz21 Aug 13, 2025
9b5a9e6
Test validate ellipsoid and update to handle arbitrary spatial dimens…
msschwartz21 Aug 14, 2025
04c5f85
Test sphere validation
msschwartz21 Aug 14, 2025
a7e1897
Make validate_axes_structure private
msschwartz21 Aug 23, 2025
fff4fdc
refactor: :construction: Refactor validation tests
cmalinmayor Aug 23, 2025
08e3937
Reorganize validate structure tests and remove some code that will be…
msschwartz21 Aug 23, 2025
b4e1b75
Use _path in validation tests
msschwartz21 Aug 23, 2025
5d362d6
Update validate node/edge/props subfunction tests to call the lower l…
msschwartz21 Aug 23, 2025
66dbfac
Refactor tests to be test classes instead of functions
msschwartz21 Aug 23, 2025
99f6538
Create separate testing file for validate.data
msschwartz21 Aug 23, 2025
aefc420
Test validate optional data
msschwartz21 Aug 23, 2025
3677b99
Expose new validation options in read functions
msschwartz21 Aug 23, 2025
a4d68ca
Add tests to improve coverage
msschwartz21 Aug 23, 2025
8bc9656
Validate unique node ids
msschwartz21 Aug 24, 2025
5ccfdce
Remove cast to tuple
msschwartz21 Aug 24, 2025
40584c5
Fix sad docstring
msschwartz21 Aug 24, 2025
5c22dc9
Consolidate all data validation into one function (instead of graph/o…
msschwartz21 Aug 24, 2025
f4bd40a
Fix benchmarking
msschwartz21 Aug 24, 2025
b6713c0
Node ids must be positive integers
msschwartz21 Aug 24, 2025
2eab0a5
Node ids must be unsigned int
msschwartz21 Aug 24, 2025
76cb9ee
Revert to just integer node ids for now, will enforce uint in another PR
msschwartz21 Aug 24, 2025
b8f2887
Merge branch 'main' into validator-framework
msschwartz21 Aug 24, 2025
2 changes: 1 addition & 1 deletion docs/specification.md
@@ -42,7 +42,7 @@ The optional `extra` object is a free-form dictionary that can hold any additional
## The `nodes` group
The nodes group will contain an `ids` array and optionally a `props` group.
### The `ids` array
The `nodes\ids` array is a 1D array of node IDs of length `N` >= 0, where `N` is the number of nodes in the graph. Node ids must be unique. Node IDs can have any type supported by zarr (except floats), but we recommend integer dtypes. For large graphs, `uint64` might be necessary to provide enough range for every node to have a unique ID. In the minimal case of an empty graph, the `ids` array will be present but empty.
The `nodes\ids` array is a 1D array of node IDs of length `N` >= 0, where `N` is the number of nodes in the graph. Node ids must be unique. Node IDs must have an integer dtype. For large graphs, `uint64` might be necessary to provide enough range for every node to have a unique ID. In the minimal case of an empty graph, the `ids` array will be present but empty.
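A minimal sketch (not part of the spec text) of what the dtype requirement means in practice, using plain NumPy; the array values are illustrative only:

```python
import numpy as np

# Valid: unique node IDs with an integer dtype; uint64 offers the widest range.
node_ids = np.array([0, 1, 2, 7], dtype=np.uint64)
assert node_ids.ndim == 1
assert np.issubdtype(node_ids.dtype, np.integer)   # integer dtype is now mandatory
assert len(np.unique(node_ids)) == node_ids.size   # IDs must be unique

# No longer allowed: float (or other non-integer) dtypes.
float_ids = np.array([0.0, 1.0, 2.0])
assert not np.issubdtype(float_ids.dtype, np.integer)
```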


### The `props` group and `node property` groups
17 changes: 16 additions & 1 deletion src/geff/_graph_libs/_api_wrapper.py
@@ -16,6 +16,7 @@

from geff._typing import PropDictNpArray
from geff.metadata._schema import GeffMetadata
from geff.validate.data import ValidationConfig

SupportedBackend = Literal["networkx", "rustworkx", "spatial-graph"]

@@ -111,6 +112,8 @@ def read(
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
backend: Literal["networkx"] = "networkx",
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[nx.Graph | nx.DiGraph, GeffMetadata]: ...


@@ -121,6 +124,8 @@ def read(
node_props: list[str] | None,
edge_props: list[str] | None,
backend: Literal["rustworkx"],
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[rx.PyGraph | rx.PyDiGraph, GeffMetadata]: ...


@@ -131,6 +136,8 @@ def read(
node_props: list[str] | None,
edge_props: list[str] | None,
backend: Literal["spatial-graph"],
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
*,
position_attr: str = "position",
) -> tuple[sg.SpatialGraph | sg.SpatialDiGraph, GeffMetadata]: ...
@@ -142,6 +149,8 @@
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
backend: SupportedBackend = "networkx",
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
**backend_kwargs: Any,
) -> tuple[Any, GeffMetadata]:
"""
@@ -159,14 +168,20 @@
if None all properties will be loaded, defaults to None.
backend ({"networkx", "rustworkx", "spatial-graph"}): Flag for the chosen backend, default
is "networkx".
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.
backend_kwargs (Any): Additional kwargs that may be accepted by
the backend when reading the data.

Returns:
tuple[Any, GeffMetadata]: Graph object of the chosen backend, and the GEFF metadata.
"""
construct_func = get_construct_func(backend)
in_memory_geff = read_to_memory(store, validate, node_props, edge_props)
in_memory_geff = read_to_memory(
store, validate, node_props, edge_props, validate_data, validate_opt_data
)
return (
construct_func(**in_memory_geff, **backend_kwargs),
in_memory_geff["metadata"],
11 changes: 10 additions & 1 deletion src/geff/_graph_libs/_networkx.py
@@ -18,6 +18,7 @@
from zarr.storage import StoreLike

from geff._typing import PropDictNpArray
from geff.validate.data import ValidationConfig

import logging

@@ -193,6 +194,8 @@ def read_nx(
validate: bool = True,
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[nx.Graph, GeffMetadata]:
"""Read a geff file into a networkx graph. Metadata properties will be stored in
the graph properties, accessed via `G.graph[key]` where G is a networkx graph.
@@ -207,11 +210,17 @@
if None all properties will be loaded, defaults to None.
edge_props (list of str, optional): The names of the edge properties to load,
if None all properties will be loaded, defaults to None.
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.

Returns:
A networkx graph containing the graph that was stored in the geff file format
"""
in_memory_geff = read_to_memory(store, validate, node_props, edge_props)
in_memory_geff = read_to_memory(
store, validate, node_props, edge_props, validate_data, validate_opt_data
)
graph = construct_nx(**in_memory_geff)

return graph, in_memory_geff["metadata"]
11 changes: 10 additions & 1 deletion src/geff/_graph_libs/_rustworkx.py
@@ -28,6 +28,7 @@
from zarr.storage import StoreLike

from geff._typing import PropDictNpArray
from geff.validate.data import ValidationConfig


def get_roi_rx(
@@ -253,6 +254,8 @@ def read_rx(
validate: bool = True,
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[rx.PyGraph | rx.PyDiGraph, GeffMetadata]:
"""Read a geff file into a rustworkx graph.
Metadata properties will be stored in the graph.attrs dict
@@ -269,11 +272,17 @@
if None all properties will be loaded, defaults to None.
edge_props: The names of the edge properties to load,
if None all properties will be loaded, defaults to None.
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.

Returns:
A tuple containing the rustworkx graph and the metadata.
"""
graph_dict = read_to_memory(store, validate, node_props, edge_props)
graph_dict = read_to_memory(
store, validate, node_props, edge_props, validate_data, validate_opt_data
)
graph = construct_rx(**graph_dict)

return graph, graph_dict["metadata"]
21 changes: 10 additions & 11 deletions src/geff/_graph_libs/_spatial_graph.py
@@ -21,6 +21,7 @@
from zarr.storage import StoreLike

from geff._typing import PropDictNpArray
from geff.validate.data import ValidationConfig

import geff
from geff.core_io import write_arrays
@@ -120,6 +121,8 @@ def read_sg(
position_attr: str = "position",
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[sg.SpatialGraph | sg.SpatialDiGraph, GeffMetadata]:
"""Read a geff file into a SpatialGraph.

@@ -129,37 +132,33 @@
Args:

store (Path | str | zarr store):

The path to the root of the geff zarr, where the .attrs contains
the geff metadata.

validate (bool, optional):

Flag indicating whether to perform validation on the geff file
before loading into memory. If set to False and there are format
issues, will likely fail with a cryptic error. Defaults to True.

position_attr (str, optional):

How to call the position attribute in the returned SpatialGraph.
Defaults to "position".

node_props (list of str, optional):

The names of the node properties to load, if None all properties
will be loaded, defaults to None.

edge_props (list of str, optional):

The names of the edge properties to load, if None all properties
will be loaded, defaults to None.
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.

Returns:

A tuple containing the spatial_graph graph and the metadata.
"""

in_memory_geff = read_to_memory(store, validate, node_props, edge_props)
in_memory_geff = read_to_memory(
store, validate, node_props, edge_props, validate_data, validate_opt_data
)
graph = construct_sg(**in_memory_geff, position_attr=position_attr)

return graph, in_memory_geff["metadata"]
19 changes: 16 additions & 3 deletions src/geff/core_io/_base_read.py
@@ -8,6 +8,7 @@
from geff import _path
from geff.core_io import _utils
from geff.metadata._schema import GeffMetadata
from geff.validate.data import ValidationConfig, validate_optional_data, validate_zarr_data
from geff.validate.structure import validate_structure

if TYPE_CHECKING:
@@ -217,6 +218,8 @@ def read_to_memory(
validate: bool = True,
node_props: Iterable[str] | None = None,
edge_props: Iterable[str] | None = None,
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> InMemoryGeff:
"""
Read a GEFF zarr file to into memory as a series of numpy arrays in a dictionary.
@@ -227,9 +230,12 @@
Args:
source (str | Path | zarr store): Either a path to the root of the geff zarr
(where the .attrs contains the geff metadata), or a zarr store object
validate (bool, optional): Flag indicating whether to perform validation on the
geff file before loading into memory. If set to False and there are
format issues, will likely fail with a cryptic error. Defaults to True.
validate (bool, optional): Flag indicating whether to perform metadata/structure
validation on the geff file before loading into memory. If set to False and
there are format issues, will likely fail with a cryptic error. Defaults to True.
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.
node_props (iterable of str, optional): The names of the node properties to load,
if None all properties will be loaded, defaults to None.
edge_props (iterable of str, optional): The names of the edge properties to load,
@@ -246,4 +252,11 @@
file_reader.read_edge_props(edge_props)

in_memory_geff = file_reader.build()

if validate_data:
validate_zarr_data(in_memory_geff)

if validate_opt_data is not None:
validate_optional_data(config=validate_opt_data, memory_geff=in_memory_geff)

return in_memory_geff
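A sketch of how the new flags flow through `read_to_memory`; the import path and store path are assumptions:

```python
from geff.core_io import read_to_memory              # assumed import path
from geff.validate.data import ValidationConfig

in_memory_geff = read_to_memory(
    "example.geff",                                   # placeholder store path
    validate=True,                                    # structure/metadata validation (as before)
    validate_data=True,                               # raises ValueError on invalid edges
    validate_opt_data=ValidationConfig(lineage=True, tracklet=True),
)
node_ids = in_memory_geff["node_ids"]
edge_ids = in_memory_geff["edge_ids"]
```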
1 change: 1 addition & 0 deletions src/geff/testing/data.py
@@ -70,6 +70,7 @@
if TYPE_CHECKING:
from numpy.typing import NDArray


DTypeStr = Literal["double", "int", "int8", "uint8", "int16", "uint16", "float32", "float64", "str"]
NodeIdDTypeStr = Literal["int", "int8", "uint8", "int16", "uint16"]
Axes = Literal["t", "z", "y", "x"]
86 changes: 86 additions & 0 deletions src/geff/validate/data.py
@@ -0,0 +1,86 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from pydantic import BaseModel

from geff.validate.graph import (
validate_no_repeated_edges,
validate_no_self_edges,
validate_nodes_for_edges,
)
from geff.validate.shapes import validate_ellipsoid, validate_sphere
from geff.validate.tracks import (
validate_lineages,
validate_tracklets,
)

if TYPE_CHECKING:
from geff._typing import InMemoryGeff


def validate_zarr_data(memory_geff: InMemoryGeff) -> None:
"""Checks whether the graph meets spec requirements
Args:
memory_geff (InMemoryGeff): An InMemoryGeff object which contains metadata and
dictionaries of node/edge property arrays
"""
node_ids = memory_geff["node_ids"]
edge_ids = memory_geff["edge_ids"]

valid, invalid_edges = validate_nodes_for_edges(node_ids, edge_ids)
if not valid:
raise ValueError(f"Some edges are missing nodes:\n{invalid_edges}")

valid, invalid_edges = validate_no_self_edges(edge_ids)
if not valid:
raise ValueError(f"Self edges found in data:\n{invalid_edges}")

valid, invalid_edges = validate_no_repeated_edges(edge_ids)
if not valid:
raise ValueError(f"Repeated edges found in data:\n{invalid_edges}")


class ValidationConfig(BaseModel):
Contributor: Thoughts on adding nodes/edges to this config and then only passing one argument to the read/write functions? I assume you considered it and rejected, and I don't have a strong opinion, but bringing it up just in case

Contributor Author: Ben and I had created a separation in our brains between the graph (as mandatory data) and optional attribute data. We hadn't considered putting the graph into the validation config, but I like it. I'll make that change.

Contributor: As discussed elsewhere, could consider rolling the structure into the same argument but up to you

Contributor Author: I decided to leave structure out, but consolidated all the data once into the validation config

sphere: bool = False
ellipsoid: bool = False
lineage: bool = False
tracklet: bool = False


def validate_optional_data(config: ValidationConfig, memory_geff: InMemoryGeff) -> None:
"""Run data validation on optional data types based on the input
Args:
config (ValidationConfig): Configuration for which validation to run
memory_geff (InMemoryGeff): A graphdict object which contains metadata and
dictionaries of node/edge property arrays
"""
meta = memory_geff["metadata"]
if config.sphere and meta.sphere is not None:
radius = memory_geff["node_props"][meta.sphere]["values"]
validate_sphere(radius)

if config.ellipsoid and meta.ellipsoid is not None:
covariance = memory_geff["node_props"][meta.ellipsoid]["values"]
validate_ellipsoid(covariance, memory_geff["metadata"].axes)

if meta.track_node_props is not None:
if config.tracklet and "tracklet" in meta.track_node_props:
node_ids = memory_geff["node_ids"]
edge_ids = memory_geff["edge_ids"]
tracklet_key = meta.track_node_props["tracklet"]
tracklet_ids = memory_geff["node_props"][tracklet_key]["values"]
valid, errors = validate_tracklets(node_ids, edge_ids, tracklet_ids)
if not valid:
raise ValueError("Found invalid tracklets:\n", "\n".join(errors))

if config.lineage and "lineage" in meta.track_node_props:
node_ids = memory_geff["node_ids"]
edge_ids = memory_geff["edge_ids"]
lineage_key = meta.track_node_props["lineage"]
lineage_ids = memory_geff["node_props"][lineage_key]["values"]
valid, errors = validate_lineages(node_ids, edge_ids, lineage_ids)
if not valid:
raise ValueError("Found invalid lineages:\n", "\n".join(errors))
12 changes: 5 additions & 7 deletions src/geff/validate/graph.py
@@ -8,9 +8,7 @@
from numpy.typing import ArrayLike


def validate_nodes_for_edges(
node_ids: ArrayLike, edge_ids: ArrayLike
) -> tuple[bool, list[tuple[int, int]]]:
def validate_nodes_for_edges(node_ids: ArrayLike, edge_ids: ArrayLike) -> tuple[bool, np.ndarray]:
"""
Validates that all edges in `edge_ids` reference node IDs present in `node_ids`.

Expand All @@ -24,9 +22,9 @@ def validate_nodes_for_edges(
(source, target).

Returns:
tuple[bool, list[tuple[int, int]]]:
tuple[bool, np.ndarray]:
- all_edges_valid (bool): True if all edges reference valid node IDs.
- invalid_edges (list of tuple[int, int]): List of (source, target) pairs for
- invalid_edges (np.ndarray): Array of (source, target) pairs for
invalid edges.
"""

Expand All @@ -39,8 +37,8 @@ def validate_nodes_for_edges(
mask = valid_src & valid_tgt

# Find invalid edges
invalid_edges = [tuple(edge) for edge in edge_ids[~mask]]
all_edges_valid = not invalid_edges
invalid_edges = np.asarray([tuple(edge) for edge in edge_ids[~mask]])
all_edges_valid = len(invalid_edges) == 0
return all_edges_valid, invalid_edges
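A small sketch of the updated return contract (invalid edges now come back as an `np.ndarray` rather than a list of tuples); the arrays below are illustrative only:

```python
import numpy as np

from geff.validate.graph import validate_nodes_for_edges

node_ids = np.array([1, 2, 3], dtype=np.uint64)
edge_ids = np.array([[1, 2], [2, 9]], dtype=np.uint64)   # node 9 does not exist

all_valid, invalid_edges = validate_nodes_for_edges(node_ids, edge_ids)
# all_valid -> False; invalid_edges -> array([[2, 9]])
```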

