Merged
Commits (40)
bb5527d
WIP: reorg validation
msschwartz21 Jul 22, 2025
6074605
Specify mandatory int node id dtype in specification docs
msschwartz21 Jul 22, 2025
8bd1096
Validate that node ids are integers
msschwartz21 Jul 22, 2025
2df5987
Update test to reflect node dtype change
msschwartz21 Jul 22, 2025
7b3d01a
Move basic structural validation into a function that can be run on n…
msschwartz21 Jul 22, 2025
0e61b87
Add function for validating zarr structure with regards to axis metadata
msschwartz21 Jul 22, 2025
25ec2fa
Merge remote-tracking branch 'origin/main' into validator-framework
msschwartz21 Jul 23, 2025
b025edd
Mock up data validation function
msschwartz21 Jul 23, 2025
7e7914b
Update mock up for validation data/optional data api
msschwartz21 Jul 23, 2025
ce4c01a
Merge remote-tracking branch 'origin/main' into validator-framework
msschwartz21 Jul 23, 2025
7cd0ffe
Add edge/lineage/tracklet validation options
msschwartz21 Jul 23, 2025
885917e
Add in progress tests for new validate functions
msschwartz21 Jul 23, 2025
39d1427
WIP: Add shapes data validation
bentaculum Jul 23, 2025
7b771d3
Merge remote-tracking branch 'origin/main' into validator-framework
msschwartz21 Aug 13, 2025
b4dc731
Introduce data validate module
msschwartz21 Aug 13, 2025
77ce558
Fix typing problems post merge
msschwartz21 Aug 13, 2025
5adc83f
Fix tests
msschwartz21 Aug 13, 2025
be92065
Refactor shapes validation functions into stand alones
msschwartz21 Aug 13, 2025
7ddcc30
Merge remote-tracking branch 'origin/main' into validator-framework
msschwartz21 Aug 13, 2025
9b5a9e6
Test validate ellipsoid and update to handle arbitrary spatial dimens…
msschwartz21 Aug 14, 2025
04c5f85
Test sphere validation
msschwartz21 Aug 14, 2025
a7e1897
Make validate_axes_structure private
msschwartz21 Aug 23, 2025
fff4fdc
refactor: :construction: Refactor validation tests
cmalinmayor Aug 23, 2025
08e3937
Reorganize validate structure tests and remove some code that will be…
msschwartz21 Aug 23, 2025
b4e1b75
Use _path in validation tests
msschwartz21 Aug 23, 2025
5d362d6
Update validate node/edge/props subfunction tests to call the lower l…
msschwartz21 Aug 23, 2025
66dbfac
Refactor tests to be test classes instead of functions
msschwartz21 Aug 23, 2025
99f6538
Create separate testing file for validate.data
msschwartz21 Aug 23, 2025
aefc420
Test validate optional data
msschwartz21 Aug 23, 2025
3677b99
Expose new validation options in read functions
msschwartz21 Aug 23, 2025
a4d68ca
Add tests to improve coverage
msschwartz21 Aug 23, 2025
8bc9656
Validate unique node ids
msschwartz21 Aug 24, 2025
5ccfdce
Remove cast to tuple
msschwartz21 Aug 24, 2025
40584c5
Fix sad docstring
msschwartz21 Aug 24, 2025
5c22dc9
Consolidate all data validation into one function (instead of graph/o…
msschwartz21 Aug 24, 2025
f4bd40a
Fix benchmarking
msschwartz21 Aug 24, 2025
b6713c0
Node ids must be positive integers
msschwartz21 Aug 24, 2025
2eab0a5
Node ids must be unsigned int
msschwartz21 Aug 24, 2025
76cb9ee
Revert to just integer node ids for now, will enforce uint in another PR
msschwartz21 Aug 24, 2025
b8f2887
Merge branch 'main' into validator-framework
msschwartz21 Aug 24, 2025
2 changes: 1 addition & 1 deletion docs/specification.md
@@ -42,7 +42,7 @@ The optional `extra` object is a free-form dictionary that can hold any additional
## The `nodes` group
The nodes group will contain an `ids` array and optionally a `props` group.
### The `ids` array
The `nodes\ids` array is a 1D array of node IDs of length `N` >= 0, where `N` is the number of nodes in the graph. Node ids must be unique. Node IDs can have any type supported by zarr (except floats), but we recommend integer dtypes. For large graphs, `uint64` might be necessary to provide enough range for every node to have a unique ID. In the minimal case of an empty graph, the `ids` array will be present but empty.
The `nodes\ids` array is a 1D array of node IDs of length `N` >= 0, where `N` is the number of nodes in the graph. Node ids must be unique. Node IDs must have an integer dtype. For large graphs, `uint64` might be necessary to provide enough range for every node to have a unique ID. In the minimal case of an empty graph, the `ids` array will be present but empty.
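A minimal sketch (not part of the spec text) of what the dtype requirement means in practice, using plain NumPy; the array values are illustrative only:

```python
import numpy as np

# Valid: unique node IDs with an integer dtype; uint64 offers the widest range.
node_ids = np.array([0, 1, 2, 7], dtype=np.uint64)
assert node_ids.ndim == 1
assert np.issubdtype(node_ids.dtype, np.integer)   # integer dtype is now mandatory
assert len(np.unique(node_ids)) == node_ids.size   # IDs must be unique

# No longer allowed: float (or other non-integer) dtypes.
float_ids = np.array([0.0, 1.0, 2.0])
assert not np.issubdtype(float_ids.dtype, np.integer)
```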


### The `props` group and `node property` groups
17 changes: 16 additions & 1 deletion src/geff/_graph_libs/_api_wrapper.py
@@ -16,6 +16,7 @@

from geff._typing import PropDictNpArray
from geff.metadata._schema import GeffMetadata
from geff.validate.data import ValidationConfig

SupportedBackend = Literal["networkx", "rustworkx", "spatial-graph"]

@@ -111,6 +112,8 @@ def read(
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
backend: Literal["networkx"] = "networkx",
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[nx.Graph | nx.DiGraph, GeffMetadata]: ...


@@ -121,6 +124,8 @@ def read(
node_props: list[str] | None,
edge_props: list[str] | None,
backend: Literal["rustworkx"],
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[rx.PyGraph | rx.PyDiGraph, GeffMetadata]: ...


@@ -131,6 +136,8 @@ def read(
node_props: list[str] | None,
edge_props: list[str] | None,
backend: Literal["spatial-graph"],
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
*,
position_attr: str = "position",
) -> tuple[sg.SpatialGraph | sg.SpatialDiGraph, GeffMetadata]: ...
@@ -142,6 +149,8 @@
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
backend: SupportedBackend = "networkx",
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
**backend_kwargs: Any,
) -> tuple[Any, GeffMetadata]:
"""
@@ -159,14 +168,20 @@
if None all properties will be loaded, defaults to None.
backend ({"networkx", "rustworkx", "spatial-graph"}): Flag for the chosen backend, default
is "networkx".
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.
backend_kwargs (Any): Additional kwargs that may be accepted by
the backend when reading the data.

Returns:
tuple[Any, GeffMetadata]: Graph object of the chosen backend, and the GEFF metadata.
"""
construct_func = get_construct_func(backend)
in_memory_geff = read_to_memory(store, validate, node_props, edge_props)
in_memory_geff = read_to_memory(
store, validate, node_props, edge_props, validate_data, validate_opt_data
)
return (
construct_func(**in_memory_geff, **backend_kwargs),
in_memory_geff["metadata"],
11 changes: 10 additions & 1 deletion src/geff/_graph_libs/_networkx.py
@@ -18,6 +18,7 @@
from zarr.storage import StoreLike

from geff._typing import PropDictNpArray
from geff.validate.data import ValidationConfig

import logging

@@ -193,6 +194,8 @@ def read_nx(
validate: bool = True,
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[nx.Graph, GeffMetadata]:
"""Read a geff file into a networkx graph. Metadata properties will be stored in
the graph properties, accessed via `G.graph[key]` where G is a networkx graph.
@@ -207,11 +210,17 @@
if None all properties will be loaded, defaults to None.
edge_props (list of str, optional): The names of the edge properties to load,
if None all properties will be loaded, defaults to None.
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.

Returns:
A networkx graph containing the graph that was stored in the geff file format
"""
in_memory_geff = read_to_memory(store, validate, node_props, edge_props)
in_memory_geff = read_to_memory(
store, validate, node_props, edge_props, validate_data, validate_opt_data
)
graph = construct_nx(**in_memory_geff)

return graph, in_memory_geff["metadata"]
11 changes: 10 additions & 1 deletion src/geff/_graph_libs/_rustworkx.py
@@ -28,6 +28,7 @@
from zarr.storage import StoreLike

from geff._typing import PropDictNpArray
from geff.validate.data import ValidationConfig


def get_roi_rx(
@@ -253,6 +254,8 @@ def read_rx(
validate: bool = True,
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[rx.PyGraph | rx.PyDiGraph, GeffMetadata]:
"""Read a geff file into a rustworkx graph.
Metadata properties will be stored in the graph.attrs dict
@@ -269,11 +272,17 @@
if None all properties will be loaded, defaults to None.
edge_props: The names of the edge properties to load,
if None all properties will be loaded, defaults to None.
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.

Returns:
A tuple containing the rustworkx graph and the metadata.
"""
graph_dict = read_to_memory(store, validate, node_props, edge_props)
graph_dict = read_to_memory(
store, validate, node_props, edge_props, validate_data, validate_opt_data
)
graph = construct_rx(**graph_dict)

return graph, graph_dict["metadata"]
21 changes: 10 additions & 11 deletions src/geff/_graph_libs/_spatial_graph.py
@@ -21,6 +21,7 @@
from zarr.storage import StoreLike

from geff._typing import PropDictNpArray
from geff.validate.data import ValidationConfig

import geff
from geff.core_io import write_arrays
@@ -120,6 +121,8 @@ def read_sg(
position_attr: str = "position",
node_props: list[str] | None = None,
edge_props: list[str] | None = None,
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> tuple[sg.SpatialGraph | sg.SpatialDiGraph, GeffMetadata]:
"""Read a geff file into a SpatialGraph.

@@ -129,37 +132,33 @@
Args:

store (Path | str | zarr store):

The path to the root of the geff zarr, where the .attrs contains
the geff metadata.

validate (bool, optional):

Flag indicating whether to perform validation on the geff file
before loading into memory. If set to False and there are format
issues, will likely fail with a cryptic error. Defaults to True.

position_attr (str, optional):

How to call the position attribute in the returned SpatialGraph.
Defaults to "position".

node_props (list of str, optional):

The names of the node properties to load, if None all properties
will be loaded, defaults to None.

edge_props (list of str, optional):

The names of the edge properties to load, if None all properties
will be loaded, defaults to None.
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.

Returns:

A tuple containing the spatial_graph graph and the metadata.
"""

in_memory_geff = read_to_memory(store, validate, node_props, edge_props)
in_memory_geff = read_to_memory(
store, validate, node_props, edge_props, validate_data, validate_opt_data
)
graph = construct_sg(**in_memory_geff, position_attr=position_attr)

return graph, in_memory_geff["metadata"]
19 changes: 16 additions & 3 deletions src/geff/core_io/_base_read.py
@@ -8,6 +8,7 @@
from geff import _path
from geff.core_io import _utils
from geff.metadata._schema import GeffMetadata
from geff.validate.data import ValidationConfig, validate_optional_data, validate_zarr_data
from geff.validate.structure import validate_structure

if TYPE_CHECKING:
@@ -217,6 +218,8 @@ def read_to_memory(
validate: bool = True,
node_props: Iterable[str] | None = None,
edge_props: Iterable[str] | None = None,
validate_data: bool = False,
validate_opt_data: ValidationConfig | None = None,
) -> InMemoryGeff:
"""
Read a GEFF zarr file to into memory as a series of numpy arrays in a dictionary.
@@ -227,9 +230,12 @@
Args:
source (str | Path | zarr store): Either a path to the root of the geff zarr
(where the .attrs contains the geff metadata), or a zarr store object
validate (bool, optional): Flag indicating whether to perform validation on the
geff file before loading into memory. If set to False and there are
format issues, will likely fail with a cryptic error. Defaults to True.
validate (bool, optional): Flag indicating whether to perform metadata/structure
validation on the geff file before loading into memory. If set to False and
there are format issues, will likely fail with a cryptic error. Defaults to True.
validate_data (bool, optional): Flag indicating whether to perform validation on the
underlying data of the geff, e.g. edges. Defaults to False.
validate_opt_data (ValidationConfig, optional): Optional configuration for which
optional types of data to validate.
node_props (iterable of str, optional): The names of the node properties to load,
if None all properties will be loaded, defaults to None.
edge_props (iterable of str, optional): The names of the edge properties to load,
@@ -246,4 +252,11 @@
file_reader.read_edge_props(edge_props)

in_memory_geff = file_reader.build()

if validate_data:
validate_zarr_data(in_memory_geff)

if validate_opt_data is not None:
validate_optional_data(config=validate_opt_data, memory_geff=in_memory_geff)

return in_memory_geff
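A sketch of how the new flags flow through `read_to_memory`; the import path and store path are assumptions:

```python
from geff.core_io import read_to_memory              # assumed import path
from geff.validate.data import ValidationConfig

in_memory_geff = read_to_memory(
    "example.geff",                                   # placeholder store path
    validate=True,                                    # structure/metadata validation (as before)
    validate_data=True,                               # raises ValueError on invalid edges
    validate_opt_data=ValidationConfig(lineage=True, tracklet=True),
)
node_ids = in_memory_geff["node_ids"]
edge_ids = in_memory_geff["edge_ids"]
```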
1 change: 1 addition & 0 deletions src/geff/testing/data.py
@@ -70,6 +70,7 @@
if TYPE_CHECKING:
from numpy.typing import NDArray


DTypeStr = Literal["double", "int", "int8", "uint8", "int16", "uint16", "float32", "float64", "str"]
NodeIdDTypeStr = Literal["int", "int8", "uint8", "int16", "uint16"]
Axes = Literal["t", "z", "y", "x"]
86 changes: 86 additions & 0 deletions src/geff/validate/data.py
@@ -0,0 +1,86 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from pydantic import BaseModel

from geff.validate.graph import (
validate_no_repeated_edges,
validate_no_self_edges,
validate_nodes_for_edges,
)
from geff.validate.shapes import validate_ellipsoid, validate_sphere
from geff.validate.tracks import (
validate_lineages,
validate_tracklets,
)

if TYPE_CHECKING:
from geff._typing import InMemoryGeff


def validate_zarr_data(memory_geff: InMemoryGeff) -> None:
"""Checks whether the graph meets spec requirements
Args:
memory_geff (InMemoryGeff): An InMemoryGeff object which contains metadata and
dictionaries of node/edge property arrays
"""
node_ids = memory_geff["node_ids"]
edge_ids = memory_geff["edge_ids"]

valid, invalid_edges = validate_nodes_for_edges(node_ids, edge_ids)
if not valid:
raise ValueError(f"Some edges are missing nodes:\n{invalid_edges}")

valid, invalid_edges = validate_no_self_edges(edge_ids)
if not valid:
raise ValueError(f"Self edges found in data:\n{invalid_edges}")

valid, invalid_edges = validate_no_repeated_edges(edge_ids)
if not valid:
raise ValueError(f"Repeated edges found in data:\n{invalid_edges}")


class ValidationConfig(BaseModel):
Contributor: Thoughts on adding nodes/edges to this config and then only passing one argument to the read/write functions? I assume you considered it and rejected, and I don't have a strong opinion, but bringing it up just in case

Contributor Author: Ben and I had created a separation in our brains between the graph (as mandatory data) and optional attribute data. We hadn't considered putting the graph into the validation config, but I like it. I'll make that change.

Contributor: As discussed elsewhere, could consider rolling the structure into the same argument but up to you

Contributor Author: I decided to leave structure out, but consolidated all the data once into the validation config

sphere: bool = False
ellipsoid: bool = False
lineage: bool = False
tracklet: bool = False


def validate_optional_data(config: ValidationConfig, memory_geff: InMemoryGeff) -> None:
"""Run data validation on optional data types based on the input
Args:
config (ValidationConfig): Configuration for which validation to run
memory_geff (InMemoryGeff): A graphdict object which contains metadata and
dictionaries of node/edge property arrays
"""
meta = memory_geff["metadata"]
if config.sphere and meta.sphere is not None:
radius = memory_geff["node_props"][meta.sphere]["values"]
validate_sphere(radius)

if config.ellipsoid and meta.ellipsoid is not None:
covariance = memory_geff["node_props"][meta.ellipsoid]["values"]
validate_ellipsoid(covariance, memory_geff["metadata"].axes)

if meta.track_node_props is not None:
if config.tracklet and "tracklet" in meta.track_node_props:
node_ids = memory_geff["node_ids"]
edge_ids = memory_geff["edge_ids"]
tracklet_key = meta.track_node_props["tracklet"]
tracklet_ids = memory_geff["node_props"][tracklet_key]["values"]
valid, errors = validate_tracklets(node_ids, edge_ids, tracklet_ids)
if not valid:
raise ValueError("Found invalid tracklets:\n", "\n".join(errors))

if config.lineage and "lineage" in meta.track_node_props:
node_ids = memory_geff["node_ids"]
edge_ids = memory_geff["edge_ids"]
lineage_key = meta.track_node_props["lineage"]
lineage_ids = memory_geff["node_props"][lineage_key]["values"]
valid, errors = validate_lineages(node_ids, edge_ids, lineage_ids)
if not valid:
raise ValueError("Found invalid lineages:\n", "\n".join(errors))
12 changes: 5 additions & 7 deletions src/geff/validate/graph.py
@@ -8,9 +8,7 @@
from numpy.typing import ArrayLike


def validate_nodes_for_edges(
node_ids: ArrayLike, edge_ids: ArrayLike
) -> tuple[bool, list[tuple[int, int]]]:
def validate_nodes_for_edges(node_ids: ArrayLike, edge_ids: ArrayLike) -> tuple[bool, np.ndarray]:
"""
Validates that all edges in `edge_ids` reference node IDs present in `node_ids`.

Expand All @@ -24,9 +22,9 @@ def validate_nodes_for_edges(
(source, target).

Returns:
tuple[bool, list[tuple[int, int]]]:
tuple[bool, np.ndarray]:
- all_edges_valid (bool): True if all edges reference valid node IDs.
- invalid_edges (list of tuple[int, int]): List of (source, target) pairs for
- invalid_edges (np.ndarray): Array of (source, target) pairs for
invalid edges.
"""

Expand All @@ -39,8 +37,8 @@ def validate_nodes_for_edges(
mask = valid_src & valid_tgt

# Find invalid edges
invalid_edges = [tuple(edge) for edge in edge_ids[~mask]]
all_edges_valid = not invalid_edges
invalid_edges = np.asarray([tuple(edge) for edge in edge_ids[~mask]])
all_edges_valid = len(invalid_edges) == 0
return all_edges_valid, invalid_edges
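A small sketch of the updated return contract (invalid edges now come back as an `np.ndarray` rather than a list of tuples); the arrays below are illustrative only:

```python
import numpy as np

from geff.validate.graph import validate_nodes_for_edges

node_ids = np.array([1, 2, 3], dtype=np.uint64)
edge_ids = np.array([[1, 2], [2, 9]], dtype=np.uint64)   # node 9 does not exist

all_valid, invalid_edges = validate_nodes_for_edges(node_ids, edge_ids)
# all_valid -> False; invalid_edges -> array([[2, 9]])
```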

