Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions icechunk-python/python/icechunk/testing/stateful_subsets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""Helpers for running Hypothesis stateful tests on rule subsets.

Hypothesis doesn't provide an API for testing specific rule subsets
(see https://github.com/HypothesisWorks/hypothesis/issues/4682).

This module provides a workaround: dynamically subclass a state machine and
override excluded rules with plain methods, which strips the ``@rule`` marker
so Hypothesis ignores them.
"""

from __future__ import annotations

import inspect
from collections.abc import Sequence

from hypothesis import settings as Settings
from hypothesis.stateful import (
RuleBasedStateMachine,
run_state_machine_as_test,
)

# Private Hypothesis constant — stable, needed to discover which methods are rules.
RULE_MARKER = "hypothesis_stateful_rule"
PRECONDITIONS_MARKER = "hypothesis_stateful_preconditions"


def test_subsets(
machine_cls: type[RuleBasedStateMachine],
*,
with_rules: Sequence[tuple[set[str], Settings | int]] = (),
without_rules: Sequence[tuple[set[str], Settings | int]] = (),
) -> None:
"""Run *machine_cls* multiple times, each time with a different rule subset.

Parameters
----------
machine_cls : type[RuleBasedStateMachine]
The base state machine class.
with_rules : Sequence[tuple[set[str], Settings | int]]
Each entry is ``(rule_names_to_keep, settings_or_max_examples)``.
All rules **not** in the set are disabled for that run.
without_rules : Sequence[tuple[set[str], Settings | int]]
Each entry is ``(rule_names_to_remove, settings_or_max_examples)``.
The listed rules are disabled; everything else is kept.

Raises
------
ValueError
If a rule name doesn't exist on the machine.

Examples
--------
Run only GC-related rules with 50 examples::

test_subsets(
MyMachine,
with_rules=[
({"commit", "expire", "gc"}, settings(max_examples=50)),
],
)
"""
all_names = {
name
for name, f in inspect.getmembers(machine_cls)
if getattr(f, RULE_MARKER, None) is not None
}

for keep, s in with_rules:
_validate(keep, all_names)
_run_subset(
machine_cls, all_names, remove=all_names - keep, settings=_to_settings(s)
)

for remove, s in without_rules:
_validate(remove, all_names)
_run_subset(machine_cls, all_names, remove=remove, settings=_to_settings(s))


def _validate(names: set[str], all_names: set[str]) -> None:
unknown = names - all_names
if unknown:
raise ValueError(f"Unknown rules: {unknown}. Available: {sorted(all_names)}")


def _to_settings(s: Settings | int) -> Settings:
if isinstance(s, int):
return Settings(parent=Settings(), max_examples=s)
return s


def _run_subset(
machine_cls: type[RuleBasedStateMachine],
all_names: set[str],
remove: set[str],
settings: Settings,
) -> None:
"""Create a subclass with *remove* rules disabled, then run it."""
if not remove:
raise ValueError("remove is empty — this would run the full machine unchanged")

def strip_rule(name: str):
"""Walk the __wrapped__ chain until we find a function without the rule marker."""
func = getattr(machine_cls, name)
while hasattr(func, RULE_MARKER) or hasattr(func, PRECONDITIONS_MARKER):
if not hasattr(func, "__wrapped__"):
raise ValueError(
f"{name} has {RULE_MARKER} or {PRECONDITIONS_MARKER} "
f"but no __wrapped__ — cannot strip rule"
)
func = func.__wrapped__
return func

overrides = {name: strip_rule(name) for name in remove}
kept = sorted(all_names - remove)
subset_name = f"{machine_cls.__name__}[{'+'.join(kept)}]"
subset_cls = type(subset_name, (machine_cls,), overrides)
run_state_machine_as_test(subset_cls, settings=settings)
102 changes: 100 additions & 2 deletions icechunk-python/tests/test_stateful_repo_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import operator
import sys
import textwrap
import time
from collections.abc import Iterator
from dataclasses import dataclass, fields
from functools import partial
Expand All @@ -27,7 +28,7 @@
import hypothesis.extra.numpy as npst
import hypothesis.strategies as st
import pytest
from hypothesis import assume, event, note
from hypothesis import assume, event, note, settings
from hypothesis.stateful import (
Bundle,
RuleBasedStateMachine,
Expand Down Expand Up @@ -676,6 +677,19 @@ def new_store(self) -> None:
def sync_store(self) -> NewSyncStoreWrapper:
return NewSyncStoreWrapper(self.session.store)

@precondition(lambda self: self.model.branch is not None)
@rule(target=commits)
def empty_commit(self) -> str:
"""Make a single empty commit with a small sleep for timestamp spread."""
time.sleep(0.005)
branch = self.session.branch
assert branch is not None
snap_id = self.session.commit("empty", allow_empty=True)
snapinfo = next(iter(self.repo.ancestry(branch=branch)))
self.session = self.repo.writable_session(branch)
self.model.commit(snapinfo)
return snap_id

@precondition(lambda self: self.model.branch is not None)
@rule(data=st.data(), ncommits=st.integers(3, 10), target=commits)
def set_and_commit(self, data: st.DataObject, ncommits: int) -> Any:
Expand Down Expand Up @@ -1044,7 +1058,7 @@ def expire_snapshots(
# Check that expired snapshots are actually removed from ancestry
remaining_snapshot_ids = set()
for branch in branches_after:
remaining_snapshot_ids.add(
remaining_snapshot_ids.update(
snapshot.id for snapshot in self.repo.ancestry(branch=branch)
)

Expand Down Expand Up @@ -1260,3 +1274,87 @@ def check_ops_log(self) -> None:


VersionControlTest = VersionControlStateMachine.TestCase

# ---- Focused expire tests ----

GC_RULES = {
"empty_commit",
"reset_branch",
"expire_snapshots",
}


def test_expire_subset():
"""Run GC-related rules in isolation via VersionControlStateMachine subset."""
from icechunk.testing.stateful_subsets import test_subsets

test_subsets(
VersionControlStateMachine,
with_rules=[(GC_RULES, settings(max_examples=200, stateful_step_count=30))],
)


class ExpireOrphanMachine(RuleBasedStateMachine):
"""Minimal machine exploring GC expire + branch reset interactions."""

commits = Bundle("commits")

def __init__(self):
super().__init__()
self.repo = None
self.storage = None

@initialize(target=commits)
def init(self):
self.storage = ic.in_memory_storage()
self.repo = ic.Repository.create(storage=self.storage, spec_version=2)
return INITIAL_SNAPSHOT

@rule(target=commits)
def make_commit(self):
time.sleep(0.005)
s = self.repo.writable_session(DEFAULT_BRANCH)
return s.commit("commit", allow_empty=True)

@rule(commit=commits)
def reset_to(self, commit):
try:
self.repo.reset_branch(DEFAULT_BRANCH, commit)
except ic.IcechunkError:
pass

@rule(data=st.data())
def expire(self, data):
objects = list(self.storage.list_objects_metadata(prefix="snapshots"))
if len(objects) < 3:
return
times = sorted(set(obj.created_at for obj in objects))
if len(times) < 2:
return
idx = data.draw(st.integers(0, len(times) - 2))
older_than = times[idx] + datetime.timedelta(microseconds=1)
self.repo.expire_snapshots(older_than)

@invariant()
def check_ancestry(self):
if self.repo is None:
return
for branch in self.repo.list_branches():
ancestry = list(self.repo.ancestry(branch=branch))
assert ancestry[-1].id == INITIAL_SNAPSHOT, (
f"ancestry from {branch} terminates at {ancestry[-1].id}, "
f"expected {INITIAL_SNAPSHOT}"
)
assert ancestry[-1].parent_id is None
for snap in ancestry[:-1]:
assert snap.parent_id is not None, (
f"Snapshot {snap.id} has parent_id=None but is not root"
)


ExpireOrphanTest = ExpireOrphanMachine.TestCase
ExpireOrphanTest.settings = settings(
max_examples=200,
stateful_step_count=30,
deadline=None,
)
4 changes: 3 additions & 1 deletion icechunk/src/ops/gc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -954,7 +954,9 @@ async fn expire_v2_one_attempt(
return Some(new_parent.clone());
}
}
None
// Snapshot not reachable from any ref (e.g. branch was reset away).
// Re-parent to the initial snapshot rather than orphaning with None.
Some(Snapshot::INITIAL_SNAPSHOT_ID)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In #1932 i change the expectation so we assert that ancestry ends at a node with parent_id == None.

};

debug!("Finding ref tips");
Expand Down
Loading