Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 2 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,31 +52,7 @@ Install and run:
pip install datamimic-ce
```

### Deterministic generation

```python
from datamimic_ce.domains.facade import generate_domain

request = {
"domain": "person",
"version": "v1",
"count": 1,
"seed": "docs-demo",
"locale": "en_US",
"clock": "2025-01-01T00:00:00Z"
}

response = generate_domain(request)
print(response["items"][0]["id"])
```

Same input → same output.
Seeds, clocks, and UUIDv5 namespaces guarantee reproducibility across CI, dev, and analytics pipelines.

---

### Deterministic Data Generation
## Deterministic Data Generation

DATAMIMIC lets you generate the *same* data, every time across machines, environments, or CI pipelines.
Seeds, clocks, and UUIDv5 namespaces ensure your synthetic datasets remain reproducible and traceable, no matter where or when they’re generated.
Expand Down Expand Up @@ -106,7 +82,7 @@ Behind the scenes, every deterministic request combines:
* A **frozen clock** (for time-dependent values), and
* A **UUIDv5 namespace** (for globally consistent identifiers).

Together, they form a reproducibility contract. Ideal for CI/CD pipelines, agent workflows, and analytics verification.
Together, they form a reproducibility contract. Ideal for CI/CD pipelines, agentic pipelines, and analytics verification.

Agents can safely re-invoke the same generation call and receive byte-for-byte identical data.

Expand Down
2 changes: 2 additions & 0 deletions datamimic_ce/domains/common/demographics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Demographic profile domain package."""

from .api import build_sampler_with_profile_groups
from .loader import DemographicProfileError, load_demographic_profile
from .profile import DemographicProfile, DemographicProfileId, normalize_sex
from .profile_meta import profile_group_refs
Expand All @@ -14,4 +15,5 @@
"load_demographic_profile",
"normalize_sex",
"profile_group_refs",
"build_sampler_with_profile_groups",
]
48 changes: 48 additions & 0 deletions datamimic_ce/domains/common/demographics/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Convenience API for building a demographic sampler with metadata-applied groups.

Provide a simple integration that loads a demographic profile and applies
group references from profile metadata in one call, so callers don't have to
manually stitch loader + profile_meta + sampler wiring.
"""

from __future__ import annotations

from pathlib import Path

from .loader import load_demographic_profile
from .profile_meta import profile_group_refs
from .sampler import DemographicSampler


def build_sampler_with_profile_groups(
    *,
    directory: Path,
    dataset: str,
    version: str,
    profile_id: str,
    request_hash: str,
) -> DemographicSampler:
    """Build a DemographicSampler from profile CSVs with metadata group masks applied.

    One-call wiring of loader + profile_meta + sampler so callers don't have
    to stitch the pieces together themselves.

    Parameters
    - directory: Folder containing age_pyramid.dmgrp.csv and condition_rates.dmgrp.csv
    - dataset: Dataset code matching the CSV rows (e.g., "US")
    - version: Profile version (e.g., "v1")
    - profile_id: Profile metadata row id to look up group refs
    - request_hash: Hash or identifier for error context tracking
    """
    # Pure domain model first: parse the CSVs into a profile and wrap it.
    demographic_profile = load_demographic_profile(directory, dataset, version)
    sampler = DemographicSampler(demographic_profile)

    # Metadata lookup for group references; only apply masks when present.
    group_refs = profile_group_refs(
        dataset=dataset,
        version=version,
        profile_id=profile_id,
        request_hash=request_hash,
    )
    if group_refs:
        sampler.apply_profile_groups(group_refs, dataset, version)

    return sampler
File renamed without changes.
3 changes: 2 additions & 1 deletion docs/examples/person_generation.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ from random import Random
patient_service = PatientService(dataset="US", rng=Random(77))
patient = patient_service.generate()
print(f"Patient: {patient.full_name}, ID: {patient.patient_id}")
```

## Reproducible Runs with Seeds

Expand All @@ -195,7 +196,7 @@ svc_a = PersonService(dataset="US", rng=Random(123))
svc_b = PersonService(dataset="US", rng=Random(123))
assert svc_a.generate().to_dict() == svc_b.generate().to_dict()
```
```


Example output:
```
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Integration test for building a sampler with profile metadata group refs applied."""

from __future__ import annotations

from pathlib import Path

import pytest

from datamimic_ce.domains.common.demographics import build_sampler_with_profile_groups


@pytest.fixture()
def demo_profile_dir(tmp_path: Path) -> Path:
    """Populate a temp directory with minimal US/v1 demographic CSV fixtures."""
    age_pyramid_csv = """
dataset,version,sex,age_min,age_max,weight
US,v1,F,0,17,0.3
US,v1,F,18,44,0.5
US,v1,F,45,90,0.2
US,v1,M,0,17,0.3
US,v1,M,18,44,0.5
US,v1,M,45,90,0.2
""".strip()
    condition_rates_csv = """
dataset,version,condition,sex,age_min,age_max,prevalence
US,v1,Hypertension,,0,120,0.2
US,v1,Type 2 Diabetes,,0,120,0.15
""".strip()

    # Write both CSVs with a trailing newline, as the loader expects text files.
    (tmp_path / "age_pyramid.dmgrp.csv").write_text(age_pyramid_csv + "\n", encoding="utf-8")
    (tmp_path / "condition_rates.dmgrp.csv").write_text(condition_rates_csv + "\n", encoding="utf-8")
    return tmp_path


def test_build_sampler_with_profile_groups_applies_masks(demo_profile_dir: Path) -> None:
    """The one-call API should load CSVs and apply metadata group masks."""
    sampler = build_sampler_with_profile_groups(
        directory=demo_profile_dir,
        dataset="US",
        version="v1",
        profile_id="urban_adult",
        request_hash="it",
    )

    # At least one group mask from metadata must land on the sampler.
    # Gender mask is well-formed across keys in repo data (age mask may be rejected in
    # non-strict mode if bounds are violated by single-band groups), so assert on gender.
    assert sampler.group_mask(
        "gender_category"
    ), "Expected gender_category mask to be applied from profile metadata"
    # Provenance must track group file usage
    assert sampler.provenance_hash(), "Expected provenance to be recorded for applied group tables"