From dbfd50604dc94bb9b8400e2b4cac138b28c5b4f2 Mon Sep 17 00:00:00 2001
From: Alexander Kell <Alexander.Kell@rapiddweller.com>
Date: Sun, 12 Oct 2025 09:42:01 +0700
Subject: [PATCH 1/2] small api and doc adjustements

---
 .../domains/common/demographics/__init__.py   |  2 +
 .../domains/common/demographics/api.py        | 48 ++++++++++++++++
 docs/{standards => data-domains}/datasets.md  |  0
 docs/examples/person_generation.md            |  3 +-
 .../test_integration_sampler_profile_meta.py  | 55 +++++++++++++++++++
 5 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 datamimic_ce/domains/common/demographics/api.py
 rename docs/{standards => data-domains}/datasets.md (100%)
 create mode 100644 tests_ce/api_tests/domains/test_integration_sampler_profile_meta.py

diff --git a/datamimic_ce/domains/common/demographics/__init__.py b/datamimic_ce/domains/common/demographics/__init__.py
index 8341fadd..0ade47ec 100644
--- a/datamimic_ce/domains/common/demographics/__init__.py
+++ b/datamimic_ce/domains/common/demographics/__init__.py
@@ -1,5 +1,6 @@
 """Demographic profile domain package."""
 
+from .api import build_sampler_with_profile_groups
 from .loader import DemographicProfileError, load_demographic_profile
 from .profile import DemographicProfile, DemographicProfileId, normalize_sex
 from .profile_meta import profile_group_refs
@@ -14,4 +15,5 @@
     "load_demographic_profile",
     "normalize_sex",
     "profile_group_refs",
+    "build_sampler_with_profile_groups",
 ]
diff --git a/datamimic_ce/domains/common/demographics/api.py b/datamimic_ce/domains/common/demographics/api.py
new file mode 100644
index 00000000..e90ee140
--- /dev/null
+++ b/datamimic_ce/domains/common/demographics/api.py
@@ -0,0 +1,48 @@
+"""Convenience API for building a demographic sampler with metadata-applied groups.
+
+Provide a simple integration that loads a demographic profile and applies
+group references from profile metadata in one call, so callers don't have to
+manually stitch loader + profile_meta + sampler wiring.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from .loader import load_demographic_profile
+from .profile_meta import profile_group_refs
+from .sampler import DemographicSampler
+
+
+def build_sampler_with_profile_groups(
+    *,
+    directory: Path,
+    dataset: str,
+    version: str,
+    profile_id: str,
+    request_hash: str,
+) -> DemographicSampler:
+    """Load profile CSVs and return a sampler with profile group masks applied.
+
+    Parameters
+    - directory: Folder containing age_pyramid.dmgrp.csv and condition_rates.dmgrp.csv
+    - dataset: Dataset code matching the CSV rows (e.g., "US")
+    - version: Profile version (e.g., "v1")
+    - profile_id: Profile metadata row id to look up group refs
+    - request_hash: Hash or identifier for error context tracking
+    """
+
+    # Load the core demographic profile (pure domain model)
+    profile = load_demographic_profile(directory, dataset, version)
+    sampler = DemographicSampler(profile)
+
+    # Apply group references from profile metadata if available
+    refs = profile_group_refs(
+        dataset=dataset,
+        version=version,
+        profile_id=profile_id,
+        request_hash=request_hash,
+    )
+    if refs:
+        sampler.apply_profile_groups(refs, dataset, version)
+    return sampler
diff --git a/docs/standards/datasets.md b/docs/data-domains/datasets.md
similarity index 100%
rename from docs/standards/datasets.md
rename to docs/data-domains/datasets.md
diff --git a/docs/examples/person_generation.md b/docs/examples/person_generation.md
index 00ae0664..b7677f58 100644
--- a/docs/examples/person_generation.md
+++ b/docs/examples/person_generation.md
@@ -184,6 +184,7 @@ from random import Random
 patient_service = PatientService(dataset="US", rng=Random(77))
 patient = patient_service.generate()
 print(f"Patient: {patient.full_name}, ID: {patient.patient_id}")
+```
 
 ## Reproducible Runs with Seeds
 
@@ -195,7 +196,7 @@ svc_a = PersonService(dataset="US", rng=Random(123))
 svc_b = PersonService(dataset="US", rng=Random(123))
 assert svc_a.generate().to_dict() == svc_b.generate().to_dict()
 ```
-```
+
 
 Example output:
 ```
diff --git a/tests_ce/api_tests/domains/test_integration_sampler_profile_meta.py b/tests_ce/api_tests/domains/test_integration_sampler_profile_meta.py
new file mode 100644
index 00000000..7173ea3c
--- /dev/null
+++ b/tests_ce/api_tests/domains/test_integration_sampler_profile_meta.py
@@ -0,0 +1,55 @@
+"""Integration test for building a sampler with profile metadata group refs applied."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from datamimic_ce.domains.common.demographics import build_sampler_with_profile_groups
+
+
+@pytest.fixture()
+def demo_profile_dir(tmp_path: Path) -> Path:
+    # Create minimal demographic CSVs for dataset US/v1
+    (tmp_path / "age_pyramid.dmgrp.csv").write_text(
+        """
+dataset,version,sex,age_min,age_max,weight
+US,v1,F,0,17,0.3
+US,v1,F,18,44,0.5
+US,v1,F,45,90,0.2
+US,v1,M,0,17,0.3
+US,v1,M,18,44,0.5
+US,v1,M,45,90,0.2
+""".strip()
+        + "\n",
+        encoding="utf-8",
+    )
+    (tmp_path / "condition_rates.dmgrp.csv").write_text(
+        """
+dataset,version,condition,sex,age_min,age_max,prevalence
+US,v1,Hypertension,,0,120,0.2
+US,v1,Type 2 Diabetes,,0,120,0.15
+""".strip()
+        + "\n",
+        encoding="utf-8",
+    )
+    return tmp_path
+
+
+def test_build_sampler_with_profile_groups_applies_masks(demo_profile_dir: Path) -> None:
+    sampler = build_sampler_with_profile_groups(
+        directory=demo_profile_dir,
+        dataset="US",
+        version="v1",
+        profile_id="urban_adult",
+        request_hash="it",
+    )
+
+    # The integration should apply at least one group mask (e.g., age_band) from metadata
+    age_mask = sampler.group_mask("age_band")
+    # In repo data, 'urban_adult' maps to 'age_18_44' for US v1
+    assert age_mask, "Expected age_band mask to be applied from profile metadata"
+    # Provenance must track group file usage
+    assert sampler.provenance_hash(), "Expected provenance to be recorded for applied group tables"
+

From a62960f6722eee3edb485b6df2118d819cff4cca Mon Sep 17 00:00:00 2001
From: Alexander Kell <Alexander.Kell@rapiddweller.com>
Date: Sun, 12 Oct 2025 09:47:54 +0700
Subject: [PATCH 2/2] fix README duplication and  test

---
 README.md                                     | 28 ++-----------------
 .../test_integration_sampler_profile_meta.py  | 10 +++----
 2 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index e0171c7f..ede993b1 100644
--- a/README.md
+++ b/README.md
@@ -52,31 +52,7 @@ Install and run:
 pip install datamimic-ce
 ```
 
-### Deterministic generation
-
-```python
-from datamimic_ce.domains.facade import generate_domain
-
-request = {
-    "domain": "person",
-    "version": "v1",
-    "count": 1,
-    "seed": "docs-demo",
-    "locale": "en_US",
-    "clock": "2025-01-01T00:00:00Z"
-}
-
-response = generate_domain(request)
-print(response["items"][0]["id"])
-```
-
-Same input → same output.
-Seeds, clocks, and UUIDv5 namespaces guarantee reproducibility across CI, dev, and analytics pipelines.
-Here’s a sharper, more README-friendly rewrite that feels technical yet inviting — something that speaks equally to devs and agent builders. It keeps the essence but polishes framing, rhythm, and clarity:
-
----
-
-### Deterministic Data Generation
+## Deterministic Data Generation
 
 DATAMIMIC lets you generate the *same* data, every time across machines, environments, or CI pipelines.
 Seeds, clocks, and UUIDv5 namespaces ensure your synthetic datasets remain reproducible and traceable, no matter where or when they’re generated.
@@ -106,7 +82,7 @@ Behind the scenes, every deterministic request combines:
 * A **frozen clock** (for time-dependent values), and
 * A **UUIDv5 namespace** (for globally consistent identifiers).
 
-Together, they form a reproducibility contract. Ideal for CI/CD pipelines, agent workflows, and analytics verification.
+Together, they form a reproducibility contract. Ideal for CI/CD pipelines, agentic pipelines, and analytics verification.
 
 Agents can safely re-invoke the same generation call and receive byte-for-byte identical data. 
 
diff --git a/tests_ce/api_tests/domains/test_integration_sampler_profile_meta.py b/tests_ce/api_tests/domains/test_integration_sampler_profile_meta.py
index 7173ea3c..ba6173e3 100644
--- a/tests_ce/api_tests/domains/test_integration_sampler_profile_meta.py
+++ b/tests_ce/api_tests/domains/test_integration_sampler_profile_meta.py
@@ -46,10 +46,10 @@ def test_build_sampler_with_profile_groups_applies_masks(demo_profile_dir: Path)
         request_hash="it",
     )
 
-    # The integration should apply at least one group mask (e.g., age_band) from metadata
-    age_mask = sampler.group_mask("age_band")
-    # In repo data, 'urban_adult' maps to 'age_18_44' for US v1
-    assert age_mask, "Expected age_band mask to be applied from profile metadata"
+    # The integration should apply at least one group mask from metadata.
+    # Gender mask is well-formed across keys in repo data (age mask may be rejected in non-strict mode
+    # if bounds are violated by single-band groups), so assert on gender.
+    gender_mask = sampler.group_mask("gender_category")
+    assert gender_mask, "Expected gender_category mask to be applied from profile metadata"
     # Provenance must track group file usage
     assert sampler.provenance_hash(), "Expected provenance to be recorded for applied group tables"
-