Add RandomAllocationSampling to jax_privacy.batch_selection

Ryan McKenna · copybara-github · commit 93a9632b85ab · 2026-05-31T18:02:50.000-07:00
PiperOrigin-RevId: 924347562
diff --git a/jax_privacy/batch_selection.py b/jax_privacy/batch_selection.py
@@ -61,7 +61,6 @@
 from jax_privacy import sharding_utils
 import numpy as np
 
-
 RngType = np.random.Generator | int | None
 
 
@@ -301,6 +300,65 @@ def batch_iterator(
       yield groups[i % self.cycle_length]
 
 
+@dataclasses.dataclass(frozen=True)
+class RandomAllocationSampling(BatchSelectionStrategy):
+  """Implements k-out-of-t random allocation (aka balanced-iteration sampling).
+
+  Each example independently selects exactly k steps (out of `iterations`
+  total) to participate in, uniformly at random. See
+  https://arxiv.org/abs/2602.17284 and https://arxiv.org/abs/2605.07072 for
+  details about this strategy.
+
+  Formal guarantees of the batch_iterator:
+    - All batches consist of indices in the range [0, num_examples).
+    - Each example appears in exactly k of the `iterations` batches, with the
+      k steps chosen uniformly at random without replacement from
+      [0, iterations).
+    - The allocation for each example is independent of all other examples.
+
+  Attributes:
+    num_participations: The number of steps each example participates in (k).
+      Must satisfy 0 <= num_participations <= iterations.
+    iterations: The total number of iterations / batches to generate (t).
+
+  Raises:
+    ValueError: If num_participations is outside [0, iterations].
+  """
+
+  num_participations: int
+  iterations: int
+
+  def __post_init__(self) -> None:
+    # Outside [0, t] the per-step probabilities r/(t-i) leave [0, 1] and the
+    # "exactly k participations" guarantee silently breaks, so fail fast.
+    if not 0 <= self.num_participations <= self.iterations:
+      raise ValueError(
+          'num_participations must be in [0, iterations], got'
+          f' num_participations={self.num_participations} and'
+          f' iterations={self.iterations}.'
+      )
+
+  def batch_iterator(
+      self, num_examples: int, rng: RngType = None
+  ) -> Iterator[np.ndarray]:
+    """Yields one array of participating example indices per iteration."""
+    rng = np.random.default_rng(rng)
+    # A negative argument selects a signed integer dtype wide enough for the
+    # indices (for num_examples == 0 NumPy returns an unsigned dtype instead).
+    dtype = np.min_scalar_type(-num_examples)
+    # At step i, each example with r remaining participations and (t-i)
+    # remaining steps participates with probability r/(t-i). This is equivalent
+    # to each example choosing k steps uniformly without replacement, but uses
+    # only O(n) space instead of O(n*k).
+    remaining = np.full(num_examples, self.num_participations)
+    for i in range(self.iterations):
+      probs = remaining / (self.iterations - i)
+      mask = rng.random(num_examples) < probs
+      yield np.where(mask)[0].astype(dtype)
+      # mask is boolean, so this decrements exactly the selected examples.
+      remaining -= mask
+
+
 @dataclasses.dataclass(frozen=True)
 class FixedBatchSampling(BatchSelectionStrategy):
   """Implements fixed-size batch sampling.
diff --git a/tests/batch_selection_test.py b/tests/batch_selection_test.py
@@ -242,6 +242,92 @@ def test_balls_in_bins_sampling_with_large_cycle_length(self):
     _check_no_repeated_indices(batches[:cycle_length])
     _check_cyclic_property(batches, cycle_length)
 
+  @parameterized.product(
+      num_examples=[10, 100],
+      num_participations=[1, 3, 5],
+      iterations=[10, 20],
+  )
+  def test_random_allocation_sampling(
+      self, num_examples, num_participations, iterations
+  ):
+    """Checks every example lands in exactly k distinct batches out of t."""
+    sampler = batch_selection.RandomAllocationSampling(
+        num_participations=num_participations, iterations=iterations
+    )
+    produced = list(sampler.batch_iterator(num_examples, rng=0))
+    self.assertLen(produced, iterations)
+    _check_element_range(produced, num_examples)
+    _check_signed_indices(produced)
+    _check_max_participation(produced, num_participations)
+    # Tally participations over all steps; each tally must equal k, not just
+    # stay below it.
+    flat = np.concatenate(produced)
+    tally = collections.Counter(int(idx) for idx in flat)
+    for example in range(num_examples):
+      self.assertEqual(tally[example], num_participations)
+    # A batch is a set of examples: duplicates within one step are not allowed.
+    for step_batch in produced:
+      self.assertEqual(len(step_batch), len(set(step_batch.tolist())))
+
+  def test_random_allocation_sampling_k_equals_zero(self):
+    """With k=0 no example ever participates, so every batch is empty."""
+    iterations = 5
+    sampler = batch_selection.RandomAllocationSampling(
+        num_participations=0, iterations=iterations
+    )
+    produced = list(sampler.batch_iterator(10, rng=0))
+    self.assertLen(produced, iterations)
+    for step_batch in produced:
+      self.assertEmpty(step_batch)
+
+  def test_random_allocation_sampling_k_equals_t(self):
+    """With k == t each batch must contain all of the examples."""
+    num_examples, steps = 10, 5
+    sampler = batch_selection.RandomAllocationSampling(
+        num_participations=steps, iterations=steps
+    )
+    produced = list(sampler.batch_iterator(num_examples, rng=0))
+    self.assertLen(produced, steps)
+    for step_batch in produced:
+      self.assertLen(step_batch, num_examples)
+    _check_element_range(produced, num_examples)
+
+  def test_random_allocation_sampling_expected_batch_size(self):
+    """Mean batch size equals n*k/t, as all n*k participations are emitted."""
+    n, k, t = 1000, 3, 50
+    sampler = batch_selection.RandomAllocationSampling(
+        num_participations=k,
+        iterations=t,
+    )
+    sizes = [len(b) for b in sampler.batch_iterator(n, rng=0)]
+    # Divide by the configured step count t, matching the per-iteration
+    # average that the expected value n*k/t describes.
+    mean_size = sum(sizes) / t
+    target = n * k / t
+    self.assertAlmostEqual(mean_size, target, delta=1e-5)
+
+  def test_random_allocation_sampling_is_deterministic(self):
+    """Two runs seeded identically must produce identical batch sequences."""
+    sampler = batch_selection.RandomAllocationSampling(
+        num_participations=2, iterations=10
+    )
+    first_run = list(sampler.batch_iterator(50, rng=0))
+    second_run = list(sampler.batch_iterator(50, rng=0))
+    # strict=True also fails the test if the runs differ in length.
+    for lhs, rhs in zip(first_run, second_run, strict=True):
+      np.testing.assert_array_equal(lhs, rhs)
+
+  def test_random_allocation_sampling_zero_examples(self):
+    """An empty dataset still yields t batches, all of them empty."""
+    iterations = 5
+    sampler = batch_selection.RandomAllocationSampling(
+        num_participations=2, iterations=iterations
+    )
+    produced = list(sampler.batch_iterator(0, rng=0))
+    self.assertLen(produced, iterations)
+    for step_batch in produced:
+      self.assertEmpty(step_batch)
+
   def test_cyclic_poisson_sampling_independent_is_deterministic(self):
     """CyclicPoissonSampling should respect the provided RNG."""
     strategy = batch_selection.CyclicPoissonSampling(