From f3345f7187f9df4fd7160b652e5a23ede4af94b8 Mon Sep 17 00:00:00 2001 From: Samuel Monson Date: Tue, 10 Jun 2025 13:54:50 -0400 Subject: [PATCH 1/8] Add fixed prefix option to synthetic data Signed-off-by: Samuel Monson --- src/guidellm/dataset/synthetic.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/guidellm/dataset/synthetic.py b/src/guidellm/dataset/synthetic.py index 9868ab52..854f478f 100644 --- a/src/guidellm/dataset/synthetic.py +++ b/src/guidellm/dataset/synthetic.py @@ -25,6 +25,11 @@ class SyntheticDatasetConfig(BaseModel): + prefix_tokens: int = Field( + description="The number of shared prefix tokens to prepend to each prompt.", + ge=0, + default=0, + ) prompt_tokens: int = Field( description="The average number of text tokens generated for prompts.", gt=0, @@ -164,6 +169,10 @@ def __iter__( # ensure diff distribution from output tokens rand = random.Random(self.random_seed + 2) # noqa: S311 + prefix_index = rand.randint(0, len(self.text_creator.words)) + prefix_tokens = self.config.prefix_tokens + prefix = self._create_prompt(prefix_tokens, prefix_index) + for _, prompt_tokens, output_tokens in zip( range(self.config.samples), prompt_tokens_sampler, @@ -171,8 +180,8 @@ def __iter__( ): start_index = rand.randint(0, len(self.text_creator.words)) yield { - "prompt": self._create_prompt(prompt_tokens, start_index), - "prompt_tokens_count": prompt_tokens, + "prompt": prefix + self._create_prompt(prompt_tokens, start_index), + "prompt_tokens_count": prefix_tokens + prompt_tokens, "output_tokens_count": output_tokens, } From e4560e238e9f7aa792d58625f67cea5f92874499 Mon Sep 17 00:00:00 2001 From: Samuel Monson Date: Tue, 10 Jun 2025 14:35:18 -0400 Subject: [PATCH 2/8] Add prefix before decode Signed-off-by: Samuel Monson --- src/guidellm/dataset/synthetic.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/guidellm/dataset/synthetic.py b/src/guidellm/dataset/synthetic.py index 854f478f..94dd3aa6 100644 --- a/src/guidellm/dataset/synthetic.py +++ b/src/guidellm/dataset/synthetic.py @@ -170,8 +170,7 @@ def __iter__( rand = random.Random(self.random_seed + 2) # noqa: S311 prefix_index = rand.randint(0, len(self.text_creator.words)) - prefix_tokens = self.config.prefix_tokens - prefix = self._create_prompt(prefix_tokens, prefix_index) + prefix_tokens = self._create_prompt(self.config.prefix_tokens, prefix_index) for _, prompt_tokens, output_tokens in zip( range(self.config.samples), @@ -179,15 +178,19 @@ def __iter__( output_tokens_sampler, ): start_index = rand.randint(0, len(self.text_creator.words)) + prompt_text = self.processor.decode( + prefix_tokens + self._create_prompt(prompt_tokens, start_index), + skip_special_tokens=True, + ) yield { - "prompt": prefix + self._create_prompt(prompt_tokens, start_index), - "prompt_tokens_count": prefix_tokens + prompt_tokens, + "prompt": prompt_text, + "prompt_tokens_count": self.config.prefix_tokens + prompt_tokens, "output_tokens_count": output_tokens, } - def _create_prompt(self, prompt_tokens: int, start_index: int) -> str: + def _create_prompt(self, prompt_tokens: int, start_index: int) -> list[int]: if prompt_tokens <= 0: - return "" + return [] left = start_index right = start_index + 4 * prompt_tokens @@ -195,16 +198,17 @@ def _create_prompt(self, prompt_tokens: int, start_index: int) -> str: while left < right: mid = (left + right) // 2 test_prompt = self.text_creator.create_text(start_index, mid - start_index) - test_tokens = 
len(self.processor.tokenize(test_prompt))
+            test_tokens = self.processor.encode(test_prompt)
 
-            if test_tokens == prompt_tokens:
-                return test_prompt
-            elif test_tokens < prompt_tokens:
+            if len(test_tokens) == prompt_tokens:
+                return test_tokens
+            elif len(test_tokens) < prompt_tokens:
                 left = mid + 1
             else:
                 right = mid
 
-        return self.text_creator.create_text(start_index, left - start_index)
+        final_text = self.text_creator.create_text(start_index, left - start_index)
+        return self.processor.encode(final_text)
 
 
 class SyntheticDatasetCreator(DatasetCreator):

From c748f007d65a3deedb5b5dd900585d82c8dec5b8 Mon Sep 17 00:00:00 2001
From: Samuel Monson
Date: Wed, 11 Jun 2025 13:20:28 -0400
Subject: [PATCH 3/8] Document prefix_tokens arg

Signed-off-by: Samuel Monson
---
 docs/datasets.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/datasets.md b/docs/datasets.md
index a5d0aa4e..781b23b8 100644
--- a/docs/datasets.md
+++ b/docs/datasets.md
@@ -76,6 +76,7 @@ guidellm benchmark \
 - `output_tokens_stdev`: Standard deviation for output tokens. If not supplied and min/max are not specified, no deviation is applied. If not supplied and min/max are specified, a uniform distribution is used.
 - `output_tokens_min`: Minimum number of tokens in outputs. If unset and `output_tokens_stdev` is set, the minimum is 1.
 - `output_tokens_max`: Maximum number of tokens in outputs. If unset and `output_tokens_stdev` is set, the maximum is 5 times the standard deviation.
+- `prefix_tokens`: Number of tokens to share as a prefix across all prompts. This is additive to the prompt tokens distribution, so each request totals `prefix_tokens + prompt_tokens_sample()` prompt tokens. If unset, defaults to 0.
 - `samples`: Number of samples to generate (default: 1000). More samples will increase the time taken to generate the dataset before benchmarking, but will also decrease the likelihood of caching requests.
 - `source`: Source text for generation (default: `data:prideandprejudice.txt.gz`). This can be any text file, URL containing a text file, or a compressed text file. The text is used to sample from at a word and punctuation granularity and then combined into a single string of the desired lengths.
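For reference, the option documented above is consumed through `SyntheticDatasetConfig.parse_str`, which accepts the same `key=value` syntax as the rest of that list. A minimal sketch of that path (the token counts are illustrative, and a later patch in this series folds the flat `prefix_tokens` field into `prefix_buckets`):

```python
from guidellm.dataset.synthetic import SyntheticDatasetConfig

# key=value form as documented above; the counts here are illustrative
config = SyntheticDatasetConfig.parse_str(
    "prefix_tokens=32,prompt_tokens=256,output_tokens=128,samples=500"
)
assert config.prefix_tokens == 32  # shared prefix prepended to every prompt
assert config.samples == 500
```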
From 85320d23a48c4a6f73a7d15812b71f3a15784157 Mon Sep 17 00:00:00 2001
From: Mehul
Date: Tue, 1 Jul 2025 16:48:01 -0400
Subject: [PATCH 4/8] Add unique single-token prefix to every request

Co-authored-by: Mehul
Co-authored-by: Samuel Monson
Signed-off-by: Samuel Monson
---
 src/guidellm/dataset/synthetic.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/guidellm/dataset/synthetic.py b/src/guidellm/dataset/synthetic.py
index 94dd3aa6..8c30f0f7 100644
--- a/src/guidellm/dataset/synthetic.py
+++ b/src/guidellm/dataset/synthetic.py
@@ -1,6 +1,7 @@
 import json
 import random
 from collections.abc import Iterable, Iterator
+from itertools import cycle
 from pathlib import Path
 from typing import Any, Literal, Optional, Union
 
@@ -168,6 +169,7 @@ def __iter__(
         )
         # ensure diff distribution from output tokens
         rand = random.Random(self.random_seed + 2)  # noqa: S311
+        unique_prefix_iter = cycle(self.processor.get_vocab().values())
 
         prefix_index = rand.randint(0, len(self.text_creator.words))
         prefix_tokens = self._create_prompt(self.config.prefix_tokens, prefix_index)
@@ -179,7 +181,10 @@ def __iter__(
         ):
             start_index = rand.randint(0, len(self.text_creator.words))
             prompt_text = self.processor.decode(
-                prefix_tokens + self._create_prompt(prompt_tokens, start_index),
+                prefix_tokens
+                + self._create_prompt(
+                    prompt_tokens, start_index, next(unique_prefix_iter)
+                ),
                 skip_special_tokens=True,
             )
             yield {
@@ -188,17 +193,20 @@ def __iter__(
                 "output_tokens_count": output_tokens,
             }
 
-    def _create_prompt(self, prompt_tokens: int, start_index: int) -> list[int]:
+    def _create_prompt(
+        self, prompt_tokens: int, start_index: int, unique_prefix: Optional[int] = None
+    ) -> list[int]:
         if prompt_tokens <= 0:
             return []
 
         left = start_index
         right = start_index + 4 * prompt_tokens
+        start_tokens = [unique_prefix] if unique_prefix is not None else []
 
         while left < right:
             mid = (left + right) // 2
             test_prompt = self.text_creator.create_text(start_index, mid - start_index)
-            test_tokens = self.processor.encode(test_prompt)
+            test_tokens = start_tokens + self.processor.encode(test_prompt)
 
             if len(test_tokens) == prompt_tokens:
                 return test_tokens
@@ -208,7 +216,7 @@ def _create_prompt(self, prompt_tokens: int, start_index: int) -> list[int]:
                 right = mid
 
         final_text = self.text_creator.create_text(start_index, left - start_index)
-        return self.processor.encode(final_text)
+        return start_tokens + self.processor.encode(final_text)
 
 
 class SyntheticDatasetCreator(DatasetCreator):

From f25406653b7de7c5956540871421ecf70795c250 Mon Sep 17 00:00:00 2001
From: Samuel Monson
Date: Mon, 18 Aug 2025 16:30:51 -0400
Subject: [PATCH 5/8] Add unit tests

Signed-off-by: Samuel Monson
---
 tests/unit/dataset/__init__.py       |   0
 tests/unit/dataset/test_synthetic.py | 873 +++++++++++++++++++++++++++
 2 files changed, 873 insertions(+)
 create mode 100644 tests/unit/dataset/__init__.py
 create mode 100644 tests/unit/dataset/test_synthetic.py

diff --git a/tests/unit/dataset/__init__.py b/tests/unit/dataset/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/dataset/test_synthetic.py b/tests/unit/dataset/test_synthetic.py
new file mode 100644
index 00000000..e3110fa3
--- /dev/null
+++ b/tests/unit/dataset/test_synthetic.py
@@ -0,0 +1,873 @@
+"""
+Unit tests for guidellm.dataset.synthetic module.
+""" + +import json +import tempfile +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest +import yaml + +from guidellm.dataset.synthetic import ( + SyntheticDatasetConfig, + SyntheticDatasetCreator, + SyntheticTextItemsGenerator, +) + + +class TestSyntheticDatasetConfig: + """Test cases for SyntheticDatasetConfig class. + + ### WRITTEN BY AI ### + """ + + @pytest.mark.smoke + def test_config_creation_with_all_params(self): + """Test creating config with all parameters specified. + + ### WRITTEN BY AI ### + """ + config = SyntheticDatasetConfig( + prefix_tokens=5, + prompt_tokens=100, + prompt_tokens_stdev=10, + prompt_tokens_min=50, + prompt_tokens_max=150, + output_tokens=30, + output_tokens_stdev=5, + output_tokens_min=20, + output_tokens_max=40, + samples=500, + source="custom_text.txt", + ) + + assert config.prefix_tokens == 5 + assert config.prompt_tokens == 100 + assert config.prompt_tokens_stdev == 10 + assert config.prompt_tokens_min == 50 + assert config.prompt_tokens_max == 150 + assert config.output_tokens == 30 + assert config.output_tokens_stdev == 5 + assert config.output_tokens_min == 20 + assert config.output_tokens_max == 40 + assert config.samples == 500 + assert config.source == "custom_text.txt" + + @pytest.mark.regression + def test_parse_json_string(self): + """Test parsing JSON string configuration. + + ### WRITTEN BY AI ### + """ + json_str = json.dumps( + { + "prompt_tokens": 75, + "output_tokens": 25, + "samples": 200, + "source": "test.txt", + "prefix_tokens": 10, + } + ) + + config = SyntheticDatasetConfig.parse_str(json_str) + + assert config.prompt_tokens == 75 + assert config.output_tokens == 25 + assert config.samples == 200 + assert config.source == "test.txt" + assert config.prefix_tokens == 10 + + @pytest.mark.regression + def test_parse_key_value_pairs(self): + """Test parsing key-value pairs configuration. + + ### WRITTEN BY AI ### + """ + kv_str = "prompt_tokens=80,output_tokens=30,samples=300,source=data.txt,prefix_tokens=5" # noqa: E501 + + config = SyntheticDatasetConfig.parse_str(kv_str) + + assert config.prompt_tokens == 80 + assert config.output_tokens == 30 + assert config.samples == 300 + assert config.source == "data.txt" + assert config.prefix_tokens == 5 + + @pytest.mark.sanity + def test_parse_yaml_file(self): + """Test parsing YAML file configuration. + + ### WRITTEN BY AI ### + """ + config_data = { + "prompt_tokens": 60, + "output_tokens": 15, + "samples": 100, + "source": "yaml_test.txt", + "prefix_tokens": 3, + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(config_data, f) + yaml_path = f.name + + try: + config = SyntheticDatasetConfig.parse_str(yaml_path) + + assert config.prompt_tokens == 60 + assert config.output_tokens == 15 + assert config.samples == 100 + assert config.source == "yaml_test.txt" + assert config.prefix_tokens == 3 + finally: + Path(yaml_path).unlink() + + @pytest.mark.sanity + def test_parse_config_file(self): + """Test parsing .config file. 
+ + ### WRITTEN BY AI ### + """ + config_data = { + "prompt_tokens": 90, + "output_tokens": 35, + "samples": 150, + "prefix_tokens": 2, + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".config", delete=False) as f: + yaml.dump(config_data, f) + config_path = f.name + + try: + config = SyntheticDatasetConfig.parse_str(config_path) + + assert config.prompt_tokens == 90 + assert config.output_tokens == 35 + assert config.samples == 150 + assert config.prefix_tokens == 2 + finally: + Path(config_path).unlink() + + @pytest.mark.regression + def test_parse_path_object(self): + """Test parsing with Path object. + + ### WRITTEN BY AI ### + """ + config_data = {"prompt_tokens": 45, "output_tokens": 25} + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(config_data, f) + yaml_path = Path(f.name) + + try: + config = SyntheticDatasetConfig.parse_str(yaml_path) + assert config.prompt_tokens == 45 + assert config.output_tokens == 25 + finally: + yaml_path.unlink() + + @pytest.mark.sanity + def test_parse_invalid_format(self): + """Test parsing invalid format raises ValueError. + + ### WRITTEN BY AI ### + """ + with pytest.raises(ValueError, match="Unsupported data format"): + SyntheticDatasetConfig.parse_str("invalid_format_string") + + @pytest.mark.sanity + def test_validation_positive_values(self): + """Test that negative or zero values are rejected. + + ### WRITTEN BY AI ### + """ + with pytest.raises(ValueError): + SyntheticDatasetConfig(prompt_tokens=0, output_tokens=20) + + with pytest.raises(ValueError): + SyntheticDatasetConfig(prompt_tokens=20, output_tokens=0) + + with pytest.raises(ValueError): + SyntheticDatasetConfig(prompt_tokens=20, output_tokens=10, samples=0) + + with pytest.raises(ValueError): + SyntheticDatasetConfig(prompt_tokens=20, output_tokens=10, prefix_tokens=-1) + + @pytest.mark.regression + def test_validation_optional_positive_values(self): + """Test that optional parameters reject negative values. + + ### WRITTEN BY AI ### + """ + with pytest.raises(ValueError): + SyntheticDatasetConfig( + prompt_tokens=20, output_tokens=10, prompt_tokens_stdev=-1 + ) + + with pytest.raises(ValueError): + SyntheticDatasetConfig( + prompt_tokens=20, output_tokens=10, prompt_tokens_min=-1 + ) + + with pytest.raises(ValueError): + SyntheticDatasetConfig( + prompt_tokens=20, output_tokens=10, output_tokens_max=0 + ) + + @pytest.mark.regression + def test_parse_json_method_directly(self): + """Test parse_json static method directly. + + ### WRITTEN BY AI ### + """ + json_data = {"prompt_tokens": 100, "output_tokens": 50} + json_str = json.dumps(json_data) + + config = SyntheticDatasetConfig.parse_json(json_str) + + assert config.prompt_tokens == 100 + assert config.output_tokens == 50 + + @pytest.mark.regression + def test_parse_key_value_pairs_method_directly(self): + """Test parse_key_value_pairs static method directly. + + ### WRITTEN BY AI ### + """ + kv_str = "prompt_tokens=75,output_tokens=35" + + config = SyntheticDatasetConfig.parse_key_value_pairs(kv_str) + + assert config.prompt_tokens == 75 + assert config.output_tokens == 35 + + @pytest.mark.regression + def test_parse_config_file_method_directly(self): + """Test parse_config_file static method directly. 
+ + ### WRITTEN BY AI ### + """ + config_data = {"prompt_tokens": 65, "output_tokens": 45} + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml.dump(config_data, f) + config_path = f.name + + try: + config = SyntheticDatasetConfig.parse_config_file(config_path) + assert config.prompt_tokens == 65 + assert config.output_tokens == 45 + finally: + Path(config_path).unlink() + + +class TestSyntheticTextItemsGenerator: + """Test cases for SyntheticTextItemsGenerator class. + + ### WRITTEN BY AI ### + """ + + @pytest.fixture + def mock_tokenizer(self): + """Fixture to provide a mocked tokenizer. + + ### WRITTEN BY AI ### + """ + tokenizer = Mock() + tokenizer.get_vocab.return_value = {f"token_{i}": i for i in range(1000)} + tokenizer.encode.side_effect = lambda text: [1, 2, 3] * (len(text) // 10 + 1) + tokenizer.decode.side_effect = ( + lambda tokens, skip_special_tokens=False: " ".join( + f"token_{t}" for t in tokens[:5] + ) + ) + return tokenizer + + @pytest.fixture + def simple_config(self): + """Fixture for simple configuration. + + ### WRITTEN BY AI ### + """ + return SyntheticDatasetConfig( + prompt_tokens=15, + output_tokens=10, + samples=5, + source="The quick brown fox jumps over the lazy dog.", + ) + + @pytest.fixture + def config_with_prefix(self): + """Fixture for configuration with prefix tokens. + + ### WRITTEN BY AI ### + """ + return SyntheticDatasetConfig( + prefix_tokens=3, + prompt_tokens=15, + output_tokens=10, + samples=5, + source="The quick brown fox jumps over the lazy dog.", + ) + + @pytest.fixture + def complex_config(self): + """Fixture for complex configuration with variance. + + ### WRITTEN BY AI ### + """ + return SyntheticDatasetConfig( + prompt_tokens=20, + prompt_tokens_stdev=5, + prompt_tokens_min=10, + prompt_tokens_max=30, + output_tokens=15, + output_tokens_stdev=3, + output_tokens_min=10, + output_tokens_max=20, + samples=10, + source="The quick brown fox jumps over the lazy dog.", + ) + + @pytest.mark.smoke + @patch("guidellm.dataset.synthetic.EndlessTextCreator") + def test_generator_initialization( + self, mock_text_creator, simple_config, mock_tokenizer + ): + """Test generator initialization. + + ### WRITTEN BY AI ### + """ + generator = SyntheticTextItemsGenerator( + simple_config, mock_tokenizer, random_seed=42 + ) + + assert generator.config == simple_config + assert generator.processor == mock_tokenizer + assert generator.random_seed == 42 + mock_text_creator.assert_called_once_with(data=simple_config.source) + + @pytest.mark.smoke + @patch("guidellm.dataset.synthetic.EndlessTextCreator") + @patch("guidellm.dataset.synthetic.IntegerRangeSampler") + def test_basic_iteration( + self, mock_sampler, mock_text_creator, simple_config, mock_tokenizer + ): + """Test basic iteration functionality. 
+ + ### WRITTEN BY AI ### + """ + # Setup mocks + mock_text_creator_instance = Mock() + mock_text_creator_instance.words = ["word1", "word2", "word3"] * 100 + mock_text_creator_instance.create_text.return_value = "sample text" + mock_text_creator.return_value = mock_text_creator_instance + + # Mock IntegerRangeSampler to return iterators + def mock_sampler_side_effect(*args, **kwargs): + mock_instance = Mock() + mock_instance.__iter__ = Mock(return_value=iter([15, 15, 15, 15, 15])) + return mock_instance + + mock_sampler.side_effect = mock_sampler_side_effect + + generator = SyntheticTextItemsGenerator( + simple_config, mock_tokenizer, random_seed=42 + ) + + items = list(generator) + + # Verify we get the expected number of items + assert len(items) == simple_config.samples + + # Verify each item has the required keys + for item in items: + assert "prompt" in item + assert "prompt_tokens_count" in item + assert "output_tokens_count" in item + assert isinstance(item["prompt"], str) + assert isinstance(item["prompt_tokens_count"], int) + assert isinstance(item["output_tokens_count"], int) + + @pytest.mark.sanity + @patch("guidellm.dataset.synthetic.EndlessTextCreator") + def test_create_prompt_method( + self, mock_text_creator, simple_config, mock_tokenizer + ): + """Test _create_prompt method. + + ### WRITTEN BY AI ### + """ + mock_text_creator_instance = Mock() + mock_text_creator_instance.words = ["word"] * 100 + mock_text_creator_instance.create_text.return_value = "test text" + mock_text_creator.return_value = mock_text_creator_instance + + mock_tokenizer.encode.return_value = [1, 2, 3] + + generator = SyntheticTextItemsGenerator( + simple_config, mock_tokenizer, random_seed=42 + ) + + # Test normal case + result = generator._create_prompt(5, 0, 42) + assert result == [42, 1, 2, 3] + + # Test zero tokens + result = generator._create_prompt(0, 0, 42) + assert result == [] + + # Test without unique prefix + result = generator._create_prompt(3, 0) + assert result == [1, 2, 3] + + @pytest.mark.regression + @patch("guidellm.dataset.synthetic.EndlessTextCreator") + def test_create_prompt_binary_search( + self, mock_text_creator, simple_config, mock_tokenizer + ): + """Test binary search logic in _create_prompt. + + ### WRITTEN BY AI ### + """ + mock_text_creator_instance = Mock() + mock_text_creator_instance.words = ["word"] * 1000 + mock_text_creator_instance.create_text.side_effect = lambda start, length: ( + "text " * max(1, length // 4) + ).strip() + mock_text_creator.return_value = mock_text_creator_instance + + # Mock tokenizer to return different lengths based on input + def mock_encode(text): + return [1] * len(text.split()) + + mock_tokenizer.encode.side_effect = mock_encode + + generator = SyntheticTextItemsGenerator( + simple_config, mock_tokenizer, random_seed=42 + ) + + # Test that binary search finds appropriate length + result = generator._create_prompt(5, 0, 42) + assert len(result) >= 4 # Should include prefix + some tokens + + @pytest.mark.sanity + @patch("guidellm.dataset.synthetic.EndlessTextCreator") + @patch("guidellm.dataset.synthetic.IntegerRangeSampler") + def test_prefix_tokens_integration( + self, mock_sampler, mock_text_creator, config_with_prefix, mock_tokenizer + ): + """Test integration with prefix tokens. 
+ + ### WRITTEN BY AI ### + """ + # Setup mocks + mock_text_creator_instance = Mock() + mock_text_creator_instance.words = ["word"] * 100 + mock_text_creator_instance.create_text.return_value = "sample text" + mock_text_creator.return_value = mock_text_creator_instance + + mock_sampler_instance = Mock() + mock_sampler_instance.__iter__ = Mock(return_value=iter([15, 15, 15, 15, 15])) + mock_sampler.return_value = mock_sampler_instance + + generator = SyntheticTextItemsGenerator( + config_with_prefix, mock_tokenizer, random_seed=42 + ) + + items = list(generator) + + # Verify prompt_tokens_count includes prefix + for item in items: + assert item["prompt_tokens_count"] == config_with_prefix.prefix_tokens + 15 + + @pytest.mark.regression + @patch("guidellm.dataset.synthetic.EndlessTextCreator") + @patch("guidellm.dataset.synthetic.IntegerRangeSampler") + def test_random_seeding_consistency( + self, mock_sampler, mock_text_creator, simple_config, mock_tokenizer + ): + """Test that same seed produces consistent results. + + ### WRITTEN BY AI ### + """ + # Setup mocks + mock_text_creator_instance = Mock() + mock_text_creator_instance.words = ["word"] * 100 + mock_text_creator_instance.create_text.return_value = "sample text" + mock_text_creator.return_value = mock_text_creator_instance + + # Create consistent mock sampler behavior + call_count = 0 + + def mock_sampler_side_effect(*args, **kwargs): + nonlocal call_count + mock_instance = Mock() + # Return same sequence for both prompt and output tokens + if call_count % 2 == 0: # prompt tokens + mock_instance.__iter__ = Mock(return_value=iter([15, 16, 17, 18, 19])) + else: # output tokens + mock_instance.__iter__ = Mock(return_value=iter([10, 11, 12, 13, 14])) + call_count += 1 + return mock_instance + + mock_sampler.side_effect = mock_sampler_side_effect + + # Create two generators with same seed + generator1 = SyntheticTextItemsGenerator( + simple_config, mock_tokenizer, random_seed=42 + ) + generator2 = SyntheticTextItemsGenerator( + simple_config, mock_tokenizer, random_seed=42 + ) + + items1 = list(generator1) + items2 = list(generator2) + + # Results should be identical with same seed + assert len(items1) == len(items2) + for item1, item2 in zip(items1, items2): + assert item1["prompt"] == item2["prompt"] + assert item1["prompt_tokens_count"] == item2["prompt_tokens_count"] + assert item1["output_tokens_count"] == item2["output_tokens_count"] + + @pytest.mark.regression + @patch("guidellm.dataset.synthetic.EndlessTextCreator") + @patch("guidellm.dataset.synthetic.IntegerRangeSampler") + def test_variance_configuration( + self, mock_sampler, mock_text_creator, complex_config, mock_tokenizer + ): + """Test that variance configuration is properly used. 
+ + ### WRITTEN BY AI ### + """ + # Setup mocks + mock_text_creator_instance = Mock() + mock_text_creator_instance.words = ["word"] * 100 + mock_text_creator_instance.create_text.return_value = "sample text" + mock_text_creator.return_value = mock_text_creator_instance + + # Fix tokenizer mock to handle the create_text return properly + mock_tokenizer.encode.side_effect = ( + lambda text: [1, 2, 3] if isinstance(text, str) else [1, 2, 3] + ) + + # Setup mock sampler to track calls + def mock_sampler_side_effect(*args, **kwargs): + mock_instance = Mock() + mock_instance.__iter__ = Mock(return_value=iter([20, 18, 22, 19, 21] * 2)) + return mock_instance + + mock_sampler.side_effect = mock_sampler_side_effect + + generator = SyntheticTextItemsGenerator( + complex_config, mock_tokenizer, random_seed=42 + ) + + # Initialize the generator to trigger sampler creation + generator_iter = iter(generator) + next(generator_iter) + + # Verify that IntegerRangeSampler is called with correct parameters + assert mock_sampler.call_count == 2 + + # Check prompt tokens sampler call + prompt_call = mock_sampler.call_args_list[0] + assert prompt_call[1]["average"] == complex_config.prompt_tokens + assert prompt_call[1]["variance"] == complex_config.prompt_tokens_stdev + assert prompt_call[1]["min_value"] == complex_config.prompt_tokens_min + assert prompt_call[1]["max_value"] == complex_config.prompt_tokens_max + assert prompt_call[1]["random_seed"] == 42 + + # Check output tokens sampler call + output_call = mock_sampler.call_args_list[1] + assert output_call[1]["average"] == complex_config.output_tokens + assert output_call[1]["variance"] == complex_config.output_tokens_stdev + assert output_call[1]["min_value"] == complex_config.output_tokens_min + assert output_call[1]["max_value"] == complex_config.output_tokens_max + assert output_call[1]["random_seed"] == 43 # 42 + 1 + + @pytest.mark.regression + @patch("guidellm.dataset.synthetic.EndlessTextCreator") + def test_unique_prefix_generation( + self, mock_text_creator, simple_config, mock_tokenizer + ): + """Test that unique prefixes are generated for each request. + + ### WRITTEN BY AI ### + """ + mock_text_creator_instance = Mock() + mock_text_creator_instance.words = ["word"] * 100 + mock_text_creator_instance.create_text.return_value = "sample text" + mock_text_creator.return_value = mock_text_creator_instance + + # Mock the cycle to return predictable values + with patch("guidellm.dataset.synthetic.cycle") as mock_cycle: + mock_cycle.return_value = iter([100, 101, 102, 103, 104]) + + generator = SyntheticTextItemsGenerator( + simple_config, mock_tokenizer, random_seed=42 + ) + + # Access the iterator to trigger the cycle creation + generator_iter = iter(generator) + next(generator_iter) + + # Verify cycle was called with vocab values + mock_cycle.assert_called_once() + + +class TestSyntheticDatasetCreator: + """Test cases for SyntheticDatasetCreator class. + + ### WRITTEN BY AI ### + """ + + @pytest.mark.sanity + def test_is_supported_path_config_file(self): + """Test is_supported with config file paths. + + ### WRITTEN BY AI ### + """ + with tempfile.NamedTemporaryFile(suffix=".config", delete=False) as f: + config_path = Path(f.name) + + try: + assert SyntheticDatasetCreator.is_supported(config_path, None) + finally: + config_path.unlink() + + @pytest.mark.sanity + def test_is_supported_path_yaml_file(self): + """Test is_supported with YAML file paths. 
+ + ### WRITTEN BY AI ### + """ + with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f: + yaml_path = Path(f.name) + + try: + assert SyntheticDatasetCreator.is_supported(yaml_path, None) + finally: + yaml_path.unlink() + + @pytest.mark.smoke + def test_is_supported_json_string(self): + """Test is_supported with JSON string. + + ### WRITTEN BY AI ### + """ + json_str = '{"prompt_tokens": 50, "output_tokens": 25}' + assert SyntheticDatasetCreator.is_supported(json_str, None) + + @pytest.mark.smoke + def test_is_supported_key_value_string(self): + """Test is_supported with key-value string. + + ### WRITTEN BY AI ### + """ + kv_str = "prompt_tokens=50,output_tokens=25" + assert SyntheticDatasetCreator.is_supported(kv_str, None) + + @pytest.mark.sanity + def test_is_supported_config_filename_string(self): + """Test is_supported with config filename string. + + ### WRITTEN BY AI ### + """ + assert SyntheticDatasetCreator.is_supported("config.yaml", None) + assert SyntheticDatasetCreator.is_supported("settings.config", None) + + @pytest.mark.sanity + def test_is_not_supported_regular_string(self): + """Test is_supported returns False for regular strings. + + ### WRITTEN BY AI ### + """ + assert not SyntheticDatasetCreator.is_supported("regular string", None) + assert not SyntheticDatasetCreator.is_supported("single=pair", None) + + @pytest.mark.regression + def test_is_not_supported_non_existent_path(self): + """Test is_supported returns False for non-existent paths. + + ### WRITTEN BY AI ### + """ + non_existent_path = Path("/non/existent/path.config") + assert not SyntheticDatasetCreator.is_supported(non_existent_path, None) + + @pytest.mark.regression + def test_is_not_supported_other_types(self): + """Test is_supported returns False for other data types. + + ### WRITTEN BY AI ### + """ + assert not SyntheticDatasetCreator.is_supported(123, None) + assert not SyntheticDatasetCreator.is_supported(["list"], None) + assert not SyntheticDatasetCreator.is_supported({"dict": "value"}, None) + + @pytest.mark.smoke + @patch("guidellm.dataset.synthetic.check_load_processor") + @patch("guidellm.dataset.synthetic.SyntheticTextItemsGenerator") + @patch("guidellm.dataset.synthetic.Dataset") + def test_handle_create_basic( + self, mock_dataset, mock_generator, mock_check_processor + ): + """Test handle_create basic functionality. + + ### WRITTEN BY AI ### + """ + # Setup mocks + mock_processor = Mock() + mock_check_processor.return_value = mock_processor + + mock_generator_instance = Mock() + mock_generator_instance.__iter__ = Mock( + return_value=iter( + [ + { + "prompt": "test", + "prompt_tokens_count": 10, + "output_tokens_count": 5, + } + ] + ) + ) + mock_generator.return_value = mock_generator_instance + + mock_dataset_instance = Mock() + mock_dataset.from_list.return_value = mock_dataset_instance + + # Test + data = '{"prompt_tokens": 50, "output_tokens": 25}' + result = SyntheticDatasetCreator.handle_create( + data=data, + data_args=None, + processor="gpt2", + processor_args=None, + random_seed=42, + ) + + # Verify + mock_check_processor.assert_called_once_with( + "gpt2", + None, + error_msg="Processor/tokenizer required for synthetic dataset generation.", + ) + mock_generator.assert_called_once() + mock_dataset.from_list.assert_called_once() + assert result == mock_dataset_instance + + @pytest.mark.sanity + @patch("guidellm.dataset.synthetic.check_load_processor") + def test_handle_create_processor_required(self, mock_check_processor): + """Test handle_create requires processor. 
+ + ### WRITTEN BY AI ### + """ + mock_check_processor.side_effect = ValueError("Processor required") + + data = '{"prompt_tokens": 50, "output_tokens": 25}' + + with pytest.raises(ValueError, match="Processor required"): + SyntheticDatasetCreator.handle_create( + data=data, + data_args=None, + processor=None, + processor_args=None, + random_seed=42, + ) + + @pytest.mark.regression + @patch("guidellm.dataset.synthetic.check_load_processor") + @patch("guidellm.dataset.synthetic.SyntheticTextItemsGenerator") + @patch("guidellm.dataset.synthetic.Dataset") + def test_handle_create_with_data_args( + self, mock_dataset, mock_generator, mock_check_processor + ): + """Test handle_create with data_args. + + ### WRITTEN BY AI ### + """ + # Setup mocks + mock_processor = Mock() + mock_check_processor.return_value = mock_processor + + mock_generator_instance = Mock() + mock_generator_instance.__iter__ = Mock(return_value=iter([])) + mock_generator.return_value = mock_generator_instance + + mock_dataset_instance = Mock() + mock_dataset.from_list.return_value = mock_dataset_instance + + # Test with data_args + data = '{"prompt_tokens": 50, "output_tokens": 25}' + data_args = {"features": "custom_features"} + + SyntheticDatasetCreator.handle_create( + data=data, + data_args=data_args, + processor="gpt2", + processor_args=None, + random_seed=42, + ) + + # Verify data_args are passed to Dataset.from_list + mock_dataset.from_list.assert_called_once_with([], **data_args) + + @pytest.mark.sanity + def test_extract_args_column_mappings_empty(self): + """Test extract_args_column_mappings with empty data_args. + + ### WRITTEN BY AI ### + """ + result = SyntheticDatasetCreator.extract_args_column_mappings(None) + + expected = { + "prompt_column": "prompt", + "prompt_tokens_count_column": "prompt_tokens_count", + "output_tokens_count_column": "output_tokens_count", + } + assert result == expected + + @pytest.mark.regression + def test_extract_args_column_mappings_with_parent_mappings(self): + """Test extract_args_column_mappings rejects column mappings. + + ### WRITTEN BY AI ### + """ + with ( + patch.object( + SyntheticDatasetCreator.__bases__[0], + "extract_args_column_mappings", + return_value={"prompt_column": "custom_prompt"}, + ), + pytest.raises(ValueError, match="Column mappings are not supported"), + ): + SyntheticDatasetCreator.extract_args_column_mappings({"some": "args"}) + + @pytest.mark.regression + def test_extract_args_column_mappings_no_parent_mappings(self): + """Test extract_args_column_mappings with no parent mappings. 
+
+        ### WRITTEN BY AI ###
+        """
+        with patch.object(
+            SyntheticDatasetCreator.__bases__[0],
+            "extract_args_column_mappings",
+            return_value={},
+        ):
+            result = SyntheticDatasetCreator.extract_args_column_mappings(
+                {"some": "args"}
+            )
+
+        expected = {
+            "prompt_column": "prompt",
+            "prompt_tokens_count_column": "prompt_tokens_count",
+            "output_tokens_count_column": "output_tokens_count",
+        }
+        assert result == expected

From c8a847a9d69048a9e49ac964670c1d1dd708c789 Mon Sep 17 00:00:00 2001
From: Samuel Monson
Date: Tue, 19 Aug 2025 11:45:47 -0400
Subject: [PATCH 6/8] Add advanced shared prefix support

Signed-off-by: Samuel Monson
---
 src/guidellm/dataset/__init__.py  |  2 +
 src/guidellm/dataset/synthetic.py | 63 +++++++++++++++++++++++++++----
 2 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/src/guidellm/dataset/__init__.py b/src/guidellm/dataset/__init__.py
index b90b72ff..009ddf40 100644
--- a/src/guidellm/dataset/__init__.py
+++ b/src/guidellm/dataset/__init__.py
@@ -4,6 +4,7 @@
 from .hf_datasets import HFDatasetsCreator
 from .in_memory import InMemoryDatasetCreator
 from .synthetic import (
+    PrefixBucketConfig,
     SyntheticDatasetConfig,
     SyntheticDatasetCreator,
     SyntheticTextItemsGenerator,
@@ -15,6 +16,7 @@
     "FileDatasetCreator",
     "HFDatasetsCreator",
     "InMemoryDatasetCreator",
+    "PrefixBucketConfig",
     "SyntheticDatasetConfig",
     "SyntheticDatasetCreator",
     "SyntheticTextItemsGenerator",

diff --git a/src/guidellm/dataset/synthetic.py b/src/guidellm/dataset/synthetic.py
index 8c30f0f7..cc8c7d46 100644
--- a/src/guidellm/dataset/synthetic.py
+++ b/src/guidellm/dataset/synthetic.py
@@ -1,6 +1,6 @@
 import json
 import random
-from collections.abc import Iterable, Iterator
+from collections.abc import Iterable, Iterator, Sequence
 from itertools import cycle
 from pathlib import Path
 from typing import Any, Literal, Optional, Union
@@ -19,18 +19,36 @@
 from guidellm.utils import EndlessTextCreator, IntegerRangeSampler, check_load_processor
 
 __all__ = [
+    "PrefixBucketConfig",
     "SyntheticDatasetConfig",
     "SyntheticDatasetCreator",
     "SyntheticTextItemsGenerator",
 ]
 
 
-class SyntheticDatasetConfig(BaseModel):
+class PrefixBucketConfig(BaseModel):
+    bucket_weight: int = Field(
+        description="Weight of this bucket in the overall distribution.",
+        gt=0,
+        default=100,
+    )
+    prefix_count: int = Field(
+        description="The number of unique prefixes to generate for this bucket.",
+        ge=1,
+        default=1,
+    )
     prefix_tokens: int = Field(
-        description="The number of shared prefix tokens to prepend to each prompt.",
+        description="The number of prefix tokens per prompt for this bucket.",
         ge=0,
         default=0,
     )
+
+
+class SyntheticDatasetConfig(BaseModel):
+    prefix_buckets: Optional[list[PrefixBucketConfig]] = Field(
+        description="Buckets for the prefix tokens distribution.",
+        default=None,
+    )
     prompt_tokens: int = Field(
         description="The average number of text tokens generated for prompts.",
         gt=0,
@@ -169,17 +187,16 @@ def __iter__(
         )
         # ensure diff distribution from output tokens
         rand = random.Random(self.random_seed + 2)  # noqa: S311
+        shared_prefix_iter = iter(self._create_prefixes(rand))
         unique_prefix_iter = cycle(self.processor.get_vocab().values())
 
-        prefix_index = rand.randint(0, len(self.text_creator.words))
-        prefix_tokens = self._create_prompt(self.config.prefix_tokens, prefix_index)
-
         for _, prompt_tokens, output_tokens in zip(
             range(self.config.samples),
             prompt_tokens_sampler,
             output_tokens_sampler,
         ):
-            start_index = rand.randint(0, len(self.text_creator.words))
+            start_index = 
self._rand_start_index(rand) + prefix_tokens = next(shared_prefix_iter, []) prompt_text = self.processor.decode( prefix_tokens + self._create_prompt( @@ -189,10 +206,40 @@ def __iter__( ) yield { "prompt": prompt_text, - "prompt_tokens_count": self.config.prefix_tokens + prompt_tokens, + "prompt_tokens_count": len(prefix_tokens) + prompt_tokens, "output_tokens_count": output_tokens, } + def _rand_start_index(self, rand: random.Random) -> int: + """Generate a random start index for text generation.""" + return rand.randint(0, len(self.text_creator.words) - 1) + + def _create_prefixes(self, rand: random.Random) -> Sequence[list[int]]: + """Create an iterator for shared prefix tokens.""" + buckets = self.config.prefix_buckets + + if not buckets: + return [] + + total_weight = sum(bucket.bucket_weight for bucket in buckets) + if total_weight <= 0: + raise ValueError("Total weight of prefix buckets must be greater than 0.") + + prompts = [] + for bucket in buckets: + for _ in range(bucket.prefix_count): + start_index = self._rand_start_index(rand) + prompt_tokens = self._create_prompt(bucket.prefix_tokens, start_index) + sample_percent = ( + bucket.bucket_weight / bucket.prefix_count / total_weight + ) + sample_count = sample_percent * self.config.samples + for _ in range(int(round(sample_count))): + prompts.append(prompt_tokens) + + rand.shuffle(prompts) + return prompts + def _create_prompt( self, prompt_tokens: int, start_index: int, unique_prefix: Optional[int] = None ) -> list[int]: From 692589cb2b67434a71570bee3e0a865a37d7ce71 Mon Sep 17 00:00:00 2001 From: Samuel Monson Date: Tue, 19 Aug 2025 15:48:19 -0400 Subject: [PATCH 7/8] Update tests for new prefix patch and reduce the number of mocks Signed-off-by: Samuel Monson --- tests/unit/dataset/test_synthetic.py | 200 +++++++++------------------ 1 file changed, 66 insertions(+), 134 deletions(-) diff --git a/tests/unit/dataset/test_synthetic.py b/tests/unit/dataset/test_synthetic.py index e3110fa3..b249ab30 100644 --- a/tests/unit/dataset/test_synthetic.py +++ b/tests/unit/dataset/test_synthetic.py @@ -11,6 +11,7 @@ import yaml from guidellm.dataset.synthetic import ( + PrefixBucketConfig, SyntheticDatasetConfig, SyntheticDatasetCreator, SyntheticTextItemsGenerator, @@ -29,8 +30,12 @@ def test_config_creation_with_all_params(self): ### WRITTEN BY AI ### """ + prefix_bucket = PrefixBucketConfig( + bucket_weight=100, prefix_count=1, prefix_tokens=5 + ) + config = SyntheticDatasetConfig( - prefix_tokens=5, + prefix_buckets=[prefix_bucket], prompt_tokens=100, prompt_tokens_stdev=10, prompt_tokens_min=50, @@ -43,7 +48,7 @@ def test_config_creation_with_all_params(self): source="custom_text.txt", ) - assert config.prefix_tokens == 5 + assert config.prefix_buckets[0].prefix_tokens == 5 # type: ignore [index] assert config.prompt_tokens == 100 assert config.prompt_tokens_stdev == 10 assert config.prompt_tokens_min == 50 @@ -67,7 +72,9 @@ def test_parse_json_string(self): "output_tokens": 25, "samples": 200, "source": "test.txt", - "prefix_tokens": 10, + "prefix_buckets": [ + {"bucket_weight": 100, "prefix_count": 1, "prefix_tokens": 10} + ], } ) @@ -77,7 +84,7 @@ def test_parse_json_string(self): assert config.output_tokens == 25 assert config.samples == 200 assert config.source == "test.txt" - assert config.prefix_tokens == 10 + assert config.prefix_buckets[0].prefix_tokens == 10 # type: ignore [index] @pytest.mark.regression def test_parse_key_value_pairs(self): @@ -85,7 +92,7 @@ def test_parse_key_value_pairs(self): ### WRITTEN BY AI ### """ - 
kv_str = "prompt_tokens=80,output_tokens=30,samples=300,source=data.txt,prefix_tokens=5" # noqa: E501 + kv_str = "prompt_tokens=80,output_tokens=30,samples=300,source=data.txt" config = SyntheticDatasetConfig.parse_str(kv_str) @@ -93,7 +100,7 @@ def test_parse_key_value_pairs(self): assert config.output_tokens == 30 assert config.samples == 300 assert config.source == "data.txt" - assert config.prefix_tokens == 5 + assert config.prefix_buckets is None @pytest.mark.sanity def test_parse_yaml_file(self): @@ -106,7 +113,9 @@ def test_parse_yaml_file(self): "output_tokens": 15, "samples": 100, "source": "yaml_test.txt", - "prefix_tokens": 3, + "prefix_buckets": [ + {"bucket_weight": 100, "prefix_count": 1, "prefix_tokens": 3} + ], } with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: @@ -120,7 +129,7 @@ def test_parse_yaml_file(self): assert config.output_tokens == 15 assert config.samples == 100 assert config.source == "yaml_test.txt" - assert config.prefix_tokens == 3 + assert config.prefix_buckets[0].prefix_tokens == 3 # type: ignore [index] finally: Path(yaml_path).unlink() @@ -134,7 +143,9 @@ def test_parse_config_file(self): "prompt_tokens": 90, "output_tokens": 35, "samples": 150, - "prefix_tokens": 2, + "prefix_buckets": [ + {"bucket_weight": 100, "prefix_count": 1, "prefix_tokens": 2} + ], } with tempfile.NamedTemporaryFile(mode="w", suffix=".config", delete=False) as f: @@ -147,7 +158,7 @@ def test_parse_config_file(self): assert config.prompt_tokens == 90 assert config.output_tokens == 35 assert config.samples == 150 - assert config.prefix_tokens == 2 + assert config.prefix_buckets[0].prefix_tokens == 2 # type: ignore [index] finally: Path(config_path).unlink() @@ -194,8 +205,9 @@ def test_validation_positive_values(self): with pytest.raises(ValueError): SyntheticDatasetConfig(prompt_tokens=20, output_tokens=10, samples=0) + # Test negative prefix tokens via PrefixBucketConfig validation with pytest.raises(ValueError): - SyntheticDatasetConfig(prompt_tokens=20, output_tokens=10, prefix_tokens=-1) + PrefixBucketConfig(prefix_tokens=-1) @pytest.mark.regression def test_validation_optional_positive_values(self): @@ -279,7 +291,7 @@ def mock_tokenizer(self): """ tokenizer = Mock() tokenizer.get_vocab.return_value = {f"token_{i}": i for i in range(1000)} - tokenizer.encode.side_effect = lambda text: [1, 2, 3] * (len(text) // 10 + 1) + tokenizer.encode.side_effect = lambda text: list(range(len(text.split()))) tokenizer.decode.side_effect = ( lambda tokens, skip_special_tokens=False: " ".join( f"token_{t}" for t in tokens[:5] @@ -287,6 +299,22 @@ def mock_tokenizer(self): ) return tokenizer + @pytest.fixture + def mock_integer_range_sampler(self): + """Fixture to provide a mocked IntegerRangeSampler. + + ### WRITTEN BY AI ### + """ + with patch("guidellm.dataset.synthetic.IntegerRangeSampler") as mock_sampler: + # Default side effect for basic iteration + def mock_sampler_side_effect(*args, **kwargs): + mock_instance = Mock() + mock_instance.__iter__ = Mock(return_value=iter([15, 15, 15, 15, 15])) + return mock_instance + + mock_sampler.side_effect = mock_sampler_side_effect + yield mock_sampler + @pytest.fixture def simple_config(self): """Fixture for simple configuration. 
@@ -306,8 +334,12 @@ def config_with_prefix(self): ### WRITTEN BY AI ### """ + prefix_bucket = PrefixBucketConfig( + bucket_weight=100, prefix_count=1, prefix_tokens=3 + ) + return SyntheticDatasetConfig( - prefix_tokens=3, + prefix_buckets=[prefix_bucket], prompt_tokens=15, output_tokens=10, samples=5, @@ -352,29 +384,16 @@ def test_generator_initialization( mock_text_creator.assert_called_once_with(data=simple_config.source) @pytest.mark.smoke - @patch("guidellm.dataset.synthetic.EndlessTextCreator") - @patch("guidellm.dataset.synthetic.IntegerRangeSampler") def test_basic_iteration( - self, mock_sampler, mock_text_creator, simple_config, mock_tokenizer + self, + mock_integer_range_sampler, + simple_config, + mock_tokenizer, ): """Test basic iteration functionality. ### WRITTEN BY AI ### """ - # Setup mocks - mock_text_creator_instance = Mock() - mock_text_creator_instance.words = ["word1", "word2", "word3"] * 100 - mock_text_creator_instance.create_text.return_value = "sample text" - mock_text_creator.return_value = mock_text_creator_instance - - # Mock IntegerRangeSampler to return iterators - def mock_sampler_side_effect(*args, **kwargs): - mock_instance = Mock() - mock_instance.__iter__ = Mock(return_value=iter([15, 15, 15, 15, 15])) - return mock_instance - - mock_sampler.side_effect = mock_sampler_side_effect - generator = SyntheticTextItemsGenerator( simple_config, mock_tokenizer, random_seed=42 ) @@ -394,28 +413,19 @@ def mock_sampler_side_effect(*args, **kwargs): assert isinstance(item["output_tokens_count"], int) @pytest.mark.sanity - @patch("guidellm.dataset.synthetic.EndlessTextCreator") - def test_create_prompt_method( - self, mock_text_creator, simple_config, mock_tokenizer - ): + def test_create_prompt_method(self, simple_config, mock_tokenizer): """Test _create_prompt method. ### WRITTEN BY AI ### """ - mock_text_creator_instance = Mock() - mock_text_creator_instance.words = ["word"] * 100 - mock_text_creator_instance.create_text.return_value = "test text" - mock_text_creator.return_value = mock_text_creator_instance - - mock_tokenizer.encode.return_value = [1, 2, 3] - generator = SyntheticTextItemsGenerator( simple_config, mock_tokenizer, random_seed=42 ) # Test normal case result = generator._create_prompt(5, 0, 42) - assert result == [42, 1, 2, 3] + assert result[0] == 42 # Unique prefix token + assert len(result) == 5 # Test zero tokens result = generator._create_prompt(0, 0, 42) @@ -423,30 +433,14 @@ def test_create_prompt_method( # Test without unique prefix result = generator._create_prompt(3, 0) - assert result == [1, 2, 3] + assert len(result) == 3 @pytest.mark.regression - @patch("guidellm.dataset.synthetic.EndlessTextCreator") - def test_create_prompt_binary_search( - self, mock_text_creator, simple_config, mock_tokenizer - ): + def test_create_prompt_binary_search(self, simple_config, mock_tokenizer): """Test binary search logic in _create_prompt. 
### WRITTEN BY AI ### """ - mock_text_creator_instance = Mock() - mock_text_creator_instance.words = ["word"] * 1000 - mock_text_creator_instance.create_text.side_effect = lambda start, length: ( - "text " * max(1, length // 4) - ).strip() - mock_text_creator.return_value = mock_text_creator_instance - - # Mock tokenizer to return different lengths based on input - def mock_encode(text): - return [1] * len(text.split()) - - mock_tokenizer.encode.side_effect = mock_encode - generator = SyntheticTextItemsGenerator( simple_config, mock_tokenizer, random_seed=42 ) @@ -456,25 +450,13 @@ def mock_encode(text): assert len(result) >= 4 # Should include prefix + some tokens @pytest.mark.sanity - @patch("guidellm.dataset.synthetic.EndlessTextCreator") - @patch("guidellm.dataset.synthetic.IntegerRangeSampler") def test_prefix_tokens_integration( - self, mock_sampler, mock_text_creator, config_with_prefix, mock_tokenizer + self, mock_integer_range_sampler, config_with_prefix, mock_tokenizer ): """Test integration with prefix tokens. ### WRITTEN BY AI ### """ - # Setup mocks - mock_text_creator_instance = Mock() - mock_text_creator_instance.words = ["word"] * 100 - mock_text_creator_instance.create_text.return_value = "sample text" - mock_text_creator.return_value = mock_text_creator_instance - - mock_sampler_instance = Mock() - mock_sampler_instance.__iter__ = Mock(return_value=iter([15, 15, 15, 15, 15])) - mock_sampler.return_value = mock_sampler_instance - generator = SyntheticTextItemsGenerator( config_with_prefix, mock_tokenizer, random_seed=42 ) @@ -483,40 +465,19 @@ def test_prefix_tokens_integration( # Verify prompt_tokens_count includes prefix for item in items: - assert item["prompt_tokens_count"] == config_with_prefix.prefix_tokens + 15 + assert ( + item["prompt_tokens_count"] + == config_with_prefix.prefix_buckets[0].prefix_tokens + 15 + ) @pytest.mark.regression - @patch("guidellm.dataset.synthetic.EndlessTextCreator") - @patch("guidellm.dataset.synthetic.IntegerRangeSampler") def test_random_seeding_consistency( - self, mock_sampler, mock_text_creator, simple_config, mock_tokenizer + self, mock_integer_range_sampler, simple_config, mock_tokenizer ): """Test that same seed produces consistent results. 
### WRITTEN BY AI ### """ - # Setup mocks - mock_text_creator_instance = Mock() - mock_text_creator_instance.words = ["word"] * 100 - mock_text_creator_instance.create_text.return_value = "sample text" - mock_text_creator.return_value = mock_text_creator_instance - - # Create consistent mock sampler behavior - call_count = 0 - - def mock_sampler_side_effect(*args, **kwargs): - nonlocal call_count - mock_instance = Mock() - # Return same sequence for both prompt and output tokens - if call_count % 2 == 0: # prompt tokens - mock_instance.__iter__ = Mock(return_value=iter([15, 16, 17, 18, 19])) - else: # output tokens - mock_instance.__iter__ = Mock(return_value=iter([10, 11, 12, 13, 14])) - call_count += 1 - return mock_instance - - mock_sampler.side_effect = mock_sampler_side_effect - # Create two generators with same seed generator1 = SyntheticTextItemsGenerator( simple_config, mock_tokenizer, random_seed=42 @@ -528,7 +489,7 @@ def mock_sampler_side_effect(*args, **kwargs): items1 = list(generator1) items2 = list(generator2) - # Results should be identical with same seed + # With same seed and deterministic mocks, results should be identical assert len(items1) == len(items2) for item1, item2 in zip(items1, items2): assert item1["prompt"] == item2["prompt"] @@ -536,34 +497,13 @@ def mock_sampler_side_effect(*args, **kwargs): assert item1["output_tokens_count"] == item2["output_tokens_count"] @pytest.mark.regression - @patch("guidellm.dataset.synthetic.EndlessTextCreator") - @patch("guidellm.dataset.synthetic.IntegerRangeSampler") def test_variance_configuration( - self, mock_sampler, mock_text_creator, complex_config, mock_tokenizer + self, mock_integer_range_sampler, complex_config, mock_tokenizer ): """Test that variance configuration is properly used. 
### WRITTEN BY AI ### """ - # Setup mocks - mock_text_creator_instance = Mock() - mock_text_creator_instance.words = ["word"] * 100 - mock_text_creator_instance.create_text.return_value = "sample text" - mock_text_creator.return_value = mock_text_creator_instance - - # Fix tokenizer mock to handle the create_text return properly - mock_tokenizer.encode.side_effect = ( - lambda text: [1, 2, 3] if isinstance(text, str) else [1, 2, 3] - ) - - # Setup mock sampler to track calls - def mock_sampler_side_effect(*args, **kwargs): - mock_instance = Mock() - mock_instance.__iter__ = Mock(return_value=iter([20, 18, 22, 19, 21] * 2)) - return mock_instance - - mock_sampler.side_effect = mock_sampler_side_effect - generator = SyntheticTextItemsGenerator( complex_config, mock_tokenizer, random_seed=42 ) @@ -573,10 +513,10 @@ def mock_sampler_side_effect(*args, **kwargs): next(generator_iter) # Verify that IntegerRangeSampler is called with correct parameters - assert mock_sampler.call_count == 2 + assert mock_integer_range_sampler.call_count == 2 # Check prompt tokens sampler call - prompt_call = mock_sampler.call_args_list[0] + prompt_call = mock_integer_range_sampler.call_args_list[0] assert prompt_call[1]["average"] == complex_config.prompt_tokens assert prompt_call[1]["variance"] == complex_config.prompt_tokens_stdev assert prompt_call[1]["min_value"] == complex_config.prompt_tokens_min @@ -584,7 +524,7 @@ def mock_sampler_side_effect(*args, **kwargs): assert prompt_call[1]["random_seed"] == 42 # Check output tokens sampler call - output_call = mock_sampler.call_args_list[1] + output_call = mock_integer_range_sampler.call_args_list[1] assert output_call[1]["average"] == complex_config.output_tokens assert output_call[1]["variance"] == complex_config.output_tokens_stdev assert output_call[1]["min_value"] == complex_config.output_tokens_min @@ -592,19 +532,11 @@ def mock_sampler_side_effect(*args, **kwargs): assert output_call[1]["random_seed"] == 43 # 42 + 1 @pytest.mark.regression - @patch("guidellm.dataset.synthetic.EndlessTextCreator") - def test_unique_prefix_generation( - self, mock_text_creator, simple_config, mock_tokenizer - ): + def test_unique_prefix_generation(self, simple_config, mock_tokenizer): """Test that unique prefixes are generated for each request. ### WRITTEN BY AI ### """ - mock_text_creator_instance = Mock() - mock_text_creator_instance.words = ["word"] * 100 - mock_text_creator_instance.create_text.return_value = "sample text" - mock_text_creator.return_value = mock_text_creator_instance - # Mock the cycle to return predictable values with patch("guidellm.dataset.synthetic.cycle") as mock_cycle: mock_cycle.return_value = iter([100, 101, 102, 103, 104]) From 558cc78bbd4688bd3c6eb922fe0881e7c4d2c0dc Mon Sep 17 00:00:00 2001 From: Samuel Monson Date: Tue, 19 Aug 2025 17:28:48 -0400 Subject: [PATCH 8/8] Add more prefix bucket testcases Signed-off-by: Samuel Monson --- tests/unit/dataset/test_synthetic.py | 220 ++++++++++++++++++++++++++- 1 file changed, 218 insertions(+), 2 deletions(-) diff --git a/tests/unit/dataset/test_synthetic.py b/tests/unit/dataset/test_synthetic.py index b249ab30..080fcbfb 100644 --- a/tests/unit/dataset/test_synthetic.py +++ b/tests/unit/dataset/test_synthetic.py @@ -18,6 +18,76 @@ ) +class TestPrefixBucketConfig: + """Test cases for PrefixBucketConfig class. + + ### WRITTEN BY AI ### + """ + + @pytest.mark.smoke + def test_creation_with_valid_params(self): + """Test creating PrefixBucketConfig with valid parameters. 
+ + ### WRITTEN BY AI ### + """ + config = PrefixBucketConfig(bucket_weight=100, prefix_count=1, prefix_tokens=5) + + assert config.bucket_weight == 100 + assert config.prefix_count == 1 + assert config.prefix_tokens == 5 + + @pytest.mark.sanity + def test_creation_with_negative_values(self): + """Test creating PrefixBucketConfig with negative values raises ValueError. + + ### WRITTEN BY AI ### + """ + with pytest.raises(ValueError): + PrefixBucketConfig(bucket_weight=-10, prefix_count=1, prefix_tokens=5) + + with pytest.raises(ValueError): + PrefixBucketConfig(bucket_weight=100, prefix_count=-1, prefix_tokens=5) + + with pytest.raises(ValueError): + PrefixBucketConfig(bucket_weight=100, prefix_count=1, prefix_tokens=-5) + + @pytest.mark.regression + def test_prefix_bucket_zero_weight_error(self): + """Test that zero total weight raises an error. + + ### WRITTEN BY AI ### + """ + # Test validation error for creating PrefixBucketConfig with weight=0 + with pytest.raises(ValueError): + PrefixBucketConfig(bucket_weight=0, prefix_count=1, prefix_tokens=2) + + @pytest.mark.sanity + def test_prefix_bucket_config_validation(self): + """Test PrefixBucketConfig validation. + + ### WRITTEN BY AI ### + """ + # Test valid config + valid_config = PrefixBucketConfig( + bucket_weight=50, prefix_count=2, prefix_tokens=3 + ) + assert valid_config.bucket_weight == 50 + assert valid_config.prefix_count == 2 + assert valid_config.prefix_tokens == 3 + + # Test invalid bucket_weight + with pytest.raises(ValueError): + PrefixBucketConfig(bucket_weight=0, prefix_count=1, prefix_tokens=2) + + # Test invalid prefix_count + with pytest.raises(ValueError): + PrefixBucketConfig(bucket_weight=100, prefix_count=0, prefix_tokens=2) + + # Test invalid prefix_tokens + with pytest.raises(ValueError): + PrefixBucketConfig(bucket_weight=100, prefix_count=1, prefix_tokens=-1) + + class TestSyntheticDatasetConfig: """Test cases for SyntheticDatasetConfig class. @@ -306,10 +376,11 @@ def mock_integer_range_sampler(self): ### WRITTEN BY AI ### """ with patch("guidellm.dataset.synthetic.IntegerRangeSampler") as mock_sampler: - # Default side effect for basic iteration + # Side effect for basic iteration with enough values for larger tests def mock_sampler_side_effect(*args, **kwargs): mock_instance = Mock() - mock_instance.__iter__ = Mock(return_value=iter([15, 15, 15, 15, 15])) + # Provide enough values for tests (up to 20 items) + mock_instance.__iter__ = Mock(return_value=iter([15] * 20)) return mock_instance mock_sampler.side_effect = mock_sampler_side_effect @@ -346,6 +417,45 @@ def config_with_prefix(self): source="The quick brown fox jumps over the lazy dog.", ) + @pytest.fixture + def config_with_multiple_prefix_buckets(self): + """Fixture for configuration with multiple prefix buckets. + + ### WRITTEN BY AI ### + """ + prefix_bucket1 = PrefixBucketConfig( + bucket_weight=60, prefix_count=1, prefix_tokens=2 + ) + prefix_bucket2 = PrefixBucketConfig( + bucket_weight=40, prefix_count=1, prefix_tokens=4 + ) + + return SyntheticDatasetConfig( + prefix_buckets=[prefix_bucket1, prefix_bucket2], + prompt_tokens=10, + output_tokens=5, + samples=10, + source="The quick brown fox jumps over the lazy dog.", + ) + + @pytest.fixture + def config_with_multiple_prefix_counts(self): + """Fixture for configuration with prefix_count > 1. 
+ + ### WRITTEN BY AI ### + """ + prefix_bucket = PrefixBucketConfig( + bucket_weight=100, prefix_count=3, prefix_tokens=2 + ) + + return SyntheticDatasetConfig( + prefix_buckets=[prefix_bucket], + prompt_tokens=8, + output_tokens=4, + samples=6, + source="The quick brown fox jumps over the lazy dog.", + ) + @pytest.fixture def complex_config(self): """Fixture for complex configuration with variance. @@ -552,6 +662,112 @@ def test_unique_prefix_generation(self, simple_config, mock_tokenizer): # Verify cycle was called with vocab values mock_cycle.assert_called_once() + @pytest.mark.regression + def test_multiple_prefix_buckets_distribution( + self, + mock_integer_range_sampler, + config_with_multiple_prefix_buckets, + mock_tokenizer, + ): + """Test distribution across multiple prefix buckets with different weights. + + ### WRITTEN BY AI ### + """ + generator = SyntheticTextItemsGenerator( + config_with_multiple_prefix_buckets, mock_tokenizer, random_seed=42 + ) + + items = list(generator) + + # Verify we get the expected number of items + assert len(items) == config_with_multiple_prefix_buckets.samples + + # Verify that prefix tokens are added to prompt_tokens_count + # Since we have buckets with 2 and 4 prefix tokens, and the mock returns 15 + # prompt tokens, we should see prompt_tokens_count of either 17 or 19 + prefix_counts = [item["prompt_tokens_count"] for item in items] + assert all(count in [17, 19] for count in prefix_counts) + + # Calculate expected distribution based on weights + # Bucket 1: weight=60, prefix_count=1, prefix_tokens=2 + # Bucket 2: weight=40, prefix_count=1, prefix_tokens=4 + # Total weight = 100, samples = 10 + # Bucket 1: (60/1/100) * 10 = 6 samples with 17 tokens (2 prefix + 15 prompt) + # Bucket 2: (40/1/100) * 10 = 4 samples with 19 tokens (4 prefix + 15 prompt) + count_17 = prefix_counts.count(17) # 2 prefix tokens + count_19 = prefix_counts.count(19) # 4 prefix tokens + assert count_17 == 6 + assert count_19 == 4 + + @pytest.mark.regression + def test_multiple_prefix_counts( + self, + mock_integer_range_sampler, + config_with_multiple_prefix_counts, + mock_tokenizer, + ): + """Test prefix buckets with prefix_count > 1. + + ### WRITTEN BY AI ### + """ + generator = SyntheticTextItemsGenerator( + config_with_multiple_prefix_counts, mock_tokenizer, random_seed=42 + ) + + items = list(generator) + + # Verify we get the expected number of items + assert len(items) == config_with_multiple_prefix_counts.samples + + # All items should have 2 prefix tokens + 15 prompt tokens = 17 total + for item in items: + assert item["prompt_tokens_count"] == 17 + + @pytest.mark.sanity + def test_prefix_buckets_create_prefixes_method( + self, config_with_multiple_prefix_buckets, mock_tokenizer + ): + """Test the _create_prefixes method directly. 
+ + ### WRITTEN BY AI ### + """ + generator = SyntheticTextItemsGenerator( + config_with_multiple_prefix_buckets, mock_tokenizer, random_seed=42 + ) + + # Test _create_prefixes method + rand = Mock() + rand.randint = Mock(return_value=0) + prefixes = generator._create_prefixes(rand) + + # Should return a sequence of prefix token lists + assert isinstance(prefixes, list) + assert len(prefixes) == 10 + + # Each prefix should be a list of integers + for prefix in prefixes: + assert isinstance(prefix, list) + assert all(isinstance(token, int) for token in prefix) + + @pytest.mark.regression + def test_empty_prefix_buckets( + self, mock_integer_range_sampler, simple_config, mock_tokenizer + ): + """Test behavior when prefix_buckets is None or empty. + + ### WRITTEN BY AI ### + """ + # Test with None prefix_buckets (simple_config has None) + generator = SyntheticTextItemsGenerator( + simple_config, mock_tokenizer, random_seed=42 + ) + + items = list(generator) + + # All items should have exactly the prompt tokens (no prefix) + for item in items: + assert item["prompt_tokens_count"] == 15 # Mock returns 15 + class TestSyntheticDatasetCreator: """Test cases for SyntheticDatasetCreator class.
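Stepping back, the bucket weighting exercised by these tests reduces to a few lines of arithmetic. A standalone sketch of the `_create_prefixes` math from PATCH 6, using plain dicts in place of `PrefixBucketConfig` and mirroring the 60/40 two-bucket fixture asserted above:

```python
# Standalone sketch of the _create_prefixes weighting (PATCH 6); plain dicts
# stand in for PrefixBucketConfig and values mirror the 60/40 test fixture.
buckets = [
    {"bucket_weight": 60, "prefix_count": 1, "prefix_tokens": 2},
    {"bucket_weight": 40, "prefix_count": 1, "prefix_tokens": 4},
]
samples = 10
total_weight = sum(b["bucket_weight"] for b in buckets)

counts = []
for bucket in buckets:
    for _ in range(bucket["prefix_count"]):
        # Each unique prefix gets weight / prefix_count / total_weight of the
        # sample budget, rounded to the nearest whole request.
        share = bucket["bucket_weight"] / bucket["prefix_count"] / total_weight
        counts.append(round(share * samples))

print(counts)  # [6, 4]: six requests share the 2-token prefix, four the 4-token one
```

This matches `test_multiple_prefix_buckets_distribution`, which expects six prompts at 17 tokens (2 prefix + 15 prompt) and four at 19 tokens (4 prefix + 15 prompt).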