diff --git a/docs/preprocess.md b/docs/preprocess.md new file mode 100644 index 00000000..062bfc84 --- /dev/null +++ b/docs/preprocess.md @@ -0,0 +1,132 @@ +# Preprocess Commands + +GuideLLM provides preprocessing capabilities to transform and prepare data for benchmarking workflows. The preprocess module includes tools for creating datasets from existing benchmark results, enabling "apples-to-apples" comparisons and reusable benchmark datasets. + +## Overview + +The `guidellm preprocess` command provides utilities to: + +- **Extract datasets from benchmark results**: Convert completed benchmark reports into reusable datasets with known prompt and output token counts for consistent comparisons + +## Commands + +### `dataset-from-file` + +Extracts prompts and their corresponding output token counts from saved benchmark report files to create datasets for future benchmarking runs. + +#### Purpose + +When you run a benchmark with GuideLLM, you get detailed results about how a model performed with specific prompts. The `dataset-from-file` command allows you to extract those successful prompt-response pairs and convert them into a standardized dataset format. This enables: + +1. **Consistent Comparisons**: Use the exact same prompts across different models or configurations +2. **Known Expectations**: Each prompt comes with its expected output token count +3. **Reproducible Benchmarks**: Eliminate variability from different prompts when comparing models + +#### Syntax + +```bash +guidellm preprocess dataset-from-file [OPTIONS] BENCHMARK_FILE +``` + +#### Arguments + +- `BENCHMARK_FILE`: Path to the saved benchmark report file (JSON format) + +#### Options + +- `-o, --output-path PATH`: Output dataset file path (default: `dataset_from_benchmark.json`) +- `--show-stats`: Show dataset statistics after creation +- `--disable-console-outputs`: Disable console output for silent operation +- `--help`: Show help message and exit + +#### Example Usage + +##### Basic Usage + +```bash +# Convert a benchmark report to a dataset +guidellm preprocess dataset-from-file benchmark-results.json + +# Specify custom output path +guidellm preprocess dataset-from-file benchmark-results.json -o my_dataset.json + +# Show statistics about the created dataset +guidellm preprocess dataset-from-file benchmark-results.json --show-stats +``` + +#### Input File Requirements + +The input benchmark file must be a valid GuideLLM benchmark report containing: + +- **Valid JSON format**: The file must be properly formatted +- **Benchmark report structure**: Must contain the expected benchmark report schema +- **Successful requests**: Must contain at least one successful request to extract data from + +##### Supported Input Formats + +```json +{ + "benchmarks": [ + { + "requests": { + "successful": [ + { + "prompt": "What is the capital of France?", + "output_tokens": 5, + "... other request fields ..." 
+ } + ], + "errored": [], + "incomplete": [] + } + } + ] +} +``` + +#### Output Format + +The generated dataset follows this structure: + +```json +{ + "version": "1.0", + "description": "Dataset created from benchmark results for apples-to-apples comparisons", + "data": [ + { + "prompt": "What is the capital of France?", + "output_tokens_count": 5, + "prompt_tokens_count": 12 + }, + { + "prompt": "Explain quantum computing in simple terms.", + "output_tokens_count": 45, + "prompt_tokens_count": 8 + } + ] +} +``` + +Each data item contains: + +- `prompt`: The original prompt text +- `output_tokens_count`: The number of tokens in the model's response +- `prompt_tokens_count`: The number of tokens in the original prompt + +#### Statistics Output + +When using `--show-stats`, you'll see detailed information about the created dataset: + +``` +Dataset Statistics: +================== +Total items: 95 +Prompt length statistics: + Min: 8 characters + Max: 245 characters + Mean: 87.3 characters +Output tokens statistics: + Min: 1 tokens + Max: 512 tokens + Mean: 124.8 tokens +``` diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index de789ad2..2f75acca 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -15,6 +15,10 @@ from guidellm.benchmark.scenario import GenerativeTextScenario, get_builtin_scenarios from guidellm.config import print_config from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset +from guidellm.preprocess.dataset_from_file import ( + DatasetCreationError, + create_dataset_from_file, +) from guidellm.scheduler import StrategyType from guidellm.utils import DefaultGroupHandler from guidellm.utils import cli as cli_tools @@ -493,6 +497,12 @@ def dataset( hub_dataset_id, random_seed, ): + """ + Convert a dataset to have specific prompt and output token counts. + + This creates a filtered and processed dataset where prompts and outputs + match specified token counts, useful for consistent benchmarking. + """ process_dataset( data=data, output_path=output_path, @@ -510,5 +520,58 @@ def dataset( ) +@preprocess.command( + "dataset-from-file", help="Create a dataset from a saved benchmark report file." +) +@click.argument( + "benchmark_file", + type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), +) +@click.option( + "-o", + "--output-path", + type=click.Path(file_okay=True, dir_okay=False, path_type=Path), + default=Path("dataset_from_benchmark.json"), + help="Output dataset file path.", +) +@click.option( + "--show-stats", + is_flag=True, + help="Show dataset statistics after creation.", +) +@click.option( + "--disable-console-outputs", + is_flag=True, + help="Set this flag to disable console output.", +) +def dataset_from_file( + benchmark_file, + output_path, + show_stats, + disable_console_outputs, +): + """ + Create a dataset from a saved benchmark report file. + + This extracts prompts and their corresponding output token counts from + benchmark results to create an 'apples-to-apples' comparison dataset. + + BENCHMARK_FILE: Path to the benchmark results JSON file. 
+ """ + try: + create_dataset_from_file( + benchmark_file=benchmark_file, + output_path=Path(output_path), + show_stats=show_stats, + enable_console=not disable_console_outputs, + ) + except DatasetCreationError as e: + # To print clean error message without a traceback + if not disable_console_outputs: + click.echo(f"Error: {e}", err=True) + ctx = click.get_current_context() + ctx.exit(1) + + if __name__ == "__main__": cli() diff --git a/src/guidellm/preprocess/__init__.py b/src/guidellm/preprocess/__init__.py index 95d01e5f..83aeb207 100644 --- a/src/guidellm/preprocess/__init__.py +++ b/src/guidellm/preprocess/__init__.py @@ -1,3 +1,9 @@ from .dataset import ShortPromptStrategy, process_dataset +from .dataset_from_file import DatasetCreationError, create_dataset_from_file -__all__ = ["ShortPromptStrategy", "process_dataset"] +__all__ = [ + "DatasetCreationError", + "ShortPromptStrategy", + "create_dataset_from_file", + "process_dataset", +] diff --git a/src/guidellm/preprocess/dataset_from_file.py b/src/guidellm/preprocess/dataset_from_file.py new file mode 100644 index 00000000..5a1202d6 --- /dev/null +++ b/src/guidellm/preprocess/dataset_from_file.py @@ -0,0 +1,219 @@ +""" +Module for creating datasets from saved benchmark report files. + +This module provides functionality to extract prompts and their corresponding +output token counts from benchmark results to create datasets for future +'apples-to-apples' comparisons. +""" + +import json +from pathlib import Path +from typing import Any + +from rich.console import Console + +from guidellm.benchmark.output import GenerativeBenchmarksReport + +__all__ = [ + "DatasetCreationError", + "create_dataset_from_file", + "extract_dataset_from_benchmark_report", + "print_dataset_statistics", + "save_dataset_from_benchmark", + "validate_benchmark_file", +] + + +class DatasetCreationError(Exception): + """Exception raised when dataset creation fails.""" + + +def validate_benchmark_file(filepath: Path) -> GenerativeBenchmarksReport: + """ + Validate that the file is a proper GuideLLM benchmark report. + + Args: + filepath: Path to the benchmark report file + + Returns: + GenerativeBenchmarksReport: The validated and loaded report + + Raises: + DatasetCreationError: If file validation fails + """ + try: + report = GenerativeBenchmarksReport.load_file(filepath) + if not report.benchmarks: + raise DatasetCreationError("Benchmark report contains no benchmark data") + return report + except Exception as e: + error_msg = f"Invalid benchmark report file: {e}" + raise DatasetCreationError(error_msg) from e + + +def extract_dataset_from_benchmark_report( + report: GenerativeBenchmarksReport, +) -> list[dict[str, Any]]: + """ + Extract prompts and output tokens from a validated benchmark report. 
+ + Args: + report: A validated GenerativeBenchmarksReport instance + + Returns: + List of dataset items with prompt and token information + """ + dataset_items = [] + + for benchmark in report.benchmarks: + # Access the StatusBreakdown properties directly + requests_breakdown = benchmark.requests + + # Get successful requests (these are the ones we want) + successful_requests = requests_breakdown.successful + + for request in successful_requests: + # Extract the needed data - these are Request objects + prompt = request.prompt + output_tokens = request.output_tokens + prompt_tokens = request.prompt_tokens + + # Only include items with valid data + if prompt and output_tokens > 0: + dataset_items.append( + { + "prompt": prompt, + "output_tokens": output_tokens, + "prompt_tokens": prompt_tokens, + } + ) + + return dataset_items + + +def save_dataset_from_benchmark( + dataset_items: list[dict[str, Any]], output_file: Path +) -> None: + """Save the dataset to a JSON file.""" + # Convert to the format expected by guidellm documentation + formatted_items = [] + for item in dataset_items: + formatted_items.append( + { + "prompt": item["prompt"], + "output_tokens_count": item["output_tokens"], + "prompt_tokens_count": item["prompt_tokens"], + } + ) + + dataset_data = { + "version": "1.0", + "description": ( + "Dataset created from benchmark results for apples-to-apples comparisons" + ), + "data": formatted_items, + } + + with output_file.open("w") as f: + json.dump(dataset_data, f, indent=2) + + +def print_dataset_statistics( + dataset_items: list[dict[str, Any]], enable_console: bool = True +) -> None: + """Print statistics about the dataset.""" + if not enable_console: + return + + console = Console() + console_err = Console(stderr=True) + + if not dataset_items: + console_err.print("No valid items found in dataset") + return + + total_items = len(dataset_items) + prompt_tokens = [item["prompt_tokens"] for item in dataset_items] + output_tokens = [item["output_tokens"] for item in dataset_items] + + console.print("\nDataset Statistics:") + console.print(f"Total items: {total_items}") + console.print( + f"Prompt tokens - Min: {min(prompt_tokens)}, " + f"Max: {max(prompt_tokens)}, " + f"Mean: {sum(prompt_tokens) / len(prompt_tokens):.1f}" + ) + console.print( + f"Output tokens - Min: {min(output_tokens)}, " + f"Max: {max(output_tokens)}, " + f"Mean: {sum(output_tokens) / len(output_tokens):.1f}" + ) + + +def create_dataset_from_file( + benchmark_file: Path, + output_path: Path, + show_stats: bool = False, + enable_console: bool = True, +) -> None: + """ + Create a dataset from a saved benchmark report file. + + This function validates the benchmark file format, loads it using the same + validation as the 'from-file' command, then extracts prompts and their + corresponding output token counts from successful requests. 
+ + Args: + benchmark_file: Path to the benchmark results JSON/YAML file + output_path: Path where the dataset should be saved + show_stats: Whether to display dataset statistics + enable_console: Whether to enable console output + + Raises: + DatasetCreationError: If validation fails or no valid requests found + """ + console = Console() + console_err = Console(stderr=True) + + if enable_console: + console.print(f"Validating benchmark report file: {benchmark_file}") + + try: + report = validate_benchmark_file(benchmark_file) + + if enable_console: + console.print( + f"Valid benchmark report with {len(report.benchmarks)} benchmark(s)" + ) + console.print("Loading and extracting dataset from benchmark results...") + + dataset_items = extract_dataset_from_benchmark_report(report) + + if not dataset_items: + error_msg = ( + "No valid requests with prompts and output tokens " + "found in benchmark report" + ) + if enable_console: + console_err.print(f"Error: {error_msg}") + raise DatasetCreationError(error_msg) + + save_dataset_from_benchmark(dataset_items, output_path) + + if enable_console: + console.print(f"Dataset saved to: {output_path}") + console.print(f"Success, Created dataset with {len(dataset_items)} items") + console.print( + f"You can now use this dataset for future guidellm runs " + f"by specifying: --data {output_path}" + ) + + if show_stats: + print_dataset_statistics(dataset_items, enable_console) + + except DatasetCreationError: + raise + except Exception as e: + if enable_console: + console_err.print(f"Unexpected error: {e}") + raise DatasetCreationError(f"Failed to process benchmark file: {e}") from e diff --git a/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py new file mode 100644 index 00000000..12a668e6 --- /dev/null +++ b/tests/unit/entrypoints/test_dataset_from_file_entrypoint.py @@ -0,0 +1,327 @@ +import json +import tempfile +import unittest +from pathlib import Path +from typing import Any + +import pytest + +from guidellm.preprocess.dataset_from_file import ( + DatasetCreationError, + create_dataset_from_file, + extract_dataset_from_benchmark_report, + print_dataset_statistics, + save_dataset_from_benchmark, + validate_benchmark_file, +) + +REGENERATE_ARTIFACTS = False + + +@pytest.fixture +def get_test_asset_dir(): + def _() -> Path: + return Path(__file__).parent / "assets" + + return _ + + +@pytest.fixture +def cleanup(): + to_delete: list[Path] = [] + yield to_delete + for item in to_delete: + if item.exists(): + item.unlink() # Deletes the file + + +@pytest.fixture +def temp_file(): + """Create a temporary file that gets cleaned up automatically.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + temp_path = Path(f.name) + yield temp_path + if temp_path.exists(): + temp_path.unlink() + + +def test_create_dataset_from_valid_benchmark_json(get_test_asset_dir, cleanup): + """Test creating dataset from a valid benchmark JSON file.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + output_file = asset_dir / "test_dataset_output.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + assert output_file.exists() + + with output_file.open() as f: + dataset = json.load(f) + + assert "version" in dataset + assert "description" in dataset + assert "data" in dataset + assert isinstance(dataset["data"], 
list) + assert len(dataset["data"]) > 0 + + for item in dataset["data"]: + assert "prompt" in item + assert "output_tokens_count" in item + assert "prompt_tokens_count" in item + assert isinstance(item["prompt"], str) + assert isinstance(item["output_tokens_count"], int) + assert isinstance(item["prompt_tokens_count"], int) + assert len(item["prompt"]) > 0 + assert item["output_tokens_count"] > 0 + + +def test_create_dataset_from_valid_benchmark_yaml(get_test_asset_dir, cleanup): + """Test creating dataset from a valid benchmark YAML file.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.yaml" + output_file = asset_dir / "test_dataset_yaml_output.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + assert output_file.exists() + + with output_file.open() as f: + dataset = json.load(f) + + assert "data" in dataset + assert len(dataset["data"]) > 0 + + +def test_create_dataset_with_stats_output(capfd, get_test_asset_dir, cleanup): + """Test creating dataset with statistics output enabled.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + output_file = asset_dir / "test_dataset_stats_output.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=True, + enable_console=True, + ) + + out, err = capfd.readouterr() + assert "Validating benchmark report file" in out + assert "Valid benchmark report with" in out + assert "Dataset saved to" in out + assert "Success, Created dataset with" in out + assert "Dataset Statistics:" in out + assert "Total items:" in out + assert "Prompt tokens - Min:" in out + assert "Output tokens - Min:" in out + + +def test_create_dataset_with_console_disabled(capfd, get_test_asset_dir, cleanup): + """Test creating dataset with console output disabled.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + output_file = asset_dir / "test_dataset_no_console.json" + cleanup.append(output_file) + + create_dataset_from_file( + benchmark_file=source_file, + output_path=output_file, + show_stats=True, + enable_console=False, + ) + + out, err = capfd.readouterr() + assert out == "" + assert err == "" + + assert output_file.exists() + + +def test_validate_benchmark_file_valid_file(get_test_asset_dir): + """Test validation with a valid benchmark file.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + + report = validate_benchmark_file(source_file) + assert report is not None + assert len(report.benchmarks) > 0 + + +def test_validate_benchmark_file_invalid_json(temp_file): + """Test validation with invalid JSON.""" + temp_file.write_text("This is not JSON") + + with pytest.raises(DatasetCreationError) as exc_info: + validate_benchmark_file(temp_file) + + assert "Invalid benchmark report file" in str(exc_info.value) + assert "Expecting value" in str(exc_info.value) + + +def test_validate_benchmark_file_invalid_structure(temp_file): + """Test validation with valid JSON but invalid benchmark structure.""" + temp_file.write_text('{"invalid": "structure"}') + + with pytest.raises(DatasetCreationError) as exc_info: + validate_benchmark_file(temp_file) + + assert "Invalid benchmark report file" in str(exc_info.value) + + +def test_validate_benchmark_file_no_benchmarks(temp_file): + """Test validation with valid structure but 
no benchmarks.""" + temp_file.write_text('{"benchmarks": []}') + + with pytest.raises(DatasetCreationError) as exc_info: + validate_benchmark_file(temp_file) + + assert "Benchmark report contains no benchmark data" in str(exc_info.value) + + +def test_extract_dataset_from_benchmark_report(get_test_asset_dir): + """Test extracting dataset from a validated benchmark report.""" + asset_dir = get_test_asset_dir() + source_file = asset_dir / "benchmarks_stripped.json" + + report = validate_benchmark_file(source_file) + + dataset_items = extract_dataset_from_benchmark_report(report) + + assert len(dataset_items) > 0 + + for item in dataset_items: + assert "prompt" in item + assert "output_tokens" in item + assert "prompt_tokens" in item + assert len(item["prompt"]) > 0 + assert item["output_tokens"] > 0 + assert item["prompt_tokens"] > 0 + + +def test_save_dataset_from_benchmark(cleanup): + """Test saving dataset to file.""" + dataset_items = [ + { + "prompt": "Test prompt 1", + "output_tokens": 100, + "prompt_tokens": 50, + }, + { + "prompt": "Test prompt 2", + "output_tokens": 200, + "prompt_tokens": 75, + }, + ] + + output_file = Path("test_save_dataset.json") + cleanup.append(output_file) + + save_dataset_from_benchmark(dataset_items, output_file) + + assert output_file.exists() + + with output_file.open() as f: + saved_data = json.load(f) + + assert "version" in saved_data + assert "description" in saved_data + assert "data" in saved_data + assert len(saved_data["data"]) == 2 + + for item in saved_data["data"]: + assert "prompt" in item + assert "output_tokens_count" in item + assert "prompt_tokens_count" in item + + +def test_print_dataset_statistics_with_data(capfd): + """Test printing statistics with valid dataset.""" + dataset_items = [ + {"prompt": "Test 1", "output_tokens": 100, "prompt_tokens": 50}, + {"prompt": "Test 2", "output_tokens": 200, "prompt_tokens": 75}, + {"prompt": "Test 3", "output_tokens": 150, "prompt_tokens": 60}, + ] + + print_dataset_statistics(dataset_items, enable_console=True) + + out, err = capfd.readouterr() + assert "Dataset Statistics:" in out + assert "Total items: 3" in out + assert "Prompt tokens - Min: 50, Max: 75, Mean: 61.7" in out + assert "Output tokens - Min: 100, Max: 200, Mean: 150.0" in out + + +def test_print_dataset_statistics_empty_dataset(capfd): + """Test printing statistics with empty dataset.""" + dataset_items: list[dict[str, Any]] = [] + + print_dataset_statistics(dataset_items, enable_console=True) + + out, err = capfd.readouterr() + assert "No valid items found in dataset" in err + + +def test_print_dataset_statistics_console_disabled(capfd): + """Test printing statistics with console disabled.""" + dataset_items = [ + {"prompt": "Test", "output_tokens": 100, "prompt_tokens": 50}, + ] + + print_dataset_statistics(dataset_items, enable_console=False) + + out, err = capfd.readouterr() + assert out == "" + assert err == "" + + +def test_create_dataset_from_file_nonexistent_file(): + """Test error handling for nonexistent file.""" + nonexistent_file = Path("does_not_exist.json") + output_file = Path("output.json") + + with pytest.raises(DatasetCreationError): + create_dataset_from_file( + benchmark_file=nonexistent_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + +def test_create_dataset_from_file_no_successful_requests(temp_file): + """Test handling of benchmark with no successful requests.""" + benchmark_data: dict[str, Any] = { + "benchmarks": [ + {"requests": {"successful": [], "errored": [], 
"incomplete": []}} + ] + } + temp_file.write_text(json.dumps(benchmark_data)) + + output_file = Path("output.json") + + with pytest.raises(DatasetCreationError) as exc_info: + create_dataset_from_file( + benchmark_file=temp_file, + output_path=output_file, + show_stats=False, + enable_console=False, + ) + + assert "Invalid benchmark report file" in str(exc_info.value) + + +if __name__ == "__main__": + unittest.main()