From 692c38a5a7da01253698035b6004db0ce90385d6 Mon Sep 17 00:00:00 2001 From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Sun, 29 Jun 2025 16:08:05 +0530 Subject: [PATCH 1/2] Update load.py --- src/datasets/load.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index bc2b0e679b6..d524d22aa0f 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1185,6 +1185,7 @@ def load_dataset_builder( def load_dataset( path: str, name: Optional[str] = None, + subset_name: Optional[str] = None, # <-- New alias parameter data_dir: Optional[str] = None, data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, split: Optional[Union[str, Split, list[str], list[Split]]] = None, @@ -1202,6 +1203,10 @@ def load_dataset( storage_options: Optional[dict] = None, **config_kwargs, ) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]: + if name and subset_name and name != subset_name: + raise ValueError("'name' and 'subset_name' cannot both be set with different values.") + name = name or subset_name # Prefer 'name', fallback to 'subset_name' + """Load a dataset from the Hugging Face Hub, or a local dataset. You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`]. @@ -1388,7 +1393,6 @@ def load_dataset( (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS ) - # Create a dataset builder builder_instance = load_dataset_builder( path=path, name=name, @@ -1404,11 +1408,9 @@ def load_dataset( **config_kwargs, ) - # Return iterable dataset in case of streaming if streaming: return builder_instance.as_streaming_dataset(split=split) - # Download and prepare data builder_instance.download_and_prepare( download_config=download_config, download_mode=download_mode, @@ -1417,7 +1419,6 @@ def load_dataset( storage_options=storage_options, ) - # Build dataset for splits keep_in_memory = ( keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size) ) From 91a28a4d2ae4dfc7ba567d624f56406402ad2c61 Mon Sep 17 00:00:00 2001 From: Arjun Dinesh Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Sun, 29 Jun 2025 16:18:24 +0530 Subject: [PATCH 2/2] Update builder.py --- src/datasets/builder.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index e63960dcabf..8c724948184 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -1343,6 +1343,11 @@ def _prepare_split( """ raise NotImplementedError() + @property + def subset_name(self) -> str: + """Alias for self.config.name to match Hugging Face Hub terminology ('Subset').""" + return self.config.name + def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable: """Generate the examples on the fly.