Skip to content

Commit 8872579

Browse files
committed
[squash] reintroduce logging
1 parent 84d648a commit 8872579

1 file changed

Lines changed: 16 additions & 0 deletions

File tree

src/sedpack/io/dataset_base.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,12 @@ def dataset_structure(self) -> DatasetStructure:
137137
def dataset_structure(self, value: DatasetStructure) -> None:
138138
self._dataset_info.dataset_structure = value
139139

140+
@property
def logger(self) -> logging.Logger:
    """Return the logger held by this object.

    Read-only accessor exposing the private `_logger` attribute.
    """
    return self._logger
145+
140146
def shard_info_iterator(self, split: SplitT | None) -> Iterator[ShardInfo]:
141147
"""Iterate all `ShardInfo` in the split.
142148
@@ -305,6 +311,15 @@ def __init__(
305311
if shard_filter(shard_info)
306312
]
307313

314+
kept_metadata: set[str] = {
315+
str(s.custom_metadata) for s in shards_list
316+
}
317+
self.dataset.logger.info(
318+
"Filtered shards with custom metadata: %s from split: %s",
319+
kept_metadata,
320+
split,
321+
)
322+
308323
# Only use a limited amount of shards for each setting of
309324
# custom_metadata.
310325
if custom_metadata_type_limit:
@@ -319,6 +334,7 @@ def __init__(
319334
counts[k] = counts.get(k, 0) + 1
320335
if counts[k] <= custom_metadata_type_limit:
321336
shard_list.append(shard_info)
337+
self.dataset.logger.info("Took %s shards total", len(shard_list))
322338

323339
# Limit the number of shards.
324340
if shards:

0 commit comments

Comments
 (0)