Skip to content

Commit ba9cf96

Browse files
committed
Add precision in numbers when needed
1 parent d1eaadb commit ba9cf96

File tree

7 files changed

+480
-448
lines changed

7 files changed

+480
-448
lines changed

assets/compile_stats.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,10 @@ def to_language_name_subset(name, subset=None):
257257
SORT_BY = "#words"
258258

259259

260+
def must_be_skipped(data):
    """Tell whether a statistics row should be left out of the output.

    Args:
        data: Mapping holding the row's statistics; it must contain the
            ``"B words"`` key.

    Returns:
        The result of comparing the ``"B words"`` entry with zero.
    """
    word_count = data["B words"]
    return word_count == 0
262+
263+
260264
def compute_extra_stats(data, tokencount_folder):
261265
if "B words" in data:
262266
return data
@@ -310,38 +314,52 @@ def get_stat_names(compute_token_stats=True):
310314
return list(dummy.keys())
311315

312316

317+
def precision_at_least(x, prec=3, length=8):
    """Format *x* in fixed-point notation, adding decimals for small values.

    The number of decimals starts at *prec* and is increased one at a time
    until the rounded magnitude reaches ``100 * 10**-prec``, so that small
    values are not all displayed as zeros.

    Args:
        x: Number to format.
        prec: Minimum number of decimals.
        length: Minimum field width; the result is right-aligned in it.

    Returns:
        The formatted string (longer than *length* if the digits need it).
    """
    import math

    if x == 0:
        return f"{0:{length}.{prec}f}"
    # NaN and infinities can never reach the threshold below; format them
    # directly instead of searching for a precision forever.
    if not math.isfinite(x):
        return f"{x:{length}.{prec}f}"

    # Compare on the magnitude so negative values terminate as well.
    magnitude = abs(x)
    while round(magnitude, prec) < 100 * (10**-prec):
        prec += 1
    # The requested width is kept whatever precision was finally selected.
    return f"{x:{length}.{prec}f}"
323+
324+
325+
def precision_at_least_func(prec=3, length=8):
    """Build a one-argument formatter with *prec* and *length* pre-bound.

    Args:
        prec: Forwarded to ``precision_at_least`` as its ``prec`` argument.
        length: Forwarded to ``precision_at_least`` as its ``length`` argument.

    Returns:
        A callable taking a single value ``x`` and returning
        ``precision_at_least(x, prec, length)``.
    """

    def formatter(x):
        return precision_at_least(x, prec, length)

    return formatter
327+
328+
313329
def format_stats_display(data, main=True):
    """Replace the known statistics in *data* with fixed-width strings, in place.

    Each known column is rendered either with a ``str.format`` template or
    with a callable built by ``precision_at_least_func``. Columns missing
    from *data* are left untouched.

    Args:
        data: Mapping from column name to raw value (number, string or None).
        main: Selects the width of the "subset" column (12 if true, else 28).

    Returns:
        The same *data* mapping, with the formatted values stored back.

    Raises:
        RuntimeError: If a value cannot be rendered with its column format.
    """
    columns = [
        ("language", "{:<9s}"),
        ("name", "{:<21s}"),
        ("subset", "{:<12s}" if main else "{:<28s}"),
        ("ocr", "{:<6s}"),
        ("category", "{:<13s}"),
        ("M docs", precision_at_least_func()),
        ("B words", precision_at_least_func()),
        ("B chars", precision_at_least_func()),
        ("B tokens", precision_at_least_func(length=9)),
        ("#words/doc", "{:11.0f}"),
        ("#chars/page", "{:11.0f}"),
        ("#chars/word", "{:11.1f}"),
        ("#tokens/words", "{:12.2f}"),
        ("#chars/tokens", "{:12.2f}"),
    ]
    for name, spec in columns:
        if name not in data:
            continue
        val = data[name]
        render = spec.format if isinstance(spec, str) else spec
        if isinstance(val, str) or val is None:
            if val is None:
                val = " "
            # Callable formatters carry no template, so their column width is
            # measured on a sample value. This is only needed for text values.
            template = spec if isinstance(spec, str) else "{:" + str(len(spec(1))) + ".3f}"
            if template.endswith("f}"):
                # Text in a numeric column: right-align it in the same width.
                length = int(template[2:-2].split(".")[0])
                render = f"{{:>{length}s}}".format
            elif template.endswith("}"):
                length = int(template[2:-1].strip("<>s"))
            # NOTE(review): the nesting of this check could not be recovered
            # from the flattened diff; it is applied to every text value here.
            # Confirm against the repository.
            if len(val) > length:
                val = val.strip("_ ")
        try:
            data[name] = render(val)
        except Exception as err:
            raise RuntimeError(f"Error formatting {name}={val} with format={spec!r}") from err
    return data
@@ -576,6 +594,8 @@ def sort_function(row):
576594
writer.writerow(header_with_spaces)
577595
for row in rows:
578596
row = compute_extra_stats(row, tokencount_folder)
597+
if must_be_skipped(row):
598+
continue
579599
row = format_stats_display(row, ONLY_DETAILED)
580600
writer.writerow(row)
581601

@@ -584,5 +604,7 @@ def sort_function(row):
584604
writer.writerow(header_with_spaces)
585605
for row in rows_detailed:
586606
row = compute_extra_stats(row, tokencount_folder)
607+
if must_be_skipped(row):
608+
continue
587609
row = format_stats_display(row, ONLY_DETAILED)
588610
writer.writerow(row)

0 commit comments

Comments
 (0)