@@ -257,6 +257,10 @@ def to_language_name_subset(name, subset=None):
257257SORT_BY = "#words"
258258
259259
def must_be_skipped(data):
    """Tell whether a stats row should be left out of the output tables.

    Args:
        data: Mapping holding the statistics of one row; it must contain
            the "B words" entry.

    Returns:
        The result of comparing the "B words" entry with zero.
    """
    word_count = data["B words"]
    return word_count == 0
262+
263+
260264def compute_extra_stats (data , tokencount_folder ):
261265 if "B words" in data :
262266 return data
@@ -310,38 +314,52 @@ def get_stat_names(compute_token_stats=True):
310314 return list (dummy .keys ())
311315
312316
def precision_at_least(x, prec=3, length=8):
    """Format ``x`` in fixed-point notation with enough decimals to be readable.

    The number of decimals starts at ``prec`` and is increased one at a time
    until the rounded magnitude reaches ``100 * 10 ** -prec``, i.e. until
    about three significant digits are displayed.

    Args:
        x: Number to format.
        prec: Minimum number of decimals.
        length: Minimum field width; the result is right-aligned in it.

    Returns:
        The formatted string. Zero is rendered with exactly ``prec`` decimals;
        NaN and infinities are rendered as-is, padded to ``length``.
    """
    import math  # local import: the file's import block is outside this view

    if x == 0:
        # Format the literal 0 so that -0.0 does not show up as "-0.000".
        return f"{0:{length}.{prec}f}"
    if not math.isfinite(x):
        # NaN/inf never pass the threshold test below; format them directly.
        return f"{x:{length}.{prec}f}"

    # Compare on the magnitude so negative values terminate too, and keep
    # ``length`` unchanged while the precision grows.
    magnitude = abs(x)
    while round(magnitude, prec) < 100 * (10 ** -prec):
        prec += 1
    return f"{x:{length}.{prec}f}"
323+
324+
def precision_at_least_func(prec=3, length=8):
    """Build a one-argument formatter with ``prec`` and ``length`` bound.

    Args:
        prec: Minimum number of decimals forwarded to ``precision_at_least``.
        length: Field width forwarded to ``precision_at_least``.

    Returns:
        A callable taking a single value ``x`` and returning
        ``precision_at_least(x, prec, length)``.
    """

    def formatter(x):
        return precision_at_least(x, prec, length)

    return formatter
327+
328+
def _column_width(spec):
    """Return the display width of a column from its format spec or formatter."""
    if not isinstance(spec, str):
        # Callable formatters pad to a fixed width; measure it on a sample value.
        return len(spec(1))
    # "{:<9s}" -> 9, "{:11.0f}" -> 11
    return int(spec[2:-1].rstrip("sf").split(".")[0].strip("<>"))


def format_stats_display(data, main=True):
    """Format the known columns of a stats row in place for aligned display.

    Each known column is either a ``str.format`` template or a callable
    formatter. Columns missing from ``data`` are left alone. String values
    (and ``None``, shown as a blank) found in a numeric column are
    right-aligned to that column's width instead of being formatted as numbers.

    Args:
        data: Mapping of column name to raw value; modified in place.
        main: Selects the width of the "subset" column (12 when true, 28
            otherwise).

    Returns:
        The same ``data`` mapping, with the known columns replaced by strings.

    Raises:
        RuntimeError: If a value cannot be formatted; the original error is
            chained as the cause.
    """
    columns = [
        ("language", "{:<9s}"),
        ("name", "{:<21s}"),
        ("subset", "{:<12s}" if main else "{:<28s}"),
        ("ocr", "{:<6s}"),
        ("category", "{:<13s}"),
        ("M docs", precision_at_least_func()),
        ("B words", precision_at_least_func()),
        ("B chars", precision_at_least_func()),
        ("B tokens", precision_at_least_func(length=9)),
        ("#words/doc", "{:11.0f}"),
        ("#chars/page", "{:11.0f}"),
        ("#chars/word", "{:11.1f}"),
        ("#tokens/words", "{:12.2f}"),
        ("#chars/tokens", "{:12.2f}"),
    ]
    for name, spec in columns:
        if name not in data:
            continue
        val = data[name]
        format_func = spec.format if isinstance(spec, str) else spec
        if val is None:
            val = " "
        if isinstance(val, str):
            width = _column_width(spec)
            is_text_column = isinstance(spec, str) and spec.endswith("s}")
            if not is_text_column:
                # Text in a numeric column: right-align it to the column width.
                format_func = f"{{:>{width}s}}".format
            if len(val) > width:
                # Too wide: drop padding characters rather than truncating.
                val = val.strip("_ ")
        try:
            data[name] = format_func(val)
        except Exception as err:
            raise RuntimeError(f"Error formatting {name}={val} with format={spec!r}") from err
    return data
@@ -576,6 +594,8 @@ def sort_function(row):
576594 writer .writerow (header_with_spaces )
577595 for row in rows :
578596 row = compute_extra_stats (row , tokencount_folder )
597+ if must_be_skipped (row ):
598+ continue
579599 row = format_stats_display (row , ONLY_DETAILED )
580600 writer .writerow (row )
581601
@@ -584,5 +604,7 @@ def sort_function(row):
584604 writer .writerow (header_with_spaces )
585605 for row in rows_detailed :
586606 row = compute_extra_stats (row , tokencount_folder )
607+ if must_be_skipped (row ):
608+ continue
587609 row = format_stats_display (row , ONLY_DETAILED )
588610 writer .writerow (row )
0 commit comments