22import json
33import os
44
# Dataset names grouped by how their text was produced.
#   "ocr"   -> is_ocr_dataset() reports "true" for names containing one of these
#   "mixed" -> is_ocr_dataset() reports "mixed" for names containing one of these
# Entries are written in their display form here; they are lower-cased and
# have underscores replaced by spaces further down in this module.
text_types = {
    "ocr": [
        "AmericanStories",
        "Eurovoc",
        "GallicaPress",
        "GallicaMonographies",
        "HAL",
        # 'OtherFr',
        "Persee",
        "Theses",
    ],
    "mixed": ["PeS2o"],
}
18-
# Maps a category label to the dataset names that belong to it.
# get_dataset_category() returns the first key whose list contains the
# (normalised) dataset name, so dict order is the lookup order.
# Entries are written in their display form here; they are lower-cased and
# have underscores replaced by spaces further down in this module.
datasets_categories = {
    "technical": [
        "HAL",
        "NIH_ExPorter",
        "OpenEdition",
        "Persee",
        "PeS2o",
        "PhilPapers",
        "Theses",
        "USPTO_Backgrounds",
    ],
    "legi_written": [
        "amendements_parlement",
        "Eurovoc.de",
        "Eurovoc.en",
        "Eurovoc.es",
        "Eurovoc.it",
        "FreeLaw",
        "LEGI",
        "OpenData",
        "questions_ecrites_parlement",
    ],
    "legi_spoken": [
        "DiscoursPublics",
        "interventions_parlement",
    ],
    "legi_dialogue": [
        "AssembleeNationale",
        "Europarl.en",
        "Europarl.es",
        "Europarl.de",
        "Europarl.fr",
        "FREDSum",
        "Senat",
    ],
    "dialogue": [
        "Claire.en",
        "Claire.fr",
        "Stac",
        "ValidatedYouTube.fr",
    ],
    "book": [
        "GallicaMonographies",
        "Gutenberg.en",
        "Gutenberg.de",
        "Gutenberg.it",
        "Gutenberg.es",
        "Gutenberg.fr",
    ],
    "newspaper": [
        "AmericanStories",
        "GallicaPress",
    ],
    "forum": [
        "StackExchange",
        "Ubuntu_IRC",
    ],
    "wiki": [
        "Wikiother.fr",
        "Wikipedia.en",
        "Wikipedia.es",
        "Wikipedia.de",
        "Wikipedia.it",
        "Wikipedia.fr",
    ],
    "programming": [
        "TheStack",
    ],
    "math": [
        "DM_Mathematics",
        "MathPile",
    ],
    "aligned": [
        "CroissantAligned",
        "EuroparlAligned.fr-en",
        "EuroparlAligned.es-en",
        "EuroparlAligned.it-en",
        "EuroparlAligned.de-fr",
    ],
}
99-
100-
def _norm_string(s):
    """Return *s* lower-cased with every underscore replaced by a space.

    Used to compare dataset names regardless of case and of whether they
    were written with underscores or spaces.
    """
    return s.lower().replace("_", " ")
103-
104-
# Rebind both lookup tables to their normalised form (lower case, underscores
# turned into spaces) so that the lookup functions can compare against names
# that went through _norm_string() as well.
text_types = {
    kind: [_norm_string(dataset) for dataset in names]
    for kind, names in text_types.items()
}
datasets_categories = {
    category: [_norm_string(dataset) for dataset in names]
    for category, names in datasets_categories.items()
}
107-
108-
def is_ocr_dataset(name, subset):
    """Tell whether a dataset's text is listed as OCR, mixed, or neither.

    Args:
        name: Dataset name. The placeholders "---", "" and None are accepted.
        subset: Not used by this function; kept so the signature matches
            get_dataset_category().

    Returns:
        "" when *name* is one of the placeholders, otherwise one of the
        strings "true", "false" or "mixed".
    """
    if name in ["---", "", None]:
        return ""
    name = _norm_string(name)
    # Matching is by substring: an entry of text_types only has to occur
    # somewhere inside the normalised name.
    res = "false"
    if any(d in name for d in text_types["ocr"]):
        res = "true"
    # Checked last on purpose: a "mixed" match overrides an "ocr" match.
    if any(d in name for d in text_types["mixed"]):
        res = "mixed"
    return res
119-
120-
def get_dataset_category(name, subset):
    """Look up the category label of a dataset.

    Args:
        name: Dataset name. The placeholders "---", "" and None are accepted.
        subset: Subset name. It replaces *name* for the lookup when the
            normalised name is "pile" or "otherfr" and *subset* is truthy.

    Returns:
        "" when *name* is one of the placeholders; the first key of
        datasets_categories whose list contains the normalised name; or
        None when no category lists it.
    """
    if name in ["---", "", None]:
        return ""
    name = _norm_string(name)
    # For these two umbrella names the category is decided by the subset.
    if name in ["pile", "otherfr"] and subset:
        name = _norm_string(subset)
    # Exact match against the (already normalised) category lists.
    for cat, datasets in datasets_categories.items():
        if name in datasets:
            return cat
    return None
131-
1325
1336# Ignore datasets
1347def ignore_datasets (name ):
@@ -706,4 +579,4 @@ def sort_function(row):
706579 for row in rows_detailed :
707580 row = compute_extra_stats (row , tokencount_folder )
708581 row = format_stats_display (row , ONLY_DETAILED )
709- writer .writerow (row )
582+ writer .writerow (row )
0 commit comments