Skip to content

Commit ea89965

Browse files
chore: clean-up a conflict from the rebase
1 parent 9c83ef1 commit ea89965

File tree

1 file changed

+1
-128
lines changed

1 file changed

+1
-128
lines changed

assets/compile_stats.py

Lines changed: 1 addition & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -2,133 +2,6 @@
22
import json
33
import os
44

5-
# Dataset names grouped by how their text was obtained.
# "ocr" lists datasets treated as OCR output; "mixed" lists datasets treated
# as only partly OCR. Names are written in their original casing here.
text_types = {
    "ocr": [
        "AmericanStories",
        "Eurovoc",
        "GallicaPress",
        "GallicaMonographies",
        "HAL",
        # 'OtherFr',
        "Persee",
        "Theses",
    ],
    "mixed": ["PeS2o"],
}
18-
19-
# Category label -> dataset names belonging to that category.
# A dotted suffix (e.g. "Wikipedia.fr", "EuroparlAligned.fr-en") is part of
# the name as written here; lookups compare against the full string.
datasets_categories = {
    "technical": [
        "HAL",
        "NIH_ExPorter",
        "OpenEdition",
        "Persee",
        "PeS2o",
        "PhilPapers",
        "Theses",
        "USPTO_Backgrounds",
    ],
    "legi_written": [
        "amendements_parlement",
        "Eurovoc.de",
        "Eurovoc.en",
        "Eurovoc.es",
        "Eurovoc.it",
        "FreeLaw",
        "LEGI",
        "OpenData",
        "questions_ecrites_parlement",
    ],
    "legi_spoken": [
        "DiscoursPublics",
        "interventions_parlement",
    ],
    "legi_dialogue": [
        "AssembleeNationale",
        "Europarl.en",
        "Europarl.es",
        "Europarl.de",
        "Europarl.fr",
        "FREDSum",
        "Senat",
    ],
    "dialogue": [
        "Claire.en",
        "Claire.fr",
        "Stac",
        "ValidatedYouTube.fr",
    ],
    "book": [
        "GallicaMonographies",
        "Gutenberg.en",
        "Gutenberg.de",
        "Gutenberg.it",
        "Gutenberg.es",
        "Gutenberg.fr",
    ],
    "newspaper": [
        "AmericanStories",
        "GallicaPress",
    ],
    "forum": [
        "StackExchange",
        "Ubuntu_IRC",
    ],
    "wiki": [
        "Wikiother.fr",
        "Wikipedia.en",
        "Wikipedia.es",
        "Wikipedia.de",
        "Wikipedia.it",
        "Wikipedia.fr",
    ],
    "programming": [
        "TheStack",
    ],
    "math": [
        "DM_Mathematics",
        "MathPile",
    ],
    "aligned": [
        "CroissantAligned",
        "EuroparlAligned.fr-en",
        "EuroparlAligned.es-en",
        "EuroparlAligned.it-en",
        "EuroparlAligned.de-fr",
    ],
}
99-
100-
101-
def _norm_string(s):
102-
return s.lower().replace("_", " ")
103-
104-
105-
# Rebind both lookup tables to their normalised form (lower-case, underscores
# turned into spaces) so they can be compared against normalised names.
text_types = {k: [_norm_string(x) for x in v] for k, v in text_types.items()}
datasets_categories = {k: [_norm_string(x) for x in v] for k, v in datasets_categories.items()}
107-
108-
109-
def is_ocr_dataset(name, subset):
    """Classify a dataset as OCR-derived, returning a string flag.

    Args:
        name: Dataset name; ``"---"``, ``""`` and ``None`` are treated as
            "no dataset".
        subset: Accepted for signature parity with the other lookups; it is
            not used by this function.

    Returns:
        ``""`` when *name* is a placeholder, otherwise ``"true"``, ``"false"``
        or ``"mixed"``. Matching is by substring of the normalised name
        against the entries of ``text_types``.
    """
    if name in ["---", "", None]:
        return ""
    name = _norm_string(name)
    res = "false"
    if any(d in name for d in text_types["ocr"]):
        res = "true"
    # Checked last on purpose: a "mixed" match overrides an "ocr" match.
    if any(d in name for d in text_types["mixed"]):
        res = "mixed"
    return res
119-
120-
121-
def get_dataset_category(name, subset):
    """Look up the category a dataset belongs to.

    Args:
        name: Dataset name; ``"---"``, ``""`` and ``None`` are treated as
            "no dataset".
        subset: Subset name. Only consulted when the normalised *name* is
            ``"pile"`` or ``"otherfr"``, in which case the normalised subset
            replaces the name as the lookup key.

    Returns:
        ``""`` when *name* is a placeholder; otherwise the first key of
        ``datasets_categories`` whose list contains the lookup key exactly,
        or ``None`` when no category lists it.
    """
    if name in ["---", "", None]:
        return ""
    name = _norm_string(name)
    if name in ["pile", "otherfr"] and subset:
        name = _norm_string(subset)
    for cat, datasets in datasets_categories.items():
        if name in datasets:
            return cat
    return None
131-
1325

1336
# Ignore datasets
1347
def ignore_datasets(name):
@@ -706,4 +579,4 @@ def sort_function(row):
706579
for row in rows_detailed:
707580
row = compute_extra_stats(row, tokencount_folder)
708581
row = format_stats_display(row, ONLY_DETAILED)
709-
writer.writerow(row)
582+
writer.writerow(row)

0 commit comments

Comments
 (0)