OpenLLM-France
diff --git a/assets/hugging_face/generate_dataset_table_md.py b/assets/hugging_face/generate_dataset_table.py
rename from assets/hugging_face/generate_dataset_table_md.py
rename to assets/hugging_face/generate_dataset_table.py
Lines changed: 57 additions & 3 deletions
diff --git a/assets/hugging_face/generate_dataset_table_html.py b/assets/hugging_face/generate_dataset_table_html.py
Lines changed: 0 additions & 60 deletions
@@ -1,6 +1,8 @@
 import csv
 import os
+import re
 
+import mistune
 import pandas as pd
 import slugify
 
@@ -233,15 +235,54 @@ def to_header(x):
     return x
 
 
+def convert_markdown_table_to_html(
+    markdown,
+    html_doc,
+    headers=None,
+):
+    """Render a Markdown table as HTML and write it to ``html_doc``.
+
+    Any body row whose first cell contains text and whose remaining cells are
+    all empty is treated as a title row: its cells are merged into a single
+    cell spanning the whole row.
+
+    Args:
+        markdown: Markdown source containing the table.
+        html_doc: Writable text file-like object that receives the HTML.
+        headers: Optional HTML heading tag used for title rows, e.g. ``"h4"``
+            (surrounding ``<``, ``>`` and ``/`` characters are stripped).
+            When given, inline markup is removed from the title, which is then
+            wrapped in that tag with a slugified ``id``. When empty or
+            ``None``, the title is kept as is, centered and underlined.
+    """
+    generated_html = mistune.html(markdown)
+
+    # Normalise the tag name once, so "h4", "<h4>" and "</h4>" are all accepted.
+    header_tag = headers.strip("<>/") if headers else ""
+
+    def merge_title_row(match):
+        row_start, title, empty_cells = match.groups()
+        # One column for the title cell plus one per trailing empty cell.
+        num_columns = 1 + empty_cells.count("<td>")
+        if header_tag:
+            # Drop inline markup so that the heading holds plain text only.
+            title = re.sub("<[^>]*>", "", title)
+            title = f"<{header_tag} id={slugify.slugify(title)}>{title}</{header_tag}>"
+            return f'{row_start}<td colspan="{num_columns}">{title}</td></tr>'
+        return (
+            f'{row_start}<td colspan="{num_columns}" style="text-align: center;">'
+            f"<u>{title}</u></td></tr>"
+        )
+
+    # Fix multi-rows: a single pass handles any number of trailing empty cells,
+    # so the column count is not limited to a hard-coded maximum.
+    generated_html = re.sub(
+        r"(<tr>\s*)<td>(.+)</td>\s*((?:<td>\s*</td>\s*)+)</tr>",
+        merge_title_row,
+        generated_html,
+    )
+
+    html_doc.write(generated_html)
+
+
 if __name__ == "__main__":
     import argparse
 
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "output",
+        "output_md",
         default=os.path.join(os.path.dirname(os.path.realpath(__file__)), "README_dataset_table.md"),
         nargs="?",
     )
+    parser.add_argument(
+        "output_html",
+        default=os.path.join(os.path.dirname(os.path.realpath(__file__)), "README_dataset_table.html"),
+        nargs="?",
+    )
+    parser.add_argument(
+        "--headers",
+        default="h4",
+        help="HTML headers for the table (if None, it will just be centered text)",
+    )
     args = parser.parse_args()
 
     df = load_stats()
@@ -272,12 +313,25 @@ def key_category(category, df):
         num_tokens = df[df["category"] == category]["B tokens"].sum()
         return -num_tokens
 
-    if args.output:
-        with open(args.output, "w") as f:
+    if args.output_md:
+        # Explicit encoding so the file does not depend on the platform default
+        # (it is read back below to produce the HTML table).
+        with open(args.output_md, "w", encoding="utf-8") as f:
             f.write(write_md_table_row(fields) + "\n")
             for category in sorted(categories, key=lambda x: key_category(x, df)):
                 f.write(f"| ***{category}*** " + ("|" * len(fields)) + "\n")
                 df_cat = df[df["category"] == category]
                 rows = [row for irow, row in df_cat.iterrows()]
                 for row in sorted(rows, key=lambda x: key_row(x, df_cat)):
                     f.write(write_md_table_row(fields, row) + "\n")
+
+    if args.output_html:
+        # The HTML table is rendered from the markdown file, so its path is
+        # required. Report it as a usage error rather than with an assert,
+        # which would be skipped when Python runs with -O.
+        if not args.output_md:
+            parser.error("Need to generate the markdown table first")
+        with open(args.output_md, encoding="utf-8") as f_in:
+            table_content = f_in.read()
+
+        with open(args.output_html, "w", encoding="utf-8") as f_out:
+            convert_markdown_table_to_html(
+                table_content,
+                f_out,
+                headers=args.headers,
+            )