Skip to content

Commit d1eaadb

Browse files
committed
one script instead of two to make tables
1 parent 5f178f6 commit d1eaadb

File tree

2 files changed

+57
-63
lines changed

2 files changed

+57
-63
lines changed

assets/hugging_face/generate_dataset_table_md.py renamed to assets/hugging_face/generate_dataset_table.py

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import csv
22
import os
3+
import re
34

5+
import mistune
46
import pandas as pd
57
import slugify
68

@@ -233,15 +235,54 @@ def to_header(x):
233235
return x
234236

235237

238+
def convert_markdown_table_to_html(
    markdown,
    html_doc,
    headers=None,
):
    """Render *markdown* to HTML and collapse title rows into one spanning cell.

    The markdown is rendered with ``mistune.html``. Any table row whose first
    cell has content and whose remaining cells are all empty is then rewritten
    so that the first cell spans the whole row (``colspan``).

    Args:
        markdown: Markdown source text to render.
        html_doc: Writable file-like object; the resulting HTML is passed to
            its ``write`` method.
        headers: Optional HTML tag name used to wrap each title (for example
            ``"h4"``; surrounding ``<``, ``>`` and ``/`` are stripped, so
            ``"<h4>"`` also works). The title's inline markup is removed and
            the tag gets an ``id`` built with ``slugify.slugify``. When this
            is ``None`` or empty after stripping, the title is rendered as
            centered, underlined text instead.

    Returns:
        None. The HTML is written to *html_doc*.
    """
    generated_html = mistune.html(markdown)

    # Normalize the tag name once, up front. Doing it inside the loop made a
    # value such as "<>" truthy on the first pass and falsy afterwards.
    header_tag = headers.strip("<>/") if headers else ""

    def make_replacement(num_columns):
        """Return the ``re.sub`` replacement for rows of *num_columns* cells."""
        if not header_tag:
            return rf'\1<td colspan="{num_columns}" style="text-align: center;"><u>\2</u></td></tr>'

        def replace_with_header(match):
            # Drop inline markup (e.g. <em>, <strong>) so both the slug and
            # the header text are plain.
            title = re.sub("<[^>]*>", "", match.group(2))
            title = f"<{header_tag} id={slugify.slugify(title)}>{title}</{header_tag}>"
            return f'{match.group(1)}<td colspan="{num_columns}">{title}</td></tr>'

        return replace_with_header

    # Fix multi-rows: handle the widest rows first, from 10 columns down to 2.
    for num_columns in range(10, 1, -1):
        generated_html = re.sub(
            rf"(<tr>\s*)<td>(.+)</td>\s*(<td>\s*</td>\s*){{{num_columns-1}}}</tr>",
            make_replacement(num_columns),
            generated_html,
        )

    html_doc.write(generated_html)
265+
266+
236267
if __name__ == "__main__":
237268
import argparse
238269

239270
parser = argparse.ArgumentParser()
240271
parser.add_argument(
241-
"output",
272+
"output_md",
242273
default=os.path.join(os.path.dirname(os.path.realpath(__file__)), "README_dataset_table.md"),
243274
nargs="?",
244275
)
276+
parser.add_argument(
277+
"output_html",
278+
default=os.path.join(os.path.dirname(os.path.realpath(__file__)), "README_dataset_table.html"),
279+
nargs="?",
280+
)
281+
parser.add_argument(
282+
"--headers",
283+
default="h4",
284+
help="HTML headers for the table (if None, it will just be centered text)",
285+
)
245286
args = parser.parse_args()
246287

247288
df = load_stats()
@@ -272,12 +313,25 @@ def key_category(category, df):
272313
num_tokens = df[df["category"] == category]["B tokens"].sum()
273314
return -num_tokens
274315

275-
if args.output:
276-
with open(args.output, "w") as f:
316+
if args.output_md:
317+
with open(args.output_md, "w") as f:
277318
f.write(write_md_table_row(fields) + "\n")
278319
for category in sorted(categories, key=lambda x: key_category(x, df)):
279320
f.write(f"| ***{category}*** " + ("|" * len(fields)) + "\n")
280321
df_cat = df[df["category"] == category]
281322
rows = [row for irow, row in df_cat.iterrows()]
282323
for row in sorted(rows, key=lambda x: key_row(x, df_cat)):
283324
f.write(write_md_table_row(fields, row) + "\n")
325+
326+
if args.output_html:
327+
assert args.output_md, "Need to generate the markdown table first"
328+
with open(args.output_md) as f_in:
329+
table_content = f_in.readlines()
330+
331+
with open(args.output_html, "w") as f_out:
332+
convert_markdown_table_to_html(
333+
"".join(table_content),
334+
f_out,
335+
# headers="h4",
336+
headers=args.headers,
337+
)

assets/hugging_face/generate_dataset_table_html.py

Lines changed: 0 additions & 60 deletions
This file was deleted.

0 commit comments

Comments
 (0)