|
1 | 1 | import csv |
2 | 2 | import os |
| 3 | +import re |
3 | 4 |
|
| 5 | +import mistune |
4 | 6 | import pandas as pd |
5 | 7 | import slugify |
6 | 8 |
|
@@ -233,15 +235,54 @@ def to_header(x): |
233 | 235 | return x |
234 | 236 |
|
235 | 237 |
|
def convert_markdown_table_to_html(
    markdown,
    html_doc,
    headers=None,
):
    """Render a Markdown table as HTML and write the result to ``html_doc``.

    The Markdown is rendered with ``mistune.html``. Afterwards, every table
    row whose first cell has content and whose remaining cells are all empty
    (the "category title" rows) is collapsed into a single cell spanning the
    whole row.

    Args:
        markdown: Markdown source text to render.
        html_doc: Object with a ``write(str)`` method receiving the HTML.
        headers: Optional HTML heading tag for title rows, e.g. ``"h4"``
            (``"<h4>"`` and ``"</h4>"`` are accepted as well). When set, the
            title is stripped of inline tags and wrapped in that heading with
            a slugified ``id``. When empty or ``None``, the title is rendered
            as centered, underlined text instead.
    """
    generated_html = mistune.html(markdown)

    # Normalize the tag once, up front. Re-stripping the argument inside the
    # substitution loop made a value such as "<>" truthy on the first pass and
    # empty afterwards, silently switching the replacement style mid-way.
    header_tag = headers.strip("<>/") if headers else ""

    def merge_title_row(match):
        row_start, title, empty_cells = match.group(1), match.group(2), match.group(3)
        # The merged cell spans the title cell plus every empty cell after it.
        num_columns = 1 + empty_cells.count("<td>")
        if header_tag:
            # Drop inline markup (<em>, <strong>, ...) so the heading text and
            # its anchor id are built from plain text only.
            title = re.sub("<[^>]*>", "", title)
            title = f"<{header_tag} id={slugify.slugify(title)}>{title}</{header_tag}>"
            return f'{row_start}<td colspan="{num_columns}">{title}</td></tr>'
        return (
            f'{row_start}<td colspan="{num_columns}" style="text-align: center;">'
            f"<u>{title}</u></td></tr>"
        )

    # One pass handles any table width: the trailing group captures all the
    # empty cells so the colspan can be derived from the match itself rather
    # than from a hard-coded range of column counts. The title is matched
    # lazily so it never swallows the empty cells that follow it.
    generated_html = re.sub(
        r"(<tr>\s*)<td>(.+?)</td>\s*((?:<td>\s*</td>\s*)+)</tr>",
        merge_title_row,
        generated_html,
    )

    html_doc.write(generated_html)
| 265 | + |
| 266 | + |
236 | 267 | if __name__ == "__main__": |
237 | 268 | import argparse |
238 | 269 |
|
239 | 270 | parser = argparse.ArgumentParser() |
240 | 271 | parser.add_argument( |
241 | | - "output", |
| 272 | + "output_md", |
242 | 273 | default=os.path.join(os.path.dirname(os.path.realpath(__file__)), "README_dataset_table.md"), |
243 | 274 | nargs="?", |
244 | 275 | ) |
| 276 | + parser.add_argument( |
| 277 | + "output_html", |
| 278 | + default=os.path.join(os.path.dirname(os.path.realpath(__file__)), "README_dataset_table.html"), |
| 279 | + nargs="?", |
| 280 | + ) |
| 281 | + parser.add_argument( |
| 282 | + "--headers", |
| 283 | + default="h4", |
| 284 | + help="HTML headers for the table (if None, it will just be centered text)", |
| 285 | + ) |
245 | 286 | args = parser.parse_args() |
246 | 287 |
|
247 | 288 | df = load_stats() |
@@ -272,12 +313,25 @@ def key_category(category, df): |
272 | 313 | num_tokens = df[df["category"] == category]["B tokens"].sum() |
273 | 314 | return -num_tokens |
274 | 315 |
|
275 | | - if args.output: |
276 | | - with open(args.output, "w") as f: |
| 316 | + if args.output_md: |
| 317 | + with open(args.output_md, "w") as f: |
277 | 318 | f.write(write_md_table_row(fields) + "\n") |
278 | 319 | for category in sorted(categories, key=lambda x: key_category(x, df)): |
279 | 320 | f.write(f"| ***{category}*** " + ("|" * len(fields)) + "\n") |
280 | 321 | df_cat = df[df["category"] == category] |
281 | 322 | rows = [row for irow, row in df_cat.iterrows()] |
282 | 323 | for row in sorted(rows, key=lambda x: key_row(x, df_cat)): |
283 | 324 | f.write(write_md_table_row(fields, row) + "\n") |
| 325 | + |
| 326 | + if args.output_html: |
| 327 | + assert args.output_md, "Need to generate the markdown table first" |
| 328 | + with open(args.output_md) as f_in: |
| 329 | + table_content = f_in.readlines() |
| 330 | + |
| 331 | + with open(args.output_html, "w") as f_out: |
| 332 | + convert_markdown_table_to_html( |
| 333 | + "".join(table_content), |
| 334 | + f_out, |
| 335 | + # headers="h4", |
| 336 | + headers=args.headers, |
| 337 | + ) |
0 commit comments