Improve the schemaorg markup for tables

calpaterson · Aug 13, 2024 · 5392d9b · 5392d9b
1 parent 37984de
commit 5392d9b
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 63 deletions.
diff --git a/csvbase/value_objs.py b/csvbase/value_objs.py
@@ -547,3 +547,14 @@ class Backend(enum.Enum):
     POSTGRES = 1
     HTTP = 2
     GIT = 3
+
+
+@dataclass
+class TableRepresentation:
+    """Convenience object holding metadata on a specific representation (eg:
+    csv) of a table."""
+
+    content_type: ContentType
+    offered: bool
+    size: int
+    size_is_estimate: bool
diff --git a/csvbase/web/main/bp.py b/csvbase/web/main/bp.py
@@ -22,7 +22,6 @@
 from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode
 import hashlib
 import json
-import dataclasses
 
 import pydantic
 from sqlalchemy.orm import Session
@@ -81,6 +80,7 @@
     Table,
     Backend,
     BinaryOp,
+    TableRepresentation,
 )
 from ...constants import COPY_BUFFER_SIZE, FAR_FUTURE, MAX_UUID
 from ..billing import svc as billing_svc
@@ -444,30 +444,23 @@ def make_table_view_response(sesh, content_type: ContentType, table: Table) -> R
             is_first_page = min_row_id is None or (min_row_id in row_ids)
             is_last_page = max_row_id is None or (max_row_id in row_ids)
 
-            reps = get_table_reps(sesh, table)
-
             template_kwargs = dict(
                 page_title=table.table_name,
-                table=table,
                 page=page,
                 keyset=keyset,
-                praise_id=get_praise_id_if_exists(sesh, table),
                 is_first_page=is_first_page,
                 is_last_page=is_last_page,
                 max_row_id=max_row_id,
                 highlight=request.args.get("highlight", None, type=int),
-                reps=reps,
             )
 
             if is_first_page:
                 template_kwargs["readme_html"] = readme_html(sesh, table.table_uuid)
 
-            response = make_response(
-                render_template(
-                    "table_view.html",
-                    **template_kwargs,
-                )
+            response = render_table_page(
+                sesh, "table_view.html", table=table, **template_kwargs
             )
+
             # HTML doesn't get an etag - too hard to key everything that goes in
             add_table_view_cache_headers(table, response)
             add_table_metadata_headers(table, response)
@@ -718,7 +711,7 @@ def put(self, username: str, table_name: str) -> Response:
 
 
 @bp.get("/<username:username>/<table_name:table_name>/docs")
-def get_table_apidocs(username: str, table_name: str) -> str:
+def get_table_apidocs(username: str, table_name: str) -> Response:
     sesh = get_sesh()
     table = svc.get_table(sesh, username, table_name)
     ensure_table_access(sesh, table, "read")
@@ -729,22 +722,19 @@ def get_table_apidocs(username: str, table_name: str) -> str:
     sample_row = backend.get_a_sample_row(table.table_uuid)
     sample_page = Page(has_less=False, has_more=True, rows=[sample_row])
 
-    reps = get_table_reps(sesh, table)
-
-    return render_template(
+    return render_table_page(
+        sesh,
         "table_api.html",
+        table,
         page_title=f"REST docs: {username}/{table_name}",
         owner=owner,
-        table=table,
         sample_row=sample_row,
         sample_row_id=row_id_from_row(sample_row),
         sample_page=sample_page,
         made_up_row=made_up_row,
         row_to_json_dict=row_to_json_dict,
         table_to_json_dict=table_to_json_dict,
         url_for_with_auth=url_for_with_auth,
-        praise_id=get_praise_id_if_exists(sesh, table),
-        reps=reps,
     )
 
 
@@ -818,42 +808,37 @@ def post(self, username: str, table_name: str) -> Response:
 
 
 @bp.get("/<username:username>/<table_name:table_name>/details")
-def table_details(username: str, table_name: str) -> str:
+def table_details(username: str, table_name: str) -> Response:
     sesh = get_sesh()
     table = svc.get_table(sesh, username, table_name)
     ensure_table_access(sesh, table, "read")
-    reps = get_table_reps(sesh, table)
 
-    return render_template(
+    return render_table_page(
+        sesh,
         "table_details.html",
+        table,
         username=username,
         page_title=f"Schema & Details: {username}/{table_name}",
         DataLicence=DataLicence,
-        table=table,
-        praise_id=get_praise_id_if_exists(sesh, table),
-        reps=reps,
     )
 
 
 @bp.get("/<username:username>/<table_name:table_name>/settings")
-def table_settings(username: str, table_name: str) -> str:
+def table_settings(username: str, table_name: str) -> Response:
     sesh = get_sesh()
     table = svc.get_table(sesh, username, table_name)
     ensure_table_access(sesh, table, "write")
 
     table_readme_markdown = svc.get_readme_markdown(sesh, table.table_uuid)
 
-    reps = get_table_reps(sesh, table)
-
-    return render_template(
+    return render_table_page(
+        sesh,
         "table_settings.html",
+        table,
         username=username,
         page_title=f"Settings: {username}/{table_name}",
         table_readme=table_readme_markdown or "",
         DataLicence=DataLicence,
-        table=table,
-        praise_id=get_praise_id_if_exists(sesh, table),
-        reps=reps,
     )
 
 
@@ -1720,17 +1705,6 @@ def get_user_str_buf() -> codecs.StreamReader:
     return str_buf
 
 
-@dataclasses.dataclass
-class TableRepresentation:
-    """Convenience object holding metadata on a specific representation (eg:
-    csv) of a table."""
-
-    content_type: ContentType
-    offered: bool
-    size: int
-    size_is_estimate: bool
-
-
 def get_table_reps(sesh: Session, table: Table) -> List[TableRepresentation]:
     supported_content_types = [
         ContentType.CSV,
@@ -1765,3 +1739,24 @@ def get_table_reps(sesh: Session, table: Table) -> List[TableRepresentation]:
             )
         )
     return rv
+
+
+def render_table_page(
+    sesh: Session,
+    template_name: str,
+    table: Table,
+    **template_kwargs,
+) -> Response:
+    """Type-safe helper to ensure that we pass the list of reps whenever we render a table page."""
+    praise_id = get_praise_id_if_exists(sesh, table)
+    reps = get_table_reps(sesh, table)
+
+    return make_response(
+        render_template(
+            template_name,
+            table=table,
+            reps=reps,
+            praise_id=praise_id,
+            **template_kwargs,
+        )
+    )
diff --git a/csvbase/web/schemaorg.py b/csvbase/web/schemaorg.py
@@ -1,11 +1,11 @@
-from typing import Mapping, Any
+from typing import Dict, Any, Collection
 
 from flask import url_for
 
-from csvbase.value_objs import Table, ContentType
+from csvbase.value_objs import Table, TableRepresentation
 
 
-def to_dataset(table: Table) -> Mapping[str, Any]:
+def to_dataset(table: Table, reps: Collection[TableRepresentation]) -> Dict[str, Any]:
     """Produce a schema.org Dataset object from a Table."""
     obj = {
         "@context": "https://schema.org",
@@ -17,7 +17,7 @@ def to_dataset(table: Table) -> Mapping[str, Any]:
             table_name=table.table_name,
             _external=True,
         ),
-        "isAccessibleForFree": True,
+        "isAccessibleForFree": table.is_public,
         "distribution": [],
         "dateCreated": table.created.isoformat(),
         "dateModified": table.last_changed.isoformat(),
@@ -29,35 +29,32 @@ def to_dataset(table: Table) -> Mapping[str, Any]:
 
     # Mark up all the reps we hold
     distribution = []
-    content_types = [ContentType.CSV, ContentType.PARQUET, ContentType.JSON_LINES]
-    if not table.row_count.is_big():
-        content_types.append(ContentType.XLSX)
-    for content_type in content_types:
-        distribution.append(to_datadownload(table, content_type))
+    for rep in reps:
+        distribution.append(to_datadownload(table, rep))
     obj["distribution"] = distribution
 
     return obj
 
 
-def to_datadownload(table: Table, content_type: ContentType) -> Mapping[str, str]:
+def to_datadownload(table: Table, rep: TableRepresentation) -> Dict[str, str]:
     """Produce a schema.org DataDownload object from a table + content type."""
-    # potential improvements:
-    # contentSize (needs the rep)
     obj = {
         "@type": "DataDownload",
         "contentUrl": url_for(
             "csvbase.table_view_with_extension",
             username=table.username,
             table_name=table.table_name,
-            extension=content_type.file_extension(),
+            extension=rep.content_type.file_extension(),
             _external=True,
         ),
-        "encodingFormat": content_type.value,
+        "encodingFormat": rep.content_type.value,
     }
+    if not rep.size_is_estimate:
+        obj["contentSize"] = str(rep.size)
     return obj
 
 
-def make_organisation() -> Mapping[str, str]:
+def make_organisation() -> Dict[str, str]:
     """Produce the schema.org Publisher object for this csvbase instance."""
     return {
         "@type": "Organization",
@@ -67,7 +64,7 @@ def make_organisation() -> Mapping[str, str]:
     }
 
 
-def to_person(username: str) -> Mapping[str, str]:
+def to_person(username: str) -> Dict[str, str]:
     """Produce the schema.org Person object for this username."""
     # This is quite basic
     return {

diff --git a/csvbase/web/templates/table.html b/csvbase/web/templates/table.html
@@ -12,7 +12,7 @@
   {% endif %}
 
   <script type="application/ld+json">
- {{ schemaorg.to_dataset(table)|ppjson|safe }}
+ {{ schemaorg.to_dataset(table, reps)|ppjson|safe }}
   </script>
 {% endblock %}
 

diff --git a/tests/test_seo.py b/tests/test_seo.py
@@ -2,6 +2,7 @@
 from datetime import datetime
 
 from csvbase.web import schemaorg
+from csvbase.web.main.bp import get_table_reps
 from .utils import test_data_path
 
 
@@ -32,7 +33,7 @@ def test_sitemap(client):
     assert first_line == b"<?xml version='1.0' encoding='UTF-8'?>"
 
 
-def test_schemaorg_dataset(ten_rows):
+def test_schemaorg_dataset(sesh, ten_rows):
     expected = {
         "@context": "https://schema.org",
         "@type": "Dataset",
@@ -74,11 +75,19 @@ def test_schemaorg_dataset(ten_rows):
             "url": f"http://localhost/{ten_rows.username}",
         },
     }
-    actual = schemaorg.to_dataset(ten_rows)
+    reps = get_table_reps(sesh, ten_rows)
+    actual = schemaorg.to_dataset(ten_rows, reps)
+
+    def key(d):
+        return d["encodingFormat"]
+
+    assert sorted(actual.pop("distribution"), key=key) == sorted(
+        expected.pop("distribution"), key=key
+    )
 
     # do the dates this way
-    assert datetime.fromisoformat(actual.pop("dateCreated")) == ten_rows.created  # type: ignore
-    assert datetime.fromisoformat(actual.pop("dateModified")) == ten_rows.last_changed  # type: ignore
+    assert datetime.fromisoformat(actual.pop("dateCreated")) == ten_rows.created
+    assert datetime.fromisoformat(actual.pop("dateModified")) == ten_rows.last_changed
 
     # the rest must match:
     assert expected == actual