Skip to content

Commit

Permalink
Improve the schemaorg markup for tables
Browse files Browse the repository at this point in the history
  • Loading branch information
calpaterson committed Aug 13, 2024
1 parent 37984de commit 5392d9b
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 63 deletions.
11 changes: 11 additions & 0 deletions csvbase/value_objs.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,3 +547,14 @@ class Backend(enum.Enum):
POSTGRES = 1
HTTP = 2
GIT = 3


@dataclass
class TableRepresentation:
    """Metadata describing one concrete representation of a table.

    A "representation" is the table expressed in a particular content type
    (eg: csv).
    """

    # The content type this representation is expressed in.
    content_type: ContentType
    # NOTE(review): presumably whether this representation is made available
    # for the table - confirm against the code that builds these objects.
    offered: bool
    # Size of the representation. Units are not stated here (presumably
    # bytes - TODO confirm).
    size: int
    # True when `size` is an estimate rather than an exact figure.
    size_is_estimate: bool
79 changes: 37 additions & 42 deletions csvbase/web/main/bp.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode
import hashlib
import json
import dataclasses

import pydantic
from sqlalchemy.orm import Session
Expand Down Expand Up @@ -81,6 +80,7 @@
Table,
Backend,
BinaryOp,
TableRepresentation,
)
from ...constants import COPY_BUFFER_SIZE, FAR_FUTURE, MAX_UUID
from ..billing import svc as billing_svc
Expand Down Expand Up @@ -444,30 +444,23 @@ def make_table_view_response(sesh, content_type: ContentType, table: Table) -> R
is_first_page = min_row_id is None or (min_row_id in row_ids)
is_last_page = max_row_id is None or (max_row_id in row_ids)

reps = get_table_reps(sesh, table)

template_kwargs = dict(
page_title=table.table_name,
table=table,
page=page,
keyset=keyset,
praise_id=get_praise_id_if_exists(sesh, table),
is_first_page=is_first_page,
is_last_page=is_last_page,
max_row_id=max_row_id,
highlight=request.args.get("highlight", None, type=int),
reps=reps,
)

if is_first_page:
template_kwargs["readme_html"] = readme_html(sesh, table.table_uuid)

response = make_response(
render_template(
"table_view.html",
**template_kwargs,
)
response = render_table_page(
sesh, "table_view.html", table=table, **template_kwargs
)

# HTML doesn't get an etag - too hard to key everything that goes in
add_table_view_cache_headers(table, response)
add_table_metadata_headers(table, response)
Expand Down Expand Up @@ -718,7 +711,7 @@ def put(self, username: str, table_name: str) -> Response:


@bp.get("/<username:username>/<table_name:table_name>/docs")
def get_table_apidocs(username: str, table_name: str) -> str:
def get_table_apidocs(username: str, table_name: str) -> Response:
sesh = get_sesh()
table = svc.get_table(sesh, username, table_name)
ensure_table_access(sesh, table, "read")
Expand All @@ -729,22 +722,19 @@ def get_table_apidocs(username: str, table_name: str) -> str:
sample_row = backend.get_a_sample_row(table.table_uuid)
sample_page = Page(has_less=False, has_more=True, rows=[sample_row])

reps = get_table_reps(sesh, table)

return render_template(
return render_table_page(
sesh,
"table_api.html",
table,
page_title=f"REST docs: {username}/{table_name}",
owner=owner,
table=table,
sample_row=sample_row,
sample_row_id=row_id_from_row(sample_row),
sample_page=sample_page,
made_up_row=made_up_row,
row_to_json_dict=row_to_json_dict,
table_to_json_dict=table_to_json_dict,
url_for_with_auth=url_for_with_auth,
praise_id=get_praise_id_if_exists(sesh, table),
reps=reps,
)


Expand Down Expand Up @@ -818,42 +808,37 @@ def post(self, username: str, table_name: str) -> Response:


@bp.get("/<username:username>/<table_name:table_name>/details")
def table_details(username: str, table_name: str) -> str:
def table_details(username: str, table_name: str) -> Response:
sesh = get_sesh()
table = svc.get_table(sesh, username, table_name)
ensure_table_access(sesh, table, "read")
reps = get_table_reps(sesh, table)

return render_template(
return render_table_page(
sesh,
"table_details.html",
table,
username=username,
page_title=f"Schema & Details: {username}/{table_name}",
DataLicence=DataLicence,
table=table,
praise_id=get_praise_id_if_exists(sesh, table),
reps=reps,
)


@bp.get("/<username:username>/<table_name:table_name>/settings")
def table_settings(username: str, table_name: str) -> str:
def table_settings(username: str, table_name: str) -> Response:
sesh = get_sesh()
table = svc.get_table(sesh, username, table_name)
ensure_table_access(sesh, table, "write")

table_readme_markdown = svc.get_readme_markdown(sesh, table.table_uuid)

reps = get_table_reps(sesh, table)

return render_template(
return render_table_page(
sesh,
"table_settings.html",
table,
username=username,
page_title=f"Settings: {username}/{table_name}",
table_readme=table_readme_markdown or "",
DataLicence=DataLicence,
table=table,
praise_id=get_praise_id_if_exists(sesh, table),
reps=reps,
)


Expand Down Expand Up @@ -1720,17 +1705,6 @@ def get_user_str_buf() -> codecs.StreamReader:
return str_buf


@dataclasses.dataclass
class TableRepresentation:
    """Metadata describing one concrete representation of a table.

    A "representation" is the table expressed in a particular content type
    (eg: csv).
    """

    # The content type this representation is expressed in.
    content_type: ContentType
    # NOTE(review): presumably whether this representation is made available
    # for the table - confirm against get_table_reps.
    offered: bool
    # Size of the representation. Units are not stated here (presumably
    # bytes - TODO confirm).
    size: int
    # True when `size` is an estimate rather than an exact figure.
    size_is_estimate: bool

def get_table_reps(sesh: Session, table: Table) -> List[TableRepresentation]:
supported_content_types = [
ContentType.CSV,
Expand Down Expand Up @@ -1765,3 +1739,24 @@ def get_table_reps(sesh: Session, table: Table) -> List[TableRepresentation]:
)
)
return rv


def render_table_page(
    sesh: Session,
    template_name: str,
    table: Table,
    **template_kwargs,
) -> Response:
    """Render a table page, always supplying the table's list of reps.

    Type-safe helper that ensures ``reps`` is passed whenever we render a
    table page.  The praise id and the reps are looked up here and handed to
    the template together with ``table``, so callers only pass the
    page-specific context via ``template_kwargs``.

    Because ``reps`` and ``praise_id`` are supplied here as explicit keyword
    arguments, passing either of them in ``template_kwargs`` raises a
    ``TypeError`` (duplicate keyword argument).

    :param sesh: database session used for the praise and reps lookups.
    :param template_name: name of the template to render.
    :param table: the table the page is about.
    :param template_kwargs: extra context forwarded to the template as-is.
    :return: the rendered template wrapped with ``make_response``.
    """
    # Same lookup order as before: praise id first, then the reps.
    table_praise_id = get_praise_id_if_exists(sesh, table)
    table_reps = get_table_reps(sesh, table)

    rendered_html = render_template(
        template_name,
        table=table,
        reps=table_reps,
        praise_id=table_praise_id,
        **template_kwargs,
    )
    return make_response(rendered_html)
29 changes: 13 additions & 16 deletions csvbase/web/schemaorg.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from typing import Mapping, Any
from typing import Dict, Any, Collection

from flask import url_for

from csvbase.value_objs import Table, ContentType
from csvbase.value_objs import Table, TableRepresentation


def to_dataset(table: Table) -> Mapping[str, Any]:
def to_dataset(table: Table, reps: Collection[TableRepresentation]) -> Dict[str, Any]:
"""Produce a schema.org Dataset object from a Table."""
obj = {
"@context": "https://schema.org",
Expand All @@ -17,7 +17,7 @@ def to_dataset(table: Table) -> Mapping[str, Any]:
table_name=table.table_name,
_external=True,
),
"isAccessibleForFree": True,
"isAccessibleForFree": table.is_public,
"distribution": [],
"dateCreated": table.created.isoformat(),
"dateModified": table.last_changed.isoformat(),
Expand All @@ -29,35 +29,32 @@ def to_dataset(table: Table) -> Mapping[str, Any]:

# Mark up all the reps we hold
distribution = []
content_types = [ContentType.CSV, ContentType.PARQUET, ContentType.JSON_LINES]
if not table.row_count.is_big():
content_types.append(ContentType.XLSX)
for content_type in content_types:
distribution.append(to_datadownload(table, content_type))
for rep in reps:
distribution.append(to_datadownload(table, rep))
obj["distribution"] = distribution

return obj


def to_datadownload(table: Table, content_type: ContentType) -> Mapping[str, str]:
def to_datadownload(table: Table, rep: TableRepresentation) -> Dict[str, str]:
"""Produce a schema.org DataDownload object from a table + content type."""
# potential improvements:
# contentSize (needs the rep)
obj = {
"@type": "DataDownload",
"contentUrl": url_for(
"csvbase.table_view_with_extension",
username=table.username,
table_name=table.table_name,
extension=content_type.file_extension(),
extension=rep.content_type.file_extension(),
_external=True,
),
"encodingFormat": content_type.value,
"encodingFormat": rep.content_type.value,
}
if not rep.size_is_estimate:
obj["contentSize"] = str(rep.size)
return obj


def make_organisation() -> Mapping[str, str]:
def make_organisation() -> Dict[str, str]:
"""Produce the schema.org Publisher object for this csvbase instance."""
return {
"@type": "Organization",
Expand All @@ -67,7 +64,7 @@ def make_organisation() -> Mapping[str, str]:
}


def to_person(username: str) -> Mapping[str, str]:
def to_person(username: str) -> Dict[str, str]:
"""Produce the schema.org Person object for this username."""
# This is quite basic
return {
Expand Down
2 changes: 1 addition & 1 deletion csvbase/web/templates/table.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
{% endif %}

<script type="application/ld+json">
{{ schemaorg.to_dataset(table)|ppjson|safe }}
{{ schemaorg.to_dataset(table, reps)|ppjson|safe }}
</script>
{% endblock %}

Expand Down
17 changes: 13 additions & 4 deletions tests/test_seo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from datetime import datetime

from csvbase.web import schemaorg
from csvbase.web.main.bp import get_table_reps
from .utils import test_data_path


Expand Down Expand Up @@ -32,7 +33,7 @@ def test_sitemap(client):
assert first_line == b"<?xml version='1.0' encoding='UTF-8'?>"


def test_schemaorg_dataset(ten_rows):
def test_schemaorg_dataset(sesh, ten_rows):
expected = {
"@context": "https://schema.org",
"@type": "Dataset",
Expand Down Expand Up @@ -74,11 +75,19 @@ def test_schemaorg_dataset(ten_rows):
"url": f"http://localhost/{ten_rows.username}",
},
}
actual = schemaorg.to_dataset(ten_rows)
reps = get_table_reps(sesh, ten_rows)
actual = schemaorg.to_dataset(ten_rows, reps)

def key(d):
return d["encodingFormat"]

assert sorted(actual.pop("distribution"), key=key) == sorted(
expected.pop("distribution"), key=key
)

# do the dates this way
assert datetime.fromisoformat(actual.pop("dateCreated")) == ten_rows.created # type: ignore
assert datetime.fromisoformat(actual.pop("dateModified")) == ten_rows.last_changed # type: ignore
assert datetime.fromisoformat(actual.pop("dateCreated")) == ten_rows.created
assert datetime.fromisoformat(actual.pop("dateModified")) == ten_rows.last_changed

# the rest must match:
assert expected == actual

0 comments on commit 5392d9b

Please sign in to comment.