Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Glob #392

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
20 changes: 20 additions & 0 deletions .guix-run
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#! /bin/sh
#
# This script sets up a Guix container with the packages listed below.
# Guix must be installed and `guix` must be in the PATH (for example
# after installing Guix on Debian).
#
# Note that pyshex etc. are part of the guix-bioinformatics channel at
#
#   https://git.genenetwork.org/guix-bioinformatics/guix-bioinformatics
#
# which this script expects to find in $HOME/guix-bioinformatics/.

# "$HOME" is used instead of `~` on purpose: a POSIX sh only performs
# tilde expansion on real variable assignments, not on an argument handed
# to env(1), so `env VAR=~/dir` would pass the literal string `~/dir`.
env GUIX_PACKAGE_PATH="$HOME/guix-bioinformatics/" \
    guix environment -C guix --ad-hoc \
    git python python-pyyaml python-pycurl python-magic nss-certs \
    python-pyshex --network openssl python-schema-salad python-pytest \
    which less vim python-toml python-iniconfig python-tox python-mypy \
    python-pylint

# Once in the shell you can do
# --- run tests (takes 1 minute, skips lint and mypy)
#   tox
# --- install and run
#   python3 setup.py install --user
#   ~/.local/bin/schema-salad-tool --help
# --- Example
#   ~/.local/bin/schema-salad-tool --debug --print-rdf schema_salad/tests/data/pubseq/pubseq-schema.yml schema_salad/tests/data/pubseq/MW084447.1.json schema_salad/tests/data/pubseq/MW343767.1.json

32 changes: 23 additions & 9 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,21 @@
Schema Salad
------------

Salad is a schema language for describing JSON or YAML structured
linked data documents. Salad schema describes rules for
preprocessing, structural validation, and hyperlink checking for
documents described by a Salad schema. Salad supports rich data
modeling with inheritance, template specialization, object
identifiers, object references, documentation generation, code
generation, and transformation to RDF_. Salad provides a bridge
between document and record oriented data modeling and the Semantic
Web.
Schema Salad is a schema language for YAML (or JSON) that also lets
you map your YAML data structures into RDF structured linked data
documents via JSON-LD. In other words, a schema validates and
transforms YAML or JSON documents into structured linked data
documents - the missing link between a NoSQL JSON document and a
linked data document that can be reasoned about, e.g. for human and
machine processing.

Salad schema describes rules for preprocessing, structural validation,
and hyperlink checking for documents described by a Salad
schema. Salad supports rich data modeling with inheritance, template
specialization, object identifiers, object references, documentation
generation, code generation, and transformation to RDF_. Salad
provides a bridge between document and record oriented data modeling
and the Semantic Web.

The Schema Salad library is Python 3.6+ only.

Expand Down Expand Up @@ -63,6 +69,14 @@ Validate a document using a schema::

$ schema-salad-tool myschema.yml mydocument.yml

Validate a JSON document using a schema::

$ schema-salad-tool myschema.yml mydocument.json

Multiple documents can be given, and quoted glob patterns are expanded by the tool itself::

$ schema-salad-tool myschema.yml 'my*.yml'

Generate HTML documentation::

$ schema-salad-tool myschema.yml > myschema.html
Expand Down
61 changes: 38 additions & 23 deletions schema_salad/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Command line interface to schema-salad."""

import argparse
import glob
import logging
import os
import sys
Expand Down Expand Up @@ -200,7 +201,7 @@ def main(argsl: Optional[List[str]] = None) -> int:
)

parser.add_argument("schema", type=str, nargs="?", default=None)
parser.add_argument("document", type=str, nargs="?", default=None)
parser.add_argument("documents", nargs="*", default=None)
parser.add_argument(
"--version", "-v", action="store_true", help="Print version", default=None
)
Expand Down Expand Up @@ -263,12 +264,15 @@ def main(argsl: Optional[List[str]] = None) -> int:
makedoc(args)
return 0

# Use globbing to expand the list of documents - and flatten again
args.documents = [item for sublist in map(lambda fn: glob.glob(fn), args.documents) for item in sublist]

# Optionally print the schema after ref resolution
if not args.document and args.print_pre:
if not args.documents and args.print_pre:
json_dump(schema_doc, fp=sys.stdout, indent=4)
return 0

if not args.document and args.print_index:
if not args.documents and args.print_index:
json_dump(list(metaschema_loader.idx.keys()), fp=sys.stdout, indent=4)
return 0

Expand Down Expand Up @@ -344,7 +348,7 @@ def main(argsl: Optional[List[str]] = None) -> int:
rdfs.serialize(destination=stdout(), format=args.rdf_serializer)
return 0

if args.print_metadata and not args.document:
if args.print_metadata and not args.documents:
json_dump(schema_metadata, fp=sys.stdout, indent=4)
return 0

Expand All @@ -357,25 +361,36 @@ def main(argsl: Optional[List[str]] = None) -> int:
return 0

# If no document specified, all done.
if not args.document:
if not args.documents:
print(f"Schema `{args.schema}` is valid")
return 0

# Load target document and resolve refs
try:
uri = args.document
document, doc_metadata = document_loader.resolve_ref(
uri, strict_foreign_properties=args.strict_foreign_properties
)
except ValidationException as e:
msg = to_one_line_messages(e) if args.print_oneline else str(e)
_logger.error(
"Document `%s` failed validation:\n%s",
args.document,
msg,
exc_info=args.debug,
)
return 1
# Load target document and resolve refs. Note that this can now
# take multiple document files. doc_metadata only returns the
# metadata for the last document as they should be the same
document = []
ids = {} # check for duplicate use of document id as it creates
# unpredictable output
for uri in args.documents:
try:
document1, doc_metadata = document_loader.resolve_ref(
uri, strict_foreign_properties=args.strict_foreign_properties
)
if "id" in document1:
doc_id = document1["id"]
if doc_id in ids:
raise Exception(f"Document id {doc_id} is duplicated in {uri}!")
ids[doc_id] = True
document.append(document1)
except ValidationException as e:
msg = to_one_line_messages(e) if args.print_oneline else str(e)
_logger.error(
"Document `%s` failed validation:\n%s",
document,
msg,
exc_info=args.debug,
)
return 1

# Optionally print the document after ref resolution
if args.print_pre:
Expand All @@ -397,13 +412,13 @@ def main(argsl: Optional[List[str]] = None) -> int:
)
except ValidationException as e:
msg2 = to_one_line_messages(e) if args.print_oneline else str(e)
_logger.error(f"While validating document `{args.document}`:\n{msg2}")
_logger.error(f"While validating document `{args.documents}`:\n{msg2}")
return 1

# Optionally convert the document to RDF
if args.print_rdf:
if isinstance(document, (Mapping, MutableSequence)):
printrdf(args.document, document, schema_ctx, args.rdf_serializer)
printrdf(args.documents, document, schema_ctx, args.rdf_serializer)
return 0
else:
print("Document must be a dictionary or list.")
Expand All @@ -413,7 +428,7 @@ def main(argsl: Optional[List[str]] = None) -> int:
json_dump(doc_metadata, fp=sys.stdout, indent=4)
return 0

print(f"Document `{args.document}` is valid")
print(f"Document `{args.documents}` is valid")

return 0

Expand Down
46 changes: 46 additions & 0 deletions schema_salad/tests/data/pubseq/MW084447.1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"id": "placeholder",
"update_date": "2020-11-12",
"host": {
"host_species": "http://purl.obolibrary.org/obo/NCBITaxon_9606"
},
"sample": {
"sample_id": "MW084447.1",
"database": "https://www.ncbi.nlm.nih.gov/genbank/",
"source_database_accession": [
"http://identifiers.org/insdc/MW084447.1#sequence"
],
"collection_date": "2020-04-14",
"original_collection_location": "USA",
"collection_location": "http://www.wikidata.org/entity/Q23337",
"country": "USA",
"place": "Salt Lake City"
},
"virus": {
"virus_strain": "SARS-CoV-2/human/USA/UT-02140/2020",
"virus_species": "http://purl.obolibrary.org/obo/NCBITaxon_2697049"
},
"technology": {
"alignment_protocol": "bwa v. 0.7.17-r1188",
"sample_sequencing_technology": [
"http://purl.obolibrary.org/obo/OBI_0000759"
],
"assembly_method": "http://purl.obolibrary.org/obo/GENEPIO_0002028"
},
"submitter": {
"authors": [
"Young,E.L.",
"Oakeson,K.",
"Sangster,A.",
"Hirschi,B.",
"Butz,H."
],
"submitter_name": [
" Utah Public Health Laboratory"
],
"submitter_address": "Utah Public Health Laboratory Infectious Disease submission group, 4431 S 2700 W, Salt Lake City, UT 84129, USA"
},
"warnings": [
"Missing specimen_source"
]
}
53 changes: 53 additions & 0 deletions schema_salad/tests/data/pubseq/MW343767.1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"id": "placeholder1",
"update_date": "2020-12-08",
"host": {
"host_species": "http://purl.obolibrary.org/obo/NCBITaxon_9606"
},
"sample": {
"sample_id": "MW343767.1",
"database": "https://www.ncbi.nlm.nih.gov/genbank/",
"source_database_accession": [
"http://identifiers.org/insdc/MW343767.1#sequence"
],
"collection_date": "2020-10-27",
"specimen_source": [
"http://purl.obolibrary.org/obo/NCIT_C155831"
],
"original_collection_location": "USA",
"collection_location": "http://www.wikidata.org/entity/Q23556",
"country": "USA",
"place": "Atlanta"
},
"virus": {
"virus_strain": "SARS-CoV-2/human/USA/GA-CDC-7701/2020",
"virus_species": "http://purl.obolibrary.org/obo/NCBITaxon_2697049"
},
"technology": {
"alignment_protocol": "freebayes v. 1.3 ",
"sample_sequencing_technology": [
"http://purl.obolibrary.org/obo/OBI_0000759"
],
"assembly_method": "http://purl.obolibrary.org/obo/GENEPIO_0002028"
},
"submitter": {
"authors": [
"Li,Y.",
"Tao,Y.",
"Zhang,J.",
"Queen,K.",
"Uehara,A.",
"Cook,P.",
"Paden,C.R.",
"Wang,H.",
"Tong,S."
],
"submitter_name": [
" Respiratory Viruses Branch"
],
"submitter_address": "Centers for Disease Control and Prevention, 1600 Clifton Rd, Atlanta, GA 30329, USA"
},
"warnings": [

]
}
Loading