Skip to content

Commit 3895762

Browse files
Fix #922: validate the contract against the custom schema before converting (#923)
* fix: validate the contract against the custom schema before converting
* Adding test cases

---------

Signed-off-by: Yannick Libert <[email protected]>
Co-authored-by: jochen <[email protected]>
1 parent 53e71ea commit 3895762

File tree

7 files changed

+277
-22
lines changed

7 files changed

+277
-22
lines changed

CHANGELOG.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
### Added
1111

12-
- Support for nested arrays in odcs v3 importer
12+
- import: Support for nested arrays in odcs v3 importer
13+
- lint: ODCS schema is now checked before converting
1314

1415
### Fixed
1516

16-
- Excel exporter now exports critical data element
17+
- export: Excel exporter now exports critical data element
1718

1819

1920
## [0.10.36] - 2025-10-17

datacontract/lint/resolve.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import importlib.resources as resources
12
import logging
23
import os
34
import warnings
5+
from pathlib import Path
46

57
import fastjsonschema
68
import yaml
@@ -298,6 +300,14 @@ def _resolve_data_contract_from_str(
298300
) -> DataContractSpecification:
299301
yaml_dict = _to_yaml(data_contract_str)
300302

303+
if schema_location is None:
304+
if is_open_data_contract_standard(yaml_dict):
305+
logging.info("Using ODCS 3.0.2 schema to validate data contract")
306+
# TODO refactor this to a specific function
307+
schema_location = resources.files("datacontract").joinpath("schemas", "odcs-3.0.2.schema.json")
308+
309+
_validate_json_schema(yaml_dict, schema_location)
310+
301311
if is_open_data_contract_standard(yaml_dict):
302312
logging.info("Importing ODCS v3")
303313
# if ODCS, then validate the ODCS schema and import to DataContractSpecification directly
@@ -311,7 +321,7 @@ def _resolve_data_contract_from_str(
311321

312322

313323
def _resolve_dcs_from_yaml_dict(inline_definitions, inline_quality, schema_location, yaml_dict):
314-
_validate_data_contract_specification_schema(yaml_dict, schema_location)
324+
_validate_json_schema(yaml_dict, schema_location)
315325
data_contract_specification = yaml_dict
316326
spec = DataContractSpecification(**data_contract_specification)
317327
if inline_definitions:
@@ -349,16 +359,16 @@ def _to_yaml(data_contract_str) -> dict:
349359
)
350360

351361

352-
def _validate_data_contract_specification_schema(data_contract_yaml, schema_location: str = None):
362+
def _validate_json_schema(yaml_str, schema_location: str | Path = None):
353363
schema = fetch_schema(schema_location)
354364
try:
355-
fastjsonschema.validate(schema, data_contract_yaml, use_default=False)
365+
fastjsonschema.validate(schema, yaml_str, use_default=False)
356366
logging.debug("YAML data is valid.")
357367
except JsonSchemaValueException as e:
358368
logging.warning(f"Data Contract YAML is invalid. Validation error: {e.message}")
359369
raise DataContractException(
360370
type="lint",
361-
result="failed",
371+
result=ResultEnum.failed,
362372
name="Check that data contract YAML is valid",
363373
reason=e.message,
364374
engine="datacontract",
@@ -367,7 +377,7 @@ def _validate_data_contract_specification_schema(data_contract_yaml, schema_loca
367377
logging.warning(f"Data Contract YAML is invalid. Validation error: {str(e)}")
368378
raise DataContractException(
369379
type="lint",
370-
result="failed",
380+
result=ResultEnum.failed,
371381
name="Check that data contract YAML is valid",
372382
reason=str(e),
373383
engine="datacontract",

datacontract/lint/schema.py

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@
22
import json
33
import logging
44
import os
5+
from pathlib import Path
56
from typing import Any, Dict
67

78
import requests
89

910
from datacontract.model.exceptions import DataContractException
11+
from datacontract.model.run import ResultEnum
1012

1113
DEFAULT_DATA_CONTRACT_SCHEMA = "datacontract-1.2.1.schema.json"
1214

1315

14-
def fetch_schema(location: str = None) -> Dict[str, Any]:
16+
def fetch_schema(location: str | Path = None) -> Dict[str, Any]:
1517
"""
1618
Fetch and return a JSON schema from a given location.
1719
@@ -36,19 +38,23 @@ def fetch_schema(location: str = None) -> Dict[str, Any]:
3638
schema_file = schemas.joinpath("schemas", DEFAULT_DATA_CONTRACT_SCHEMA)
3739
with schema_file.open("r") as file:
3840
schema = json.load(file)
39-
elif location.startswith("http://") or location.startswith("https://"):
40-
response = requests.get(location)
41-
schema = response.json()
4241
else:
43-
if not os.path.exists(location):
44-
raise DataContractException(
45-
type="lint",
46-
name=f"Reading schema from {location}",
47-
reason=f"The file '{location}' does not exist.",
48-
engine="datacontract",
49-
result="error",
50-
)
51-
with open(location, "r") as file:
52-
schema = json.load(file)
42+
# Convert Path objects to strings for string operations
43+
location_str = str(location)
44+
45+
if location_str.startswith("http://") or location_str.startswith("https://"):
46+
response = requests.get(location_str)
47+
schema = response.json()
48+
else:
49+
if not os.path.exists(location):
50+
raise DataContractException(
51+
type="lint",
52+
name=f"Reading schema from {location}",
53+
reason=f"The file '{location}' does not exist.",
54+
engine="datacontract",
55+
result=ResultEnum.error,
56+
)
57+
with open(location, "r") as file:
58+
schema = json.load(file)
5359

5460
return schema

tests/fixtures/lint/invalid.odcs.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
apiVersion: "v3.0.2"
2+
kind: "DataContract"
3+
id: "valid_odcs"
4+
name: "This is invalid ODCS, because the status is missing"
5+
version: "1.0.0"
6+
#status: "draft"

tests/fixtures/lint/valid.odcs.yaml

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
apiVersion: "v3.0.2"
2+
kind: "DataContract"
3+
id: "valid_odcs"
4+
name: "Valid ODCS data contract"
5+
version: "2.0.0"
6+
status: "draft"
7+
tenant: "company-A"
8+
tags:
9+
- "datalocation:EU"
10+
description:
11+
purpose: "This data can be used for analytical purposes"
12+
usage: "Use this to analyze shipments"
13+
limitations: "Not suitable for real-time use cases"
14+
authoritativeDefinitions:
15+
- type: "Data Guidelines"
16+
url: "https://example.com/data-guidelines.html"
17+
customProperties:
18+
- property: "github_link"
19+
value: "https://github.example.com/shipment-specification.yaml"
20+
customProperties:
21+
- property: "additionalField"
22+
value: "some value in a new major contract"
23+
- property: "owner"
24+
value: "fulfillment"
25+
schema:
26+
- name: "shipments2"
27+
physicalType: "table"
28+
physicalName: "shipments_v2"
29+
businessName: "Shipments"
30+
description: "This table contains shipment data, including details about shipment IDs, associated orders, delivery dates, carriers, tracking numbers, statuses, and additional shipment information in JSON format."
31+
dataGranularityDescription: "Not Aggregated"
32+
tags:
33+
- "pii"
34+
quality:
35+
- type: "sql"
36+
description: "Table shall contain at least 1 row"
37+
query: "SELECT COUNT(*) FROM shipments"
38+
thresholdType: "mustBeGreaterThanOrEqualTo"
39+
mustBeGreaterThanOrEqualTo: 1
40+
properties:
41+
- name: "shipment_id"
42+
businessName: "Shipment ID"
43+
physicalName: "sid"
44+
logicalType: "string"
45+
description: "Unique identifier for each shipment."
46+
required: false
47+
unique: false
48+
physicalType: "uuid"
49+
primaryKey: true
50+
partitioned: false
51+
classification: "internal"
52+
examples:
53+
- "123e4567-e89b-12d3-a456-426614174000"
54+
criticalDataElement: false
55+
tags:
56+
- "businesskey"
57+
authoritativeDefinitions:
58+
- type: "definition"
59+
url: "https://datamesh-manager-demo.azurecontainerapps.io/demo440238121320/definitions/fulfillment/shipment_id"
60+
- name: "order_id"
61+
physicalName: "oid"
62+
physicalType: "text"
63+
primaryKey: false
64+
partitioned: false
65+
authoritativeDefinitions:
66+
- type: "definition"
67+
url: "https://datamesh-manager-demo.azurecontainerapps.io/demo440238121320/definitions/sales/order_id"
68+
- name: "delivery_date"
69+
businessName: "Delivery Date"
70+
logicalType: "date"
71+
description: "The actual or expected delivery date of the shipment."
72+
required: false
73+
unique: false
74+
physicalType: "timestamp_tz"
75+
primaryKey: false
76+
partitioned: false
77+
classification: "internal"
78+
examples:
79+
- "2023-10-01T10:00:00Z"
80+
criticalDataElement: false
81+
- name: "carrier"
82+
businessName: "Carrier"
83+
logicalType: "string"
84+
description: "The shipping carrier used for the delivery."
85+
required: false
86+
unique: false
87+
physicalType: "text"
88+
primaryKey: false
89+
partitioned: false
90+
classification: "internal"
91+
examples:
92+
- "FedEx"
93+
- "UPS"
94+
criticalDataElement: false
95+
- name: "tracking_number"
96+
businessName: "Tracking Number"
97+
logicalType: "string"
98+
description: "Tracking number provided by the carrier."
99+
required: false
100+
unique: false
101+
physicalType: "text"
102+
primaryKey: false
103+
partitioned: false
104+
classification: "restricted"
105+
examples:
106+
- "1Z999AA10123456784"
107+
criticalDataElement: false
108+
customProperties:
109+
- property: "external"
110+
value: "true"
111+
- name: "status"
112+
businessName: "Status"
113+
logicalType: "string"
114+
description: "Current status of the shipment."
115+
required: false
116+
unique: false
117+
physicalType: "text"
118+
primaryKey: false
119+
partitioned: false
120+
classification: "internal"
121+
examples:
122+
- "Delivered"
123+
- "In Transit"
124+
criticalDataElement: false
125+
- name: "inline_object_definition"
126+
businessName: "Inline Object Definition"
127+
logicalType: "object"
128+
description: "A JSON representation of additional shipment info"
129+
required: false
130+
unique: false
131+
physicalType: "json"
132+
primaryKey: false
133+
partitioned: false
134+
partitionKeyPosition: -1
135+
classification: "internal"
136+
examples:
137+
- "{\"destination\": \"New York\"}"
138+
criticalDataElement: false
139+
quality:
140+
- type: "text"
141+
description: "{field} must contain the field \"destination\""
142+
- name: "address"
143+
businessName: "Shipment Address"
144+
logicalType: "object"
145+
description: "Shipping address details."
146+
required: true
147+
physicalType: "JSON"
148+
classification: "restricted"
149+
properties:
150+
- name: "street"
151+
businessName: "Street"
152+
logicalType: "string"
153+
description: "Street address."
154+
required: true
155+
unique: false
156+
physicalType: "text"
157+
primaryKey: false
158+
partitioned: false
159+
classification: "restricted"
160+
examples:
161+
- "Marienplatz 1"
162+
- name: "city"
163+
businessName: "City"
164+
logicalType: "string"
165+
description: "City of the shipping address."
166+
required: true
167+
unique: false
168+
physicalType: "text"
169+
primaryKey: false
170+
partitioned: false
171+
classification: "restricted"
172+
examples:
173+
- "Munich"
174+
- name: "country"
175+
businessName: "Country"
176+
logicalType: "string"
177+
description: "Country of the shipping address."
178+
required: true
179+
physicalType: "text"
180+
primaryKey: false
181+
partitioned: false
182+
classification: "restricted"
183+
examples:
184+
- "DE"
185+
servers:
186+
- server: "production"
187+
type: "bigquery"
188+
environment: "production"
189+
dataset: "shipments_v1"
190+
project: "acme_shipments_prod"
191+
support:
192+
- channel: "slackname"
193+
url: "http://find.me.here"
194+
tool: "slack"
195+
scope: "interactive"
196+
authoritativeDefinitions:
197+
- type: "Guidelines"
198+
url: "https://example.com/guidelines"
199+
price:
200+
priceAmount: 1
201+
priceCurrency: "EUR"
202+
priceUnit: "Per 1000 requests"
203+
team:
204+
- username: "vimportant"
205+
role: "administrator"
206+
dateIn: "2020-01-01"
207+
- username: "nimportant"
208+
role: "reader"
209+
dateIn: "2020-01-01"
210+
dateOut: "2024-10-10"
211+
slaProperties:
212+
- property: "availability"
213+
value: "95%"
214+
unit: "%"
215+
driver: "operational"

tests/fixtures/postgres/odcs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ id: postgres
44
name: postgres
55
version: 0.0.1
66
domain: my-domain-team
7-
status: null
7+
status: active
88
schema:
99
- name: my_table
1010
physicalName: my_table

tests/test_lint.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,23 @@ def test_lint_custom_schema():
5454

5555
assert run.result == "passed"
5656

57+
def test_lint_valid_odcs_schema():
58+
data_contract_file = "fixtures/lint/valid.odcs.yaml"
59+
data_contract = DataContract(data_contract_file=data_contract_file)
60+
61+
run = data_contract.lint()
62+
63+
assert run.result == "passed"
64+
65+
66+
def test_lint_invalid_odcs_schema():
67+
data_contract_file = "fixtures/lint/invalid.odcs.yaml"
68+
data_contract = DataContract(data_contract_file=data_contract_file)
69+
70+
run = data_contract.lint()
71+
72+
assert run.result == "failed"
73+
5774

5875
def test_lint_with_ref():
5976
data_contract = DataContract(

0 commit comments

Comments
 (0)