diff --git a/README.md b/README.md index b0e26be..e30e907 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ Constraints: * `CreationDate: [Optional[String]]` when this output port has been created. * `StartDate: [Optional[String]]` the first business date present in the dataset, leave it empty for events or we can use some standard semantic like: "-7D, -1Y". * `ProcessDescription: [Option[String]]` what is the underlying process that contributes to generate the data exposed by this output port. -* `DataContract: [Yaml]`: In case something is going to change in this section, it represents a breaking change because the producer is breaking the contract, this will require to create a new version of the data product to keep backward compatibility +* `DataContract: [Yaml]`: In case something is going to change in this section, it represents a breaking change because the producer is breaking the contract, this will require to create a new version of the data product to keep backward compatibility. * `Schema: [Array[Yaml]]` when it comes to describe a schema we propose to leverage OpenMetadata specification: Ref https://docs.open-metadata.org/metadata-standard/schemas/entities/table#column. Each column can have a tag array and you can choose between simples LabelTags, ClassificationTags or DescriptiveTags. Here an example of classification Tag https://github.com/open-metadata/OpenMetadata/blob/main/catalog-rest-service/src/main/resources/json/data/tags/piiTags.json. * `SLA: [Yaml]` Service Level Agreement, describe the quality of data delivery and the output port in general. It represents the producer's overall promise to the consumers. * `IntervalOfChange: [Option[String]]` how often changes in the data are reflected. @@ -92,7 +92,10 @@ Constraints: * `Confidentiality: [Option[String]]` Describe what a consumer should do to keep the information confidential, how to process and store it. Permission to share or report it. 
* `Tags: [Array[Yaml]]` Tag labels at OutputPort level, here we can have security classification for example (please refer to OpenMetadata https://docs.open-metadata.org/metadata-standard/schemas/types/taglabel). * `SampleData: [Option[Yaml]]` provides a sample data of your Output Port (please refer to OpenMetadata specification: https://docs.open-metadata.org/metadata-standard/schemas/entities/table#tabledata). -* `SemanticLinking: [Option[Yaml]]` here we can express semantic relationships between this output port and other outputports (also coming from other domains and data products). For example we could say that column "customerId" of our SQL Output Port references the column "id" of the SQL Output Port of the "Customer" Data Product. +* `SemanticLinking: [Array[Yaml]]` here we can express semantic relationships between this output port and other output ports (also coming from other domains and data products). For example we could say that column "customerId" of our SQL Output Port references the column "id" of the SQL Output Port of the "Customer" Data Product. This array can contain multiple references for the same field (e.g. the field custom_id can be joined with external_id of output port A and with reference_id of output port B). + * `FieldName: [String]` name of the output port field that can be joined with a remote output port field. + * `ReferenceOutputPort: [String]` unique ID of the output port referenced by this output port field. + * `RefrenceFieldName: [String]` name of the field of the referenced output port that can be joined with the current one. * `Specific: [Yaml]` this is a custom section where we must put all the information strictly related to a specific technology or dependent from a standard/policy defined in the federated governance. 
diff --git a/data-product-specification.cue b/data-product-specification.cue index 2d745d3..ef45f6f 100644 --- a/data-product-specification.cue +++ b/data-product-specification.cue @@ -4,13 +4,15 @@ package generic_dp import "strings" -#Version: string & =~"^[0-9]+\\.[0-9]+\\..+$" -#Id: string & =~"^[a-zA-Z0-9:._-]+$" -#DataProductId: #Id & =~"^urn:dmb:dp:\(domain):[a-zA-Z0-9_-]+:\(majorVersion)$" -#ComponentId: #Id & =~"^urn:dmb:cmp:\(domain):[a-zA-Z0-9_-]+:\(majorVersion):[a-zA-Z0-9_-]+$" -#URL: string & =~"^https?://[a-zA-Z0-9@:%._~#=&/?]*$" -#OM_DataType: string & =~"(?i)^(NUMBER|TINYINT|SMALLINT|INT|BIGINT|BYTEINT|BYTES|FLOAT|DOUBLE|DECIMAL|NUMERIC|TIMESTAMP|TIME|DATE|DATETIME|INTERVAL|STRING|MEDIUMTEXT|TEXT|CHAR|VARCHAR|BOOLEAN|BINARY|VARBINARY|ARRAY|BLOB|LONGBLOB|MEDIUMBLOB|MAP|STRUCT|UNION|SET|GEOGRAPHY|ENUM|JSON)$" -#OM_Constraint: string & =~"(?i)^(NULL|NOT_NULL|UNIQUE|PRIMARY_KEY)$" +#Version: string & =~"^[0-9]+\\.[0-9]+\\..+$" +#Id: string & =~"^[a-zA-Z0-9:._-]+$" +#DataProductId: #Id & =~"^urn:dmb:dp:\(domain):[a-zA-Z0-9_-]+:\(majorVersion)$" +#ComponentId: #Id & =~"^urn:dmb:cmp:\(domain):[a-zA-Z0-9_-]+:\(majorVersion):[a-zA-Z0-9_-]+$" +#ExternalComponentId: #Id & =~"^urn:dmb:cmp:[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+:[0-9]+:[a-zA-Z0-9_-]+$" +#ExternalResourceId: #Id & =~"^urn:dmb:ex:[a-zA-Z0-9_-]+$" +#URL: string & =~"^https?://[a-zA-Z0-9@:%._~#=&/?]*$" +#OM_DataType: string & =~"(?i)^(NUMBER|TINYINT|SMALLINT|INT|BIGINT|BYTEINT|BYTES|FLOAT|DOUBLE|DECIMAL|NUMERIC|TIMESTAMP|TIME|DATE|DATETIME|INTERVAL|STRING|MEDIUMTEXT|TEXT|CHAR|VARCHAR|BOOLEAN|BINARY|VARBINARY|ARRAY|BLOB|LONGBLOB|MEDIUMBLOB|MAP|STRUCT|UNION|SET|GEOGRAPHY|ENUM|JSON)$" +#OM_Constraint: string & =~"(?i)^(NULL|NOT_NULL|UNIQUE|PRIMARY_KEY)$" #OM_TableData: { columns: [... string] @@ -49,6 +51,12 @@ import "strings" } } +#SemanticLink: { + fieldName: string + referenceOutputPort: #ExternalComponentId + refrenceFieldName: string +} + #DataContract: { schema: [... 
#OM_Column] SLA: { @@ -91,10 +99,32 @@ import "strings" dataContract: #DataContract dataSharingAgreement: #DataSharingAgreement tags: [... #OM_Tag] - sampleData?: #OM_TableData | null - semanticLinking?: {...} | null + sampleData?: #OM_TableData | null + semanticLinking: [... #SemanticLink] specific: {...} ... + + // check that the sample data columns are defined in the schema, and vice versa + #checkColumns: { + if sampleData != null && sampleData != _|_ { + _schemaColumns: { + for sc in dataContract.schema { + "\(sc.name)": {} + } + } + _sampleColumns: { + for sc in sampleData.columns { + "\(sc)": {} + } + } + for c in sampleData.columns { + _schemaColumns[c] + } + for c in dataContract.schema { + _sampleColumns[c.name] + } + } + } } #Workload: { @@ -111,7 +141,7 @@ import "strings" workloadType?: string | null connectionType?: string & =~"(?i)^(housekeeping|datapipeline)$" | null tags: [... #OM_Tag] - readsFrom: [... string] + readsFrom: [... (#ExternalComponentId | #ExternalResourceId)] specific: {...} | null ... } diff --git a/example.yaml b/example.yaml index 5569095..d6f02ff 100644 --- a/example.yaml +++ b/example.yaml @@ -34,7 +34,11 @@ components: startDate: processDescription: this output port is generated by a Spark Job scheduled every day at 2AM and it lasts for approx 2 hours dataContract: - schema: [] + schema: + - name: name + dataType: string + - name: surname + dataType: string SLA: intervalOfChange: 1 hours timeliness: 1 minutes @@ -58,8 +62,8 @@ components: source: Tag labelType: Manual state: Confirmed - sampleData: {} - semanticLinking: {} + sampleData: null + semanticLinking: [] specific: directory: history bucket: ms-datamesh-s3 @@ -110,7 +114,10 @@ components: - Jura - - Chandra - Nalaar - semanticLinking: {} + semanticLinking: + - fieldName: name + referenceOutputPort: urn:dmb:cmp:other_domain:other_data_product:2:other_event_port + refrenceFieldName: nameReference specific: database: my_database table: my_table