diff --git a/datacontract/imports/bigquery_importer.py b/datacontract/imports/bigquery_importer.py
index e07904c9..6f6024b4 100644
--- a/datacontract/imports/bigquery_importer.py
+++ b/datacontract/imports/bigquery_importer.py
@@ -127,7 +127,7 @@ def convert_bigquery_schema(
     return data_contract_specification
 
 
-def import_table_fields(table_fields):
+def import_table_fields(table_fields, in_array=False):
     imported_fields = {}
     for field in table_fields:
         field_name = field.get("name")
@@ -136,26 +136,51 @@ def import_table_fields(table_fields):
         imported_fields[field_name].description = field.get("description")
 
         if field.get("type") == "RECORD":
-            imported_fields[field_name].type = "object"
-            imported_fields[field_name].fields = import_table_fields(field.get("fields"))
+            if field.get("mode") == "REPEATED":
+                imported_fields[field_name].type = "array"
+                imported_fields[field_name].items = Field(
+                    type="record", fields= import_table_fields(field.get("fields"), in_array=True))
+            elif field.get("mode") == "NULLABLE":
+                imported_fields[field_name].type = "struct"
+                imported_fields[field_name].fields = import_table_fields(field.get("fields"), in_array=True)
+            else:
+                imported_fields[field_name].type = "object"
+                imported_fields[field_name].fields = import_table_fields(field.get("fields"), in_array=True)
+
         elif field.get("type") == "STRUCT":
             imported_fields[field_name].type = "struct"
-            imported_fields[field_name].fields = import_table_fields(field.get("fields"))
+            imported_fields[field_name].fields = import_table_fields(field.get("fields"), in_array=in_array)
         elif field.get("type") == "RANGE":
             # This is a range of date/datetime/timestamp but multiple values
             # So we map it to an array
             imported_fields[field_name].type = "array"
             imported_fields[field_name].items = Field(
-                type=map_type_from_bigquery(field["rangeElementType"].get("type"))
+                type=map_type_from_bigquery(field["rangeElementType"].get("type"), in_array=True)
             )
-        else:  # primitive type
-            imported_fields[field_name].type = map_type_from_bigquery(field.get("type"))
+        elif field.get("type") == "TIME":
+            imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
+            imported_fields[field_name].config = {"bigqueryType": "TIME"}
+        elif field.get("type") == "GEOGRAPHY":
+            imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
+            imported_fields[field_name].config = {"bigqueryType": "GEOGRAPHY"}
+        elif field.get("type") == "JSON":
+            imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
+            imported_fields[field_name].config = {"bigqueryType": "JSON"}
+
+        else:
+            if field.get("mode") == "REPEATED":  # not a type record meaning type ARRAY
+                imported_fields[field_name].type = "array"
+                imported_fields[field_name].items = Field(
+                    type= map_type_from_bigquery(field.get("type"), in_array=True))
+            else:  # primitive type
+                imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
 
         if field.get("type") == "STRING":
             # in bigquery both string and bytes have maxLength but in the datacontracts
             # spec it is only valid for strings
             if field.get("maxLength") is not None:
                 imported_fields[field_name].maxLength = int(field.get("maxLength"))
+                imported_fields[field_name].config = {"bigqueryType": f"STRING({field.get('maxLength')})"}
 
         if field.get("type") == "NUMERIC" or field.get("type") == "BIGNUMERIC":
             if field.get("precision") is not None:
@@ -163,17 +188,19 @@ def import_table_fields(table_fields):
 
             if field.get("scale") is not None:
                 imported_fields[field_name].scale = int(field.get("scale"))
+            if field.get("precision") is not None and field.get("scale") is not None:
+                imported_fields[field_name].config = {"bigqueryType": f"{ field.get('type')}({field.get('precision')}, {field.get('scale')})"}
 
     return imported_fields
 
 
-def map_type_from_bigquery(bigquery_type_str: str):
+def map_type_from_bigquery(bigquery_type_str: str, in_array=False):
     if bigquery_type_str == "STRING":
         return "string"
     elif bigquery_type_str == "BYTES":
         return "bytes"
     elif bigquery_type_str == "INTEGER":
-        return "int"
+        return "bigint" if in_array else "int"
     elif bigquery_type_str == "INT64":
         return "bigint"
     elif bigquery_type_str == "FLOAT":
@@ -187,9 +214,9 @@ def map_type_from_bigquery(bigquery_type_str: str):
     elif bigquery_type_str == "DATE":
         return "date"
     elif bigquery_type_str == "TIME":
-        return "timestamp_ntz"
+        return "object"
     elif bigquery_type_str == "DATETIME":
-        return "timestamp"
+        return "timestamp_ntz"
     elif bigquery_type_str == "NUMERIC":
         return "numeric"
     elif bigquery_type_str == "BIGNUMERIC":
diff --git a/tests/fixtures/bigquery/export/bq_table_schema.json b/tests/fixtures/bigquery/export/bq_table_schema.json
index e4a33567..bcc15834 100644
--- a/tests/fixtures/bigquery/export/bq_table_schema.json
+++ b/tests/fixtures/bigquery/export/bq_table_schema.json
@@ -105,10 +105,10 @@
       "description": "a simple timestamp_tz field"
     },
     {
-      "name": "timestamp_ntz_field",
+      "name": "datetime_field",
       "type": "DATETIME",
       "mode": "NULLABLE",
-      "description": "a simple timestamp_ntz field"
+      "description": "a simple timestamp_ntz field (bq datetime)"
     },
     {
       "name": "date_field",
@@ -116,6 +116,24 @@
       "mode": "NULLABLE",
       "description": "a simple date field"
     },
+    {
+      "name": "time_field",
+      "type": "TIME",
+      "mode": "NULLABLE",
+      "description": "a time field"
+    },
+    {
+      "name": "geography_field",
+      "type": "GEOGRAPHY",
+      "mode": "NULLABLE",
+      "description": "a geography field"
+    },
+    {
+      "name": "json_field",
+      "type": "JSON",
+      "mode": "NULLABLE",
+      "description": "a json field"
+    },
     {
       "name": "number_field",
       "type": "NUMERIC",
@@ -194,7 +212,13 @@
           "type": "DATE",
           "mode": "NULLABLE",
           "description": "a non required date field"
-        }
+        },
+        {
+          "name": "subfield_3",
+          "type": "INT64",
+          "mode": "NULLABLE",
+          "description": "an integer field"
+        }
       ]
     },
     {
@@ -211,7 +235,7 @@
         },
         {
           "name": "subfield_2",
-          "type": "INTEGER",
+          "type": "INT64",
           "mode": "NULLABLE",
           "description": "a non required int field"
         }
@@ -225,7 +249,7 @@
     },
     {
       "name": "int_array_field",
-      "type": "INTEGER",
+      "type": "INT64",
       "mode": "REPEATED",
       "description": "an int array"
     },
diff --git a/tests/fixtures/bigquery/export/datacontract.yaml b/tests/fixtures/bigquery/export/datacontract.yaml
index 871d39c3..5fde9f6f 100644
--- a/tests/fixtures/bigquery/export/datacontract.yaml
+++ b/tests/fixtures/bigquery/export/datacontract.yaml
@@ -79,14 +79,32 @@ models:
         type: timestamp_tz
         required: false
         description: a simple timestamp_tz field
-      timestamp_ntz_field:
+      datetime_field:
         type: timestamp_ntz
         required: false
-        description: a simple timestamp_ntz field
+        description: a simple timestamp_ntz field (bq datetime)
       date_field:
         type: date
         required: false
         description: a simple date field
+      time_field:
+        type: object
+        required: false
+        description: a time field
+        config:
+          bigqueryType: TIME
+      geography_field:
+        type: object
+        required: false
+        description: a geography field
+        config:
+          bigqueryType: GEOGRAPHY
+      json_field:
+        type: object
+        required: false
+        description: a json field
+        config:
+          bigqueryType: JSON
       number_field:
         type: number
         required: false
@@ -141,6 +159,10 @@ models:
             type: date
             required: false
            description: a non required date field
+          subfield_3:
+            type: bigint
+            required: false
+            description: an integer field
       struct_field:
         type: struct
         required: false
@@ -151,7 +173,7 @@ models:
             required: true
             description: a required bytes field
           subfield_2:
-            type: int
+            type: bigint
             required: false
             description: a non required int field
       string_array_field:
@@ -165,7 +187,7 @@ models:
         required: false
         description: an int array
         items:
-          type: int
+          type: bigint
       complex_array_field:
         type: array
         required: false
diff --git a/tests/fixtures/bigquery/import/complete_table_schema.json b/tests/fixtures/bigquery/import/complete_table_schema.json
index 669f176a..c831d574 100644
--- a/tests/fixtures/bigquery/import/complete_table_schema.json
+++ b/tests/fixtures/bigquery/import/complete_table_schema.json
@@ -152,6 +152,39 @@
       "mode": "NULLABLE",
       "name": "JSON_Field",
       "type": "JSON"
+    },
+    {
+      "description": "an array of string",
+      "mode": "REPEATED",
+      "name": "Array_of_string",
+      "type": "STRING"
+    },
+    {
+      "description": "an array of int",
+      "mode": "REPEATED",
+      "name": "Array_of_int",
+      "type": "INTEGER"
+    },
+    {
+      "description": "an array of objects that has multiple fields that should carry through",
+      "fields": [
+        {
+          "name": "Id",
+          "type": "INTEGER",
+          "mode": "NULLABLE",
+          "description": "an id field"
+        },
+        {
+          "name": "Name",
+          "type": "STRING",
+          "mode": "NULLABLE",
+          "description": "a name field"
+
+        }
+      ],
+      "mode": "REPEATED",
+      "name": "Array_of_struct_col",
+      "type": "RECORD"
     }
   ]
 },
diff --git a/tests/fixtures/bigquery/import/datacontract.yaml b/tests/fixtures/bigquery/import/datacontract.yaml
index d921bb41..7c143356 100644
--- a/tests/fixtures/bigquery/import/datacontract.yaml
+++ b/tests/fixtures/bigquery/import/datacontract.yaml
@@ -22,6 +22,8 @@ models:
         required: true
         description: A required String field with a maximum length
         maxLength: 42
+        config:
+          bigqueryType: STRING(42)
       Bytes field:
         type: bytes
         required: false
@@ -51,11 +53,13 @@ models:
         required: false
         description: A Date field
       Time_Field:
-        type: timestamp_ntz
+        type: object
         required: false
         description: A time field
+        config:
+          bigqueryType: TIME
       Datetime_Field:
-        type: timestamp
+        type: timestamp_ntz
         required: false
         description: A Datetime field
       Numeric_Field:
@@ -64,14 +68,18 @@ models:
         description: A Numeric field with precision 5 and scale 3
         precision: 5
         scale: 3
+        config:
+          bigqueryType: NUMERIC(5, 3)
       Bignumeric_field:
         type: double
         required: false
         description: A bignumeric field with precision 8 and sclae 4
         precision: 8
         scale: 4
+        config:
+          bigqueryType: BIGNUMERIC(8, 4)
       Record_field:
-        type: object
+        type: struct
         required: false
         description: A record field with two subfields
         fields:
@@ -80,7 +88,7 @@ models:
             required: false
             description: subfield 1 of type string
           subfield_2:
-            type: int
+            type: bigint
             required: false
             description: Subfield 2 of type integer
       Range_field:
@@ -88,12 +96,44 @@ models:
         required: false
         description: a datetime range
         items:
-          type: timestamp
+          type: timestamp_ntz
       Geography_Field:
         type: object
         required: false
         description: a geography field
+        config:
+          bigqueryType: GEOGRAPHY
       JSON_Field:
         type: object
         required: false
         description: a json field
+        config:
+          bigqueryType: JSON
+      Array_of_string:
+        type: array
+        required: false
+        description: an array of string
+        items:
+          type: string
+      Array_of_int:
+        type: array
+        required: false
+        description: an array of int
+        items:
+          type: bigint
+      Array_of_struct_col:
+        type: array
+        required: false
+        description: an array of objects that has multiple fields that should carry through
+        items:
+          type: record
+          fields:
+            Id:
+              type: bigint
+              required: false
+              description: an id field
+            Name:
+              type: string
+              required: false
+              description: a name field
+
\ No newline at end of file
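
Review note (not part of the patch): a minimal sketch of how the new REPEATED handling behaves, assuming import_table_fields is called directly from datacontract/imports/bigquery_importer.py as touched above. The expected values mirror the updated import fixtures: an INTEGER inside an array maps to bigint, and a REPEATED RECORD becomes an array whose items are a record carrying the nested fields.

# Sketch only: exercises the importer functions changed in this PR.
from datacontract.imports.bigquery_importer import import_table_fields

bq_fields = [
    {"name": "Array_of_int", "type": "INTEGER", "mode": "REPEATED",
     "description": "an array of int"},
    {"name": "Array_of_struct_col", "type": "RECORD", "mode": "REPEATED",
     "description": "an array of objects",
     "fields": [{"name": "Id", "type": "INTEGER", "mode": "NULLABLE",
                 "description": "an id field"}]},
]

imported = import_table_fields(bq_fields)

# REPEATED scalar -> array; INTEGER maps to bigint when in_array=True
assert imported["Array_of_int"].type == "array"
assert imported["Array_of_int"].items.type == "bigint"

# REPEATED RECORD -> array of record items that keep the nested fields
assert imported["Array_of_struct_col"].type == "array"
assert imported["Array_of_struct_col"].items.type == "record"
assert imported["Array_of_struct_col"].items.fields["Id"].type == "bigint"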