Skip to content

Commit 37ffaa8

Browse files
author
LiberationMindset
committed
Resolve issue #737: fix type conversion for the BigQuery target when running `datacontract import` followed by `datacontract test` after adding server information
1 parent 22611af commit 37ffaa8

File tree

6 files changed

+82
-21
lines changed

6 files changed

+82
-21
lines changed

datacontract/export/bigquery_converter.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,12 @@ def map_type_to_bigquery(field: Field) -> str:
9898
return "FLOAT64"
9999
elif field_type.lower() == "boolean":
100100
return "BOOL"
101-
elif field_type.lower() in ["timestamp", "timestamp_tz"]:
101+
elif field_type.lower() in "timestamp":
102102
return "TIMESTAMP"
103103
elif field_type.lower() == "date":
104104
return "DATE"
105+
elif field_type.lower() == "timestamp_tz":
106+
return "TIME"
105107
elif field_type.lower() == "timestamp_ntz":
106108
return "DATETIME"
107109
elif field_type.lower() in ["number", "decimal", "numeric"]:

datacontract/imports/bigquery_importer.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def convert_bigquery_schema(
127127
return data_contract_specification
128128

129129

130-
def import_table_fields(table_fields):
130+
def import_table_fields(table_fields, in_array=False):
131131
imported_fields = {}
132132
for field in table_fields:
133133
field_name = field.get("name")
@@ -136,20 +136,38 @@ def import_table_fields(table_fields):
136136
imported_fields[field_name].description = field.get("description")
137137

138138
if field.get("type") == "RECORD":
139-
imported_fields[field_name].type = "object"
140-
imported_fields[field_name].fields = import_table_fields(field.get("fields"))
139+
if field.get("mode") == "REPEATED":
140+
imported_fields[field_name].type = "array"
141+
imported_fields[field_name].items = Field(
142+
type="record", fields= import_table_fields(field.get("fields"), in_array=True))
143+
else:
144+
imported_fields[field_name].type = "object"
145+
imported_fields[field_name].fields = import_table_fields(field.get("fields"), in_array=True)
146+
141147
elif field.get("type") == "STRUCT":
142148
imported_fields[field_name].type = "struct"
143-
imported_fields[field_name].fields = import_table_fields(field.get("fields"))
149+
imported_fields[field_name].fields = import_table_fields(field.get("fields"), in_array=in_array)
144150
elif field.get("type") == "RANGE":
145151
# A RANGE is a range of date/datetime/timestamp, i.e. it holds multiple values,
146152
# so we map it to an array
147153
imported_fields[field_name].type = "array"
148154
imported_fields[field_name].items = Field(
149-
type=map_type_from_bigquery(field["rangeElementType"].get("type"))
155+
type=map_type_from_bigquery(field["rangeElementType"].get("type"), in_array=True)
150156
)
151-
else: # primitive type
152-
imported_fields[field_name].type = map_type_from_bigquery(field.get("type"))
157+
elif field.get("type") == "GEOGRAPHY":
158+
imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
159+
imported_fields[field_name].config = {"bigqueryType": "GEOGRAPHY"}
160+
elif field.get("type") == "JSON":
161+
imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
162+
imported_fields[field_name].config = {"bigqueryType": "JSON"}
163+
164+
else:
165+
if field.get("type") == "REPEATED": # not a RECORD type, i.e. an array of primitives such as ARRAY<STRING> or ARRAY<INTEGER>
166+
imported_fields[field_name].type = "array"
167+
imported_fields[field_name].items = Field(
168+
type= map_type_from_bigquery(field.get("type"), in_array=True))
169+
else: # primitive type
170+
imported_fields[field_name].type = map_type_from_bigquery(field.get("type"), in_array=in_array)
153171

154172
if field.get("type") == "STRING":
155173
# in bigquery both string and bytes have maxLength but in the datacontracts
@@ -167,13 +185,13 @@ def import_table_fields(table_fields):
167185
return imported_fields
168186

169187

170-
def map_type_from_bigquery(bigquery_type_str: str):
188+
def map_type_from_bigquery(bigquery_type_str: str, in_array=False):
171189
if bigquery_type_str == "STRING":
172190
return "string"
173191
elif bigquery_type_str == "BYTES":
174192
return "bytes"
175193
elif bigquery_type_str == "INTEGER":
176-
return "int"
194+
return "bigint" if in_array else "int"
177195
elif bigquery_type_str == "INT64":
178196
return "bigint"
179197
elif bigquery_type_str == "FLOAT":
@@ -187,9 +205,9 @@ def map_type_from_bigquery(bigquery_type_str: str):
187205
elif bigquery_type_str == "DATE":
188206
return "date"
189207
elif bigquery_type_str == "TIME":
190-
return "timestamp_ntz"
208+
return "timestamp_tz"
191209
elif bigquery_type_str == "DATETIME":
192-
return "timestamp"
210+
return "timestamp_ntz"
193211
elif bigquery_type_str == "NUMERIC":
194212
return "numeric"
195213
elif bigquery_type_str == "BIGNUMERIC":

tests/fixtures/bigquery/export/bq_table_schema.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,9 @@
100100
},
101101
{
102102
"name": "timestamp_tz_field",
103-
"type": "TIMESTAMP",
103+
"type": "TIME",
104104
"mode": "NULLABLE",
105-
"description": "a simple timestamp_tz field"
105+
"description": "a simple time field"
106106
},
107107
{
108108
"name": "timestamp_ntz_field",
@@ -225,7 +225,7 @@
225225
},
226226
{
227227
"name": "int_array_field",
228-
"type": "INTEGER",
228+
"type": "INT64",
229229
"mode": "REPEATED",
230230
"description": "an int array"
231231
},

tests/fixtures/bigquery/export/datacontract.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ models:
7878
timestamp_tz_field:
7979
type: timestamp_tz
8080
required: false
81-
description: a simple timestamp_tz field
81+
description: a simple time field
8282
timestamp_ntz_field:
8383
type: timestamp_ntz
8484
required: false
@@ -165,7 +165,7 @@ models:
165165
required: false
166166
description: an int array
167167
items:
168-
type: int
168+
type: bigint
169169
complex_array_field:
170170
type: array
171171
required: false

tests/fixtures/bigquery/import/complete_table_schema.json

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,27 @@
152152
"mode": "NULLABLE",
153153
"name": "JSON_Field",
154154
"type": "JSON"
155+
},
156+
{
157+
"description": "an array of objects that has multiple fields that should carry through",
158+
"fields": [
159+
{
160+
"name": "Id",
161+
"type": "INTEGER",
162+
"mode": "NULLABLE",
163+
"description": "an id field"
164+
},
165+
{
166+
"name": "Name",
167+
"type": "STRING",
168+
"mode": "NULLABLE",
169+
"description": "a name field"
170+
171+
}
172+
],
173+
"mode": "REPEATED",
174+
"name": "Array_of_struct_col",
175+
"type": "RECORD"
155176
}
156177
]
157178
},

tests/fixtures/bigquery/import/datacontract.yaml

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,11 @@ models:
5151
required: false
5252
description: A Date field
5353
Time_Field:
54-
type: timestamp_ntz
54+
type: timestamp_tz
5555
required: false
5656
description: A time field
5757
Datetime_Field:
58-
type: timestamp
58+
type: timestamp_ntz
5959
required: false
6060
description: A Datetime field
6161
Numeric_Field:
@@ -80,20 +80,40 @@ models:
8080
required: false
8181
description: subfield 1 of type string
8282
subfield_2:
83-
type: int
83+
type: bigint
8484
required: false
8585
description: Subfield 2 of type integer
8686
Range_field:
8787
type: array
8888
required: false
8989
description: a datetime range
9090
items:
91-
type: timestamp
91+
type: timestamp_ntz
9292
Geography_Field:
9393
type: object
9494
required: false
9595
description: a geography field
96+
config:
97+
bigqueryType: GEOGRAPHY
9698
JSON_Field:
9799
type: object
98100
required: false
99101
description: a json field
102+
config:
103+
bigqueryType: JSON
104+
Array_of_struct_col:
105+
type: array
106+
required: false
107+
description: an array of objects that has multiple fields that should carry through
108+
items:
109+
type: record
110+
fields:
111+
Id:
112+
type: bigint
113+
required: false
114+
description: an id field
115+
Name:
116+
type: string
117+
required: false
118+
description: a name field
119+

0 commit comments

Comments
 (0)