feat(ingestion): add column level description for parquet files #12988
base: master
Changes from all commits: 3f4cb2a, 945d3db, e036764, 94b8ff2, 7db4684, 58e78fa
@@ -1,5 +1,6 @@
 from typing import IO, Any, Callable, Dict, List, Type

+import pandas
 import pyarrow
 import pyarrow.parquet
@@ -64,6 +65,39 @@
 }


+def get_column_metadata(schema_dict: dict, column_name: str) -> str:
+    """
+    Get metadata for a specific column from the schema dictionary.
+
+    Args:
+        schema_dict (dict): The schema dictionary containing column definitions
+        column_name (str): The name of the column to get metadata for
+
+    Returns:
+        dict: The metadata for the specified column, or None if column not found
+    """
+    # Iterate through all columns in the schema
+    for _, column_info in schema_dict.items():
+        if column_info.get("name") == column_name:
+            return column_info.get("metadata", {})
+
+    # Return None if column not found
+    return None
+
+
+def parse_metadata(schema_metadata: bytes) -> Dict:
+    """
+    Parse parquet schema metadata into a dictionary of fields.
+
+    Args:
+        schema_metadata (bytes): Raw schema metadata from parquet file
+
+    Returns:
+        Dict: Parsed metadata fields dictionary
+    """
+    return pandas.read_json(schema_metadata.decode("utf-8")).to_dict()["fields"]
+
+
 def map_pyarrow_type(pyarrow_type: Type) -> Type:
     for checker, mapped_type in pyarrow_type_map.items():
         if checker(pyarrow_type):
@@ -81,14 +115,23 @@ def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:

         fields: List[SchemaField] = []

+        meta_data_fields = parse_metadata(
+            schema.metadata[b"org.apache.spark.sql.parquet.row.metadata"]
+        )
Review comment on lines +118 to +120:

Is there a guarantee that this metadata field will always exist? We should consider treating it as optional.
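A minimal sketch of that guard, assuming an empty dict is an acceptable fallback when the key is absent (raw_metadata is a hypothetical local name):

    # schema.metadata can be None for files written without metadata,
    # and non-Spark writers will not set the Spark row-metadata key:
    raw_metadata = (schema.metadata or {}).get(
        b"org.apache.spark.sql.parquet.row.metadata"
    )
    meta_data_fields = parse_metadata(raw_metadata) if raw_metadata else {}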
+
         for name, pyarrow_type in zip(schema.names, schema.types):
             mapped_type = map_pyarrow_type(pyarrow_type)

+            description = get_column_metadata(meta_data_fields, name)
Review comment:

Instead of traversing …
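The rest of this comment is cut off. Assuming it suggests avoiding a fresh scan of meta_data_fields for every column, a minimal sketch builds a name-to-metadata lookup once (by_name is a hypothetical name):

    # Built once, before the per-column loop:
    by_name = {
        info.get("name"): info.get("metadata", {})
        for info in meta_data_fields.values()
    }

    # Inside the loop, each lookup is then O(1):
    description = by_name.get(name)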
+
+            description = description.get(name, None)
+
             field = SchemaField(
                 fieldPath=name,
                 type=SchemaFieldDataType(mapped_type()),
                 nativeDataType=str(pyarrow_type),
                 recursive=False,
+                description=description,
             )

             fields.append(field)
Review comment:

This is the same, right? Unless necessary, I would avoid depending on pandas for this. For resilience, we should also account for the possibility that the "fields" field might be missing.
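A sketch of a pandas-free equivalent, offered as a suggested rewrite rather than the PR's code: json is from the standard library, parsed.get(...) tolerates a missing "fields" key, and dict(enumerate(...)) preserves the integer-keyed shape the pandas version produces:

    import json

    def parse_metadata(schema_metadata: bytes) -> Dict:
        # Default to an empty mapping when "fields" is absent,
        # instead of raising a KeyError.
        parsed = json.loads(schema_metadata.decode("utf-8"))
        return dict(enumerate(parsed.get("fields", [])))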