Skip to content

Commit 57c75b3

Browse files
committed
Gross hack during type conversion
1 parent f76b6ce commit 57c75b3

File tree

3 files changed

+59
-15
lines changed

3 files changed

+59
-15
lines changed

parquet/src/arrow/schema/complex.rs

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use crate::errors::ParquetError;
2525
use crate::errors::Result;
2626
use crate::schema::types::{SchemaDescriptor, Type, TypePtr};
2727
use arrow_schema::{DataType, Field, Fields, SchemaBuilder};
28-
use crate::arrow::schema::extension::add_extension_type;
28+
use crate::arrow::schema::extension::{add_extension_type, binary_to_binary_view};
2929

3030
fn get_repetition(t: &Type) -> Repetition {
3131
let info = t.get_basic_info();
@@ -173,7 +173,7 @@ impl Visitor {
173173

174174
let parquet_fields = struct_type.get_fields();
175175

176-
// Extract the arrow fields
176+
// Extract any arrow fields from the hints
177177
let arrow_fields = match &context.data_type {
178178
Some(DataType::Struct(fields)) => {
179179
if fields.len() != parquet_fields.len() {
@@ -221,10 +221,10 @@ impl Visitor {
221221
data_type,
222222
};
223223

224-
if let Some(child) = self.dispatch(parquet_field, child_ctx)? {
224+
if let Some(mut child) = self.dispatch(parquet_field, child_ctx)? {
225225
// The child type returned may be different from what is encoded in the arrow
226226
// schema in the event of a mismatch or a projection
227-
child_fields.push(convert_field(parquet_field, &child, arrow_field));
227+
child_fields.push(convert_field(parquet_field, &mut child, arrow_field));
228228
children.push(child);
229229
}
230230
}
@@ -353,13 +353,13 @@ impl Visitor {
353353

354354
// Need both columns to be projected
355355
match (maybe_key, maybe_value) {
356-
(Some(key), Some(value)) => {
356+
(Some(mut key), Some(mut value)) => {
357357
let key_field = Arc::new(
358-
convert_field(map_key, &key, arrow_key)
358+
convert_field(map_key, &mut key, arrow_key)
359359
// The key is always non-nullable (#5630)
360360
.with_nullable(false),
361361
);
362-
let value_field = Arc::new(convert_field(map_value, &value, arrow_value));
362+
let value_field = Arc::new(convert_field(map_value, &mut value, arrow_value));
363363
let field_metadata = match arrow_map {
364364
Some(field) => field.metadata().clone(),
365365
_ => HashMap::default(),
@@ -496,8 +496,8 @@ impl Visitor {
496496
};
497497

498498
match self.dispatch(item_type, new_context) {
499-
Ok(Some(item)) => {
500-
let item_field = Arc::new(convert_field(item_type, &item, arrow_field));
499+
Ok(Some(mut item)) => {
500+
let item_field = Arc::new(convert_field(item_type, &mut item, arrow_field));
501501

502502
// Use arrow type as hint for index size
503503
let arrow_type = match context.data_type {
@@ -545,7 +545,7 @@ impl Visitor {
545545
///
546546
/// The resulting Arrow [`Field`] will have the type dictated by the Parquet `field`, a name
547547
/// dictated by the `parquet_type`, and any metadata from `arrow_hint`
548-
fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<&Field>) -> Field {
548+
fn convert_field(parquet_type: &Type, field: &mut ParquetField, arrow_hint: Option<&Field>) -> Field {
549549
let name = parquet_type.name();
550550
let data_type = field.arrow_type.clone();
551551
let nullable = field.nullable;
@@ -576,7 +576,9 @@ fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<&
576576
);
577577
ret.set_metadata(meta);
578578
}
579-
add_extension_type(ret, &parquet_type)
579+
let ret = add_extension_type(ret, &parquet_type);
580+
field.arrow_type = binary_to_binary_view(&field.arrow_type);
581+
ret
580582
}
581583
}
582584
}

parquet/src/arrow/schema/extension.rs

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717

1818
//! Arrow Extension Type Support
1919
20-
use arrow_schema::Field;
20+
use std::sync::Arc;
21+
use arrow_schema::{DataType, Field, Fields};
2122
use crate::basic::LogicalType;
2223
use crate::schema::types::Type;
2324

@@ -28,11 +29,50 @@ use crate::schema::types::Type;
2829
/// Arrow DataType, and instead are represented by an Arrow ExtensionType.
2930
/// Extension types are attached to Arrow Fields via metadata.
3031
pub(crate) fn add_extension_type(arrow_field: Field, parquet_type: &Type) -> Field {
31-
match parquet_type.get_basic_info().logical_type() {
32+
println!("Adding extension to field: {:#?}", arrow_field);
33+
let result = match parquet_type.get_basic_info().logical_type() {
3234
#[cfg(feature = "variant_experimental")]
3335
Some(LogicalType::Variant) => {
34-
arrow_field.with_extension_type(crate::variant::VariantType)
36+
let new_data_type = binary_to_binary_view(arrow_field.data_type());
37+
println!("New variant data type: {:?}", new_data_type);
38+
arrow_field
39+
.with_data_type(new_data_type)
40+
.with_extension_type(crate::variant::VariantType)
3541
}
3642
_ => arrow_field
43+
};
44+
println!("new field: {:#?}", result);
45+
result
46+
}
47+
48+
/// replaces all instances of Binary with BinaryView in a DataType
49+
///
50+
/// The `VariantArray` implementation uses BinaryView rather than Binary to
51+
/// avoid unnecessary copies. This function recursively traverses the DataType
52+
/// and replaces Binary with BinaryView, including within nested List and Struct
53+
/// types.
54+
#[cfg(feature = "variant_experimental")]
55+
pub(crate) fn binary_to_binary_view(data_type: &DataType) -> DataType {
56+
// TODO avoid the clones below
57+
match data_type {
58+
DataType::Binary => DataType::BinaryView,
59+
DataType::List(field) => {
60+
let new_field = field
61+
.as_ref()
62+
.clone()
63+
.with_data_type(binary_to_binary_view(field.data_type()));
64+
DataType::List(Arc::new(new_field))
65+
}
66+
DataType::Struct(fields) => {
67+
let new_fields: Fields = fields
68+
.iter()
69+
.map(|f| {
70+
let new_field = f.as_ref().clone().with_data_type(binary_to_binary_view(f.data_type()));
71+
Arc::new(new_field)
72+
})
73+
.collect();
74+
DataType::Struct(new_fields)
75+
}
76+
_ => data_type.clone(),
3777
}
3878
}

parquet/src/variant.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ mod tests {
123123
use crate::arrow::ArrowWriter;
124124
use crate::file::reader::ChunkReader;
125125
use arrow::util::test_util::parquet_test_data;
126+
use arrow_array::RecordBatchReader;
126127
use arrow_array::{Array, ArrayRef, RecordBatch};
127128
use arrow_schema::{Field, Fields, Schema};
128129
use bytes::Bytes;
@@ -165,7 +166,7 @@ mod tests {
165166
assert_eq!(var_array.len(), 1);
166167
assert!(var_array.is_valid(0));
167168
let var_value = var_array.value(0);
168-
assert_eq!(var_value, Variant::String("iceberg - UPDATE"));
169+
assert_eq!(var_value, Variant::from("iceberg"));
169170
}
170171

171172
/// Writes a variant to a parquet file and ensures the parquet logical type
@@ -210,6 +211,7 @@ mod tests {
210211
.unwrap()
211212
.build()
212213
.unwrap();
214+
println!("Reader schema: {:?}", reader.schema());
213215
let mut batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();
214216
assert_eq!(batches.len(), 1);
215217
batches.swap_remove(0)

0 commit comments

Comments (0)