@@ -35,6 +35,7 @@ mod adapter;
3535mod deserialize;
3636
3737pub use adapter:: RowGroupImplBuilder ;
38+ use databend_common_exception:: Result ;
3839pub use deserialize:: column_chunks_to_record_batch;
3940
4041use crate :: io:: read:: block:: block_reader_merge_io:: DataItem ;
@@ -48,17 +49,41 @@ impl BlockReader {
4849 column_chunks : HashMap < ColumnId , DataItem > ,
4950 compression : & Compression ,
5051 block_path : & str ,
51- ) -> databend_common_exception:: Result < DataBlock > {
52+ ) -> Result < DataBlock > {
53+ let mut blocks = self . deserialize_parquet_to_blocks (
54+ num_rows,
55+ column_metas,
56+ column_chunks,
57+ compression,
58+ block_path,
59+ num_rows,
60+ ) ?;
61+ // Defensive check: `num_rows` is passed as `batch_size`, so exactly one block is expected
62+ assert_eq ! ( blocks. len( ) , 1 ) ;
63+ Ok ( blocks. pop ( ) . unwrap ( ) )
64+ }
65+
66+ pub ( crate ) fn deserialize_parquet_to_blocks (
67+ & self ,
68+ num_rows : usize ,
69+ column_metas : & HashMap < ColumnId , ColumnMeta > ,
70+ column_chunks : HashMap < ColumnId , DataItem > ,
71+ compression : & Compression ,
72+ block_path : & str ,
73+ batch_size : usize ,
74+ ) -> Result < Vec < DataBlock > > {
5275 if column_chunks. is_empty ( ) {
53- return self . build_default_values_block ( num_rows) ;
76+ return Ok ( vec ! [ self . build_default_values_block( num_rows) ? ] ) ;
5477 }
55- let record_batch = column_chunks_to_record_batch (
78+
79+ let record_batches = column_chunks_to_record_batch (
5680 & self . original_schema ,
5781 num_rows,
5882 & column_chunks,
5983 compression,
84+ batch_size,
6085 ) ?;
61- let mut columns = Vec :: with_capacity ( self . projected_schema . fields . len ( ) ) ;
86+
6287 let name_paths = column_name_paths ( & self . projection , & self . original_schema ) ;
6388
6489 let array_cache = if self . put_cache {
@@ -67,58 +92,71 @@ impl BlockReader {
6792 None
6893 } ;
6994
70- for ( ( i, field) , column_node) in self
71- . projected_schema
72- . fields
73- . iter ( )
74- . enumerate ( )
75- . zip ( self . project_column_nodes . iter ( ) )
76- {
77- let data_type = field. data_type ( ) . into ( ) ;
78-
79- // NOTE, there is something tricky here:
80- // - `column_chunks` always contains data of leaf columns
81- // - here we may processing a nested type field
82- // - But, even if the field being processed is a field with multiple leaf columns
83- // `column_chunks.get(&field.column_id)` will still return Some(DataItem::_)[^1],
84- // even if we are getting data from `column_chunks` using a non-leaf
85- // `column_id` of `projected_schema.fields`
86- //
87- // [^1]: Except in the current block, there is no data stored for the
88- // corresponding field, and a default value has been declared for
89- // the corresponding field.
90- //
91- // Yes, it is too obscure, we need to polish it later.
92-
93- let value = match column_chunks. get ( & field. column_id ) {
94- Some ( DataItem :: RawData ( data) ) => {
95- // get the deserialized arrow array, which may be a nested array
96- let arrow_array = column_by_name ( & record_batch, & name_paths[ i] ) ;
97- if !column_node. is_nested {
98- if let Some ( cache) = & array_cache {
99- let meta = column_metas. get ( & field. column_id ) . unwrap ( ) ;
100- let ( offset, len) = meta. offset_length ( ) ;
101- let key =
102- TableDataCacheKey :: new ( block_path, field. column_id , offset, len) ;
103- cache. insert ( key. into ( ) , ( arrow_array. clone ( ) , data. len ( ) ) ) ;
95+ let mut blocks = Vec :: with_capacity ( record_batches. len ( ) ) ;
96+
97+ for record_batch in record_batches {
98+ let num_rows_record_batch = record_batch. num_rows ( ) ;
99+ let mut columns = Vec :: with_capacity ( self . projected_schema . fields . len ( ) ) ;
100+ for ( ( i, field) , column_node) in self
101+ . projected_schema
102+ . fields
103+ . iter ( )
104+ . enumerate ( )
105+ . zip ( self . project_column_nodes . iter ( ) )
106+ {
107+ let data_type = field. data_type ( ) . into ( ) ;
108+
109+ // NOTE, there is something tricky here:
110+ // - `column_chunks` always contains data of leaf columns
111+ // - here we may be processing a nested type field
112+ // - But, even if the field being processed is a field with multiple leaf columns
113+ // `column_chunks.get(&field.column_id)` will still return Some(DataItem::_)[^1],
114+ // even if we are getting data from `column_chunks` using a non-leaf
115+ // `column_id` of `projected_schema.fields`
116+ //
117+ // [^1]: Except when the current block stores no data for the
118+ // corresponding field and a default value has been declared for
119+ // that field.
120+ //
121+ // Yes, it is too obscure, we need to polish it later.
122+
123+ let value = match column_chunks. get ( & field. column_id ) {
124+ Some ( DataItem :: RawData ( data) ) => {
125+ // get the deserialized arrow array, which may be a nested array
126+ let arrow_array = column_by_name ( & record_batch, & name_paths[ i] ) ;
127+ if !column_node. is_nested {
128+ if let Some ( cache) = & array_cache {
129+ let meta = column_metas. get ( & field. column_id ) . unwrap ( ) ;
130+ let ( offset, len) = meta. offset_length ( ) ;
131+ let key = TableDataCacheKey :: new (
132+ block_path,
133+ field. column_id ,
134+ offset,
135+ len,
136+ ) ;
137+ cache. insert ( key. into ( ) , ( arrow_array. clone ( ) , data. len ( ) ) ) ;
138+ }
104139 }
140+ Value :: from_arrow_rs ( arrow_array, & data_type) ?
105141 }
106- Value :: from_arrow_rs ( arrow_array, & data_type) ?
107- }
108- Some ( DataItem :: ColumnArray ( cached) ) => {
109- if column_node. is_nested {
110- // a defensive check, should never happen
111- return Err ( ErrorCode :: StorageOther (
112- "unexpected nested field: nested leaf field hits cached" ,
113- ) ) ;
142+ Some ( DataItem :: ColumnArray ( cached) ) => {
143+ // TODO this is NOT correct: the same cached array is reused as-is for every record batch
144+ if column_node. is_nested {
145+ // a defensive check, should never happen
146+ return Err ( ErrorCode :: StorageOther (
147+ "unexpected nested field: nested leaf field hits cached" ,
148+ ) ) ;
149+ }
150+ Value :: from_arrow_rs ( cached. 0 . clone ( ) , & data_type) ?
114151 }
115- Value :: from_arrow_rs ( cached . 0 . clone ( ) , & data_type ) ?
116- }
117- None => Value :: Scalar ( self . default_vals [ i ] . clone ( ) ) ,
118- } ;
119- columns . push ( BlockEntry :: new ( data_type , value ) ) ;
152+ None => Value :: Scalar ( self . default_vals [ i ] . clone ( ) ) ,
153+ } ;
154+ columns . push ( BlockEntry :: new ( data_type , value ) ) ;
155+ }
156+ blocks . push ( DataBlock :: new ( columns , num_rows_record_batch ) ) ;
120157 }
121- Ok ( DataBlock :: new ( columns, num_rows) )
158+
159+ Ok ( blocks)
122160 }
123161}
124162
0 commit comments