From 305d43e28f7994370f95c8c29a889d15613c7f28 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 9 Dec 2025 15:30:13 -0800 Subject: [PATCH 1/8] add option to skip column statistics --- parquet/benches/metadata.rs | 26 +++++++++++++++++++++++++ parquet/src/arrow/arrow_reader/mod.rs | 9 +++++++++ parquet/src/file/metadata/options.rs | 26 +++++++++++++++++++++++++ parquet/src/file/metadata/thrift/mod.rs | 4 +++- parquet/src/file/serialized_reader.rs | 9 +++++++++ 5 files changed, 73 insertions(+), 1 deletion(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index c962a4c3fdf8..aee7c18e9967 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -190,6 +190,15 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); + let options = + ParquetMetaDataOptions::new().with_column_stats_policy(ParquetStatisticsPolicy::SkipAll); + c.bench_function("decode metadata with skip column stats", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) + .unwrap(); + }) + }); + let buf: Bytes = black_box(encoded_meta()).into(); c.bench_function("decode parquet metadata (wide)", |b| { b.iter(|| { @@ -219,6 +228,23 @@ fn criterion_benchmark(c: &mut Criterion) { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); }) }); + + let options = + ParquetMetaDataOptions::new().with_column_stats_policy(ParquetStatisticsPolicy::SkipAll); + c.bench_function("decode metadata (wide) with skip column stats", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); + }) + }); + + let options = ParquetMetaDataOptions::new() + .with_column_stats_policy(ParquetStatisticsPolicy::SkipAll) + .with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll); + c.bench_function("decode metadata (wide) with skip all stats", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a626076ebdd7..7ba747152645 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -581,6 +581,15 @@ impl ArrowReaderOptions { self } + /// Sets the decoding policy for [`statistics`] in the Parquet `ColumnMetaData`. + /// + /// [`statistics`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L912 + pub fn with_column_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self { + self.metadata_options.set_column_stats_policy(policy); + self + } + /// Provide the file decryption properties to use when reading encrypted parquet files. /// /// If encryption is enabled and the file is encrypted, the `file_decryption_properties` must be provided. diff --git a/parquet/src/file/metadata/options.rs b/parquet/src/file/metadata/options.rs index c1ee22ff8de9..2bdf5328ddcc 100644 --- a/parquet/src/file/metadata/options.rs +++ b/parquet/src/file/metadata/options.rs @@ -92,6 +92,7 @@ pub struct ParquetMetaDataOptions { schema_descr: Option, encoding_stats_as_mask: bool, encoding_stats_policy: ParquetStatisticsPolicy, + column_stats_policy: ParquetStatisticsPolicy, } impl ParquetMetaDataOptions { @@ -180,6 +181,31 @@ impl ParquetMetaDataOptions { self.set_encoding_stats_policy(policy); self } + + /// Returns whether to skip decoding the [`statistics`] in the Parquet `ColumnMetaData` + /// for the column indexed by `col_index`. + /// + /// [`statistics`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L912 + pub fn skip_column_stats(&self, col_index: usize) -> bool { + self.column_stats_policy.is_skip(col_index) + } + + /// Sets the decoding policy for [`statistics`] in the Parquet `ColumnMetaData`. + /// + /// The default policy is to decode all `statistics`. + /// + /// [`statistics`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L912 + pub fn set_column_stats_policy(&mut self, policy: ParquetStatisticsPolicy) { + self.column_stats_policy = policy; + } + + /// Call [`Self::set_column_stats_policy`] and return `Self` for chaining. + pub fn with_column_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self { + self.set_column_stats_policy(policy); + self + } } #[cfg(test)] diff --git a/parquet/src/file/metadata/thrift/mod.rs b/parquet/src/file/metadata/thrift/mod.rs index 95ad67da6d95..9102b4a04c24 100644 --- a/parquet/src/file/metadata/thrift/mod.rs +++ b/parquet/src/file/metadata/thrift/mod.rs @@ -411,10 +411,12 @@ fn read_column_metadata<'a>( let mut skip_pes = false; let mut pes_mask = false; + let mut skip_col_stats = false; if let Some(opts) = options { skip_pes = opts.skip_encoding_stats(col_index); pes_mask = opts.encoding_stats_as_mask(); + skip_col_stats = opts.skip_column_stats(col_index); } // struct ColumnMetaData { @@ -483,7 +485,7 @@ fn read_column_metadata<'a>( 11 => { column.dictionary_page_offset = Some(i64::read_thrift(&mut *prot)?); } - 12 => { + 12 if !skip_col_stats => { column.statistics = convert_stats(column_descr, Some(Statistics::read_thrift(&mut *prot)?))?; } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 8ef7b972d7e1..c0c4e0f14a91 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -182,6 +182,15 @@ impl ReadOptionsBuilder { self } + /// Sets the decoding policy for [`statistics`] in the Parquet `ColumnMetaData`. + /// + /// [`statistics`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L912 + pub fn with_column_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self { + self.metadata_options.set_column_stats_policy(policy); + self + } + /// Seal the builder and return the read options pub fn build(self) -> ReadOptions { let props = self From 47ea8294372f366527a317dc7e56ba5fee2aa8f5 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 16 Dec 2025 09:40:24 -0800 Subject: [PATCH 2/8] add size statistics to the mix --- parquet/benches/metadata.rs | 46 +++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index aee7c18e9967..d671d7ca8e7c 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -20,8 +20,9 @@ use std::sync::Arc; use parquet::basic::{Encoding, PageType, Type as PhysicalType}; use parquet::file::metadata::{ - ColumnChunkMetaData, FileMetaData, PageEncodingStats, ParquetMetaData, ParquetMetaDataOptions, - ParquetMetaDataReader, ParquetMetaDataWriter, ParquetStatisticsPolicy, RowGroupMetaData, + ColumnChunkMetaData, FileMetaData, LevelHistogram, PageEncodingStats, ParquetMetaData, + ParquetMetaDataOptions, ParquetMetaDataReader, ParquetMetaDataWriter, ParquetStatisticsPolicy, + RowGroupMetaData, }; use parquet::file::statistics::Statistics; use parquet::file::writer::TrackedWrite; @@ -40,7 +41,7 @@ use parquet::file::serialized_reader::ReadOptionsBuilder; const NUM_COLUMNS: usize = 10_000; const NUM_ROW_GROUPS: usize = 10; -fn encoded_meta() -> Vec { +fn encoded_meta(is_nullable: bool, has_lists: bool) -> Vec { let mut rng = seedable_rng(); let mut column_desc_ptrs: Vec = Vec::with_capacity(NUM_COLUMNS); @@ -66,6 +67,23 @@ fn encoded_meta() -> Vec { let stats = Statistics::float(Some(rng.random()), Some(rng.random()), None, Some(0), false); + let (var_size, rep_hist, def_hist) = match (is_nullable, has_lists) { + (true, true) => { + let rep_hist = LevelHistogram::from(vec![1500i64; 2]); + let def_hist = LevelHistogram::from(vec![1000i64; 3]); + ( + Some(rng.random_range(0..1000000000)), + Some(rep_hist), + Some(def_hist), + ) + } + (true, false) => { + let def_hist = LevelHistogram::from(vec![1500i64; 2]); + (Some(rng.random_range(0..1000000000)), None, Some(def_hist)) + } + (_, _) => (None, None, None), + }; + let row_groups = (0..NUM_ROW_GROUPS) .map(|i| { let columns = (0..NUM_COLUMNS) @@ -94,6 +112,9 @@ fn encoded_meta() -> Vec { .set_offset_index_length(Some(rng.random_range(1..100000))) .set_column_index_offset(Some(rng.random_range(0..2000000000))) .set_column_index_length(Some(rng.random_range(1..100000))) + .set_unencoded_byte_array_data_bytes(var_size) + .set_repetition_level_histogram(rep_hist.clone()) + .set_definition_level_histogram(def_hist.clone()) .build() .unwrap() }) @@ -199,7 +220,7 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - let buf: Bytes = black_box(encoded_meta()).into(); + let buf: Bytes = black_box(encoded_meta(false, false)).into(); c.bench_function("decode parquet metadata (wide)", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata(&buf).unwrap(); @@ -245,6 +266,23 @@ fn criterion_benchmark(c: &mut Criterion) { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); }) }); + + let buf: Bytes = black_box(encoded_meta(true, true)).into(); + c.bench_function("decode parquet metadata w/ size stats (wide)", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata(&buf).unwrap(); + }) + }); + + let buf: Bytes = black_box(encoded_meta(true, false)).into(); + c.bench_function( + "decode parquet metadata w/ size stats no lists (wide)", + |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata(&buf).unwrap(); + }) + }, + ); } criterion_group!(benches, criterion_benchmark); From 32a82245986d5d938f258acc487b214b7c382b36 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 16 Dec 2025 10:18:33 -0800 Subject: [PATCH 3/8] skip size stats --- parquet/benches/metadata.rs | 33 ++++++++++++------------- parquet/src/arrow/arrow_reader/mod.rs | 9 +++++++ parquet/src/file/metadata/options.rs | 26 +++++++++++++++++++ parquet/src/file/metadata/thrift/mod.rs | 4 ++- parquet/src/file/serialized_reader.rs | 9 +++++++ 5 files changed, 63 insertions(+), 18 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index d671d7ca8e7c..661372fb30d6 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -258,31 +258,30 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - let options = ParquetMetaDataOptions::new() - .with_column_stats_policy(ParquetStatisticsPolicy::SkipAll) - .with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll); - c.bench_function("decode metadata (wide) with skip all stats", |b| { + let buf: Bytes = black_box(encoded_meta(true, true)).into(); + c.bench_function("decode parquet metadata w/ size stats (wide)", |b| { b.iter(|| { - ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); + ParquetMetaDataReader::decode_metadata(&buf).unwrap(); }) }); - let buf: Bytes = black_box(encoded_meta(true, true)).into(); - c.bench_function("decode parquet metadata w/ size stats (wide)", |b| { + let options = + ParquetMetaDataOptions::new().with_size_stats_policy(ParquetStatisticsPolicy::SkipAll); + c.bench_function("decode metadata (wide) with skip size stats", |b| { b.iter(|| { - ParquetMetaDataReader::decode_metadata(&buf).unwrap(); + ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); }) }); - let buf: Bytes = black_box(encoded_meta(true, false)).into(); - c.bench_function( - "decode parquet metadata w/ size stats no lists (wide)", - |b| { - b.iter(|| { - ParquetMetaDataReader::decode_metadata(&buf).unwrap(); - }) - }, - ); + let options = ParquetMetaDataOptions::new() + .with_column_stats_policy(ParquetStatisticsPolicy::SkipAll) + .with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll) + .with_size_stats_policy(ParquetStatisticsPolicy::SkipAll); + c.bench_function("decode metadata (wide) with skip all stats", |b| { + b.iter(|| { + ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 7ba747152645..035220b23ea9 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -590,6 +590,15 @@ impl ArrowReaderOptions { self } + /// Sets the decoding policy for [`size_statistics`] in the Parquet `ColumnMetaData`. + /// + /// [`size_statistics`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L936 + pub fn with_size_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self { + self.metadata_options.set_size_stats_policy(policy); + self + } + /// Provide the file decryption properties to use when reading encrypted parquet files. /// /// If encryption is enabled and the file is encrypted, the `file_decryption_properties` must be provided. diff --git a/parquet/src/file/metadata/options.rs b/parquet/src/file/metadata/options.rs index 2bdf5328ddcc..c5342a83629a 100644 --- a/parquet/src/file/metadata/options.rs +++ b/parquet/src/file/metadata/options.rs @@ -93,6 +93,7 @@ pub struct ParquetMetaDataOptions { encoding_stats_as_mask: bool, encoding_stats_policy: ParquetStatisticsPolicy, column_stats_policy: ParquetStatisticsPolicy, + size_stats_policy: ParquetStatisticsPolicy, } impl ParquetMetaDataOptions { @@ -206,6 +207,31 @@ impl ParquetMetaDataOptions { self.set_column_stats_policy(policy); self } + + /// Returns whether to skip decoding the [`size_statistics`] in the Parquet `ColumnMetaData` + /// for the column indexed by `col_index`. + /// + /// [`size_statistics`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L936 + pub fn skip_size_stats(&self, col_index: usize) -> bool { + self.size_stats_policy.is_skip(col_index) + } + + /// Sets the decoding policy for [`size_statistics`] in the Parquet `ColumnMetaData`. + /// + /// The default policy is to decode all `size_statistics`. + /// + /// [`statistics`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L936 + pub fn set_size_stats_policy(&mut self, policy: ParquetStatisticsPolicy) { + self.size_stats_policy = policy; + } + + /// Call [`Self::set_size_stats_policy`] and return `Self` for chaining. + pub fn with_size_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self { + self.set_size_stats_policy(policy); + self + } } #[cfg(test)] diff --git a/parquet/src/file/metadata/thrift/mod.rs b/parquet/src/file/metadata/thrift/mod.rs index 9102b4a04c24..80ec058c6339 100644 --- a/parquet/src/file/metadata/thrift/mod.rs +++ b/parquet/src/file/metadata/thrift/mod.rs @@ -412,11 +412,13 @@ fn read_column_metadata<'a>( let mut skip_pes = false; let mut pes_mask = false; let mut skip_col_stats = false; + let mut skip_size_stats = false; if let Some(opts) = options { skip_pes = opts.skip_encoding_stats(col_index); pes_mask = opts.encoding_stats_as_mask(); skip_col_stats = opts.skip_column_stats(col_index); + skip_size_stats = opts.skip_size_stats(col_index); } // struct ColumnMetaData { @@ -505,7 +507,7 @@ fn read_column_metadata<'a>( 15 => { column.bloom_filter_length = Some(i32::read_thrift(&mut *prot)?); } - 16 => { + 16 if !skip_size_stats => { let val = SizeStatistics::read_thrift(&mut *prot)?; column.unencoded_byte_array_data_bytes = val.unencoded_byte_array_data_bytes; column.repetition_level_histogram = diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index c0c4e0f14a91..2b1c3f2e95aa 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -191,6 +191,15 @@ impl ReadOptionsBuilder { self } + /// Sets the decoding policy for [`size_statistics`] in the Parquet `ColumnMetaData`. + /// + /// [`size_statistics`]: + /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L936 + pub fn with_size_stats_policy(mut self, policy: ParquetStatisticsPolicy) -> Self { + self.metadata_options.set_size_stats_policy(policy); + self + } + /// Seal the builder and return the read options pub fn build(self) -> ReadOptions { let props = self From 8046fc261b34d2e221a068f703743c072d3922a7 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 16 Dec 2025 10:42:20 -0800 Subject: [PATCH 4/8] add column stats tests --- parquet/src/arrow/arrow_reader/mod.rs | 10 +++++++--- parquet/src/file/serialized_reader.rs | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 035220b23ea9..1dc4e9f30672 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1548,8 +1548,9 @@ pub(crate) mod tests { let file = File::open(path).unwrap(); // test skipping all - let arrow_options = - ArrowReaderOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll); + let arrow_options = ArrowReaderOptions::new() + .with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll) + .with_column_stats_policy(ParquetStatisticsPolicy::SkipAll); let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( file.try_clone().unwrap(), arrow_options, @@ -1560,12 +1561,14 @@ pub(crate) mod tests { for column in row_group_metadata.columns() { assert!(column.page_encoding_stats().is_none()); assert!(column.page_encoding_stats_mask().is_none()); + assert!(column.statistics().is_none()); } // test skipping all but one column and converting to mask let arrow_options = ArrowReaderOptions::new() .with_encoding_stats_as_mask(true) - .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0])); + .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0])) + .with_column_stats_policy(ParquetStatisticsPolicy::skip_except(&[0])); let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( file.try_clone().unwrap(), arrow_options, @@ -1576,6 +1579,7 @@ pub(crate) mod tests { for (idx, column) in row_group_metadata.columns().iter().enumerate() { assert!(column.page_encoding_stats().is_none()); assert_eq!(column.page_encoding_stats_mask().is_some(), idx == 0); + assert_eq!(column.statistics().is_some(), idx == 0); } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 2b1c3f2e95aa..a74084715443 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1927,6 +1927,7 @@ mod tests { // test skipping all let options = ReadOptionsBuilder::new() .with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll) + .with_column_stats_policy(ParquetStatisticsPolicy::SkipAll) .build(); let file_reader = Arc::new( SerializedFileReader::new_with_options(file.try_clone().unwrap(), options).unwrap(), @@ -1936,12 +1937,14 @@ mod tests { for column in row_group_metadata.columns() { assert!(column.page_encoding_stats().is_none()); assert!(column.page_encoding_stats_mask().is_none()); + assert!(column.statistics().is_none()); } // test skipping all but one column let options = ReadOptionsBuilder::new() .with_encoding_stats_as_mask(true) .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0])) + .with_column_stats_policy(ParquetStatisticsPolicy::skip_except(&[0])) .build(); let file_reader = Arc::new( SerializedFileReader::new_with_options(file.try_clone().unwrap(), options).unwrap(), @@ -1951,6 +1954,7 @@ mod tests { for (idx, column) in row_group_metadata.columns().iter().enumerate() { assert!(column.page_encoding_stats().is_none()); assert_eq!(column.page_encoding_stats_mask().is_some(), idx == 0); + assert_eq!(column.statistics().is_some(), idx == 0); } } From 67bb86cfae88b9fef6f2f95835badbb31b922f65 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 16 Dec 2025 11:15:52 -0800 Subject: [PATCH 5/8] add tests of skipping size stats --- parquet/src/arrow/arrow_reader/mod.rs | 42 ++++++++++++++++++++++++++- parquet/src/file/serialized_reader.rs | 36 +++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 1dc4e9f30672..d8dac0dac54b 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1542,7 +1542,7 @@ pub(crate) mod tests { } #[test] - fn test_page_encoding_stats_skipped() { + fn test_stats_stats_skipped() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{testdata}/alltypes_tiny_pages.parquet"); let file = File::open(path).unwrap(); @@ -1583,6 +1583,46 @@ pub(crate) mod tests { } } + #[test] + fn test_size_stats_stats_skipped() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/repeated_primitive_no_list.parquet"); + let file = File::open(path).unwrap(); + + // test skipping all + let arrow_options = + ArrowReaderOptions::new().with_size_stats_policy(ParquetStatisticsPolicy::SkipAll); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + file.try_clone().unwrap(), + arrow_options, + ) + .unwrap(); + + let row_group_metadata = builder.metadata.row_group(0); + for column in row_group_metadata.columns() { + assert!(column.repetition_level_histogram().is_none()); + assert!(column.definition_level_histogram().is_none()); + assert!(column.unencoded_byte_array_data_bytes().is_none()); + } + + // test skipping all but one column and converting to mask + let arrow_options = ArrowReaderOptions::new() + .with_encoding_stats_as_mask(true) + .with_size_stats_policy(ParquetStatisticsPolicy::skip_except(&[1])); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + file.try_clone().unwrap(), + arrow_options, + ) + .unwrap(); + + let row_group_metadata = builder.metadata.row_group(0); + for (idx, column) in row_group_metadata.columns().iter().enumerate() { + assert_eq!(column.repetition_level_histogram().is_some(), idx == 1); + assert_eq!(column.definition_level_histogram().is_some(), idx == 1); + assert_eq!(column.unencoded_byte_array_data_bytes().is_some(), idx == 1); + } + } + #[test] fn test_arrow_reader_single_column() { let file = get_test_file("parquet/generated_simple_numerics/blogs.parquet"); diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index a74084715443..c075af445525 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1958,6 +1958,42 @@ mod tests { } } + #[test] + fn test_file_reader_size_stats_skipped() { + let file = get_test_file("repeated_primitive_no_list.parquet"); + + // test skipping all + let options = ReadOptionsBuilder::new() + .with_size_stats_policy(ParquetStatisticsPolicy::SkipAll) + .build(); + let file_reader = Arc::new( + SerializedFileReader::new_with_options(file.try_clone().unwrap(), options).unwrap(), + ); + + let row_group_metadata = file_reader.metadata.row_group(0); + for column in row_group_metadata.columns() { + assert!(column.repetition_level_histogram().is_none()); + assert!(column.definition_level_histogram().is_none()); + assert!(column.unencoded_byte_array_data_bytes().is_none()); + } + + // test skipping all but one column + let options = ReadOptionsBuilder::new() + .with_encoding_stats_as_mask(true) + .with_size_stats_policy(ParquetStatisticsPolicy::skip_except(&[1])) + .build(); + let file_reader = Arc::new( + SerializedFileReader::new_with_options(file.try_clone().unwrap(), options).unwrap(), + ); + + let row_group_metadata = file_reader.metadata.row_group(0); + for (idx, column) in row_group_metadata.columns().iter().enumerate() { + assert_eq!(column.repetition_level_histogram().is_some(), idx == 1); + assert_eq!(column.definition_level_histogram().is_some(), idx == 1); + assert_eq!(column.unencoded_byte_array_data_bytes().is_some(), idx == 1); + } + } + #[test] fn test_file_reader_with_no_filter() -> Result<()> { let test_file = get_test_file("alltypes_plain.parquet"); From 072ecf62ef7e78cb22423fc97149f0e5301672fc Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 16 Dec 2025 11:57:06 -0800 Subject: [PATCH 6/8] fix cut and paste error --- parquet/src/file/metadata/options.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/options.rs b/parquet/src/file/metadata/options.rs index c5342a83629a..8f421b1ead55 100644 --- a/parquet/src/file/metadata/options.rs +++ b/parquet/src/file/metadata/options.rs @@ -221,7 +221,7 @@ impl ParquetMetaDataOptions { /// /// The default policy is to decode all `size_statistics`. /// - /// [`statistics`]: + /// [`size_statistics`]: /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L936 pub fn set_size_stats_policy(&mut self, policy: ParquetStatisticsPolicy) { self.size_stats_policy = policy; From 835bff15ac5d9ed8b2f91899fb6baf4d0d9d5da5 Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 9 Jan 2026 14:07:40 -0800 Subject: [PATCH 7/8] revert some changes made to benchmarks so these still match 57.x --- parquet/benches/metadata.rs | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index e630c656eb93..a7f7f277ee68 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -179,14 +179,8 @@ fn criterion_benchmark(c: &mut Criterion) { }); let meta_data = get_footer_bytes(data.clone()); - c.bench_function("decode parquet metadata", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_metadata(&meta_data).unwrap(); - }) - }); - let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); - c.bench_function("decode metadata (full stats)", |b| { + c.bench_function("decode parquet metadata", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) .unwrap(); @@ -194,7 +188,9 @@ fn criterion_benchmark(c: &mut Criterion) { }); let schema = ParquetMetaDataReader::decode_schema(&meta_data).unwrap(); - let options = ParquetMetaDataOptions::new().with_schema(schema); + let options = ParquetMetaDataOptions::new() + .with_schema(schema) + .with_encoding_stats_as_mask(false); c.bench_function("decode metadata with schema", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) @@ -219,8 +215,9 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - let options = - ParquetMetaDataOptions::new().with_column_stats_policy(ParquetStatisticsPolicy::SkipAll); + let options = ParquetMetaDataOptions::new() + .with_column_stats_policy(ParquetStatisticsPolicy::SkipAll) + .with_encoding_stats_as_mask(false); c.bench_function("decode metadata with skip column stats", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options)) @@ -229,21 +226,17 @@ fn criterion_benchmark(c: &mut Criterion) { }); let buf: Bytes = black_box(encoded_meta(false, false)).into(); - c.bench_function("decode parquet metadata (wide)", |b| { - b.iter(|| { - ParquetMetaDataReader::decode_metadata(&buf).unwrap(); - }) - }); - let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false); - c.bench_function("decode metadata (wide) (full stats)", |b| { + c.bench_function("decode parquet metadata (wide)", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); }) }); let schema = ParquetMetaDataReader::decode_schema(&buf).unwrap(); - let options = ParquetMetaDataOptions::new().with_schema(schema); + let options = ParquetMetaDataOptions::new() + .with_schema(schema) + .with_encoding_stats_as_mask(false); c.bench_function("decode metadata (wide) with schema", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); @@ -265,8 +258,9 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - let options = - ParquetMetaDataOptions::new().with_column_stats_policy(ParquetStatisticsPolicy::SkipAll); + let options = ParquetMetaDataOptions::new() + .with_column_stats_policy(ParquetStatisticsPolicy::SkipAll) + .with_encoding_stats_as_mask(false); c.bench_function("decode metadata (wide) with skip column stats", |b| { b.iter(|| { ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap(); From dd633a1b6e0a087385b93174ceabd7179c7c622c Mon Sep 17 00:00:00 2001 From: seidl Date: Fri, 9 Jan 2026 14:28:50 -0800 Subject: [PATCH 8/8] change open benches to match old behavior --- parquet/benches/metadata.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index a7f7f277ee68..c9a6cf3b762c 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -168,12 +168,20 @@ fn criterion_benchmark(c: &mut Criterion) { let data = Bytes::from(data); c.bench_function("open(default)", |b| { - b.iter(|| SerializedFileReader::new(data.clone()).unwrap()) + b.iter(|| { + let options = ReadOptionsBuilder::new() + .with_encoding_stats_as_mask(false) + .build(); + SerializedFileReader::new_with_options(data.clone(), options).unwrap() + }) }); c.bench_function("open(page index)", |b| { b.iter(|| { - let options = ReadOptionsBuilder::new().with_page_index().build(); + let options = ReadOptionsBuilder::new() + .with_page_index() + .with_encoding_stats_as_mask(false) + .build(); SerializedFileReader::new_with_options(data.clone(), options).unwrap() }) });