From f1c6dc0314249a902631f00585cf23a048c1e436 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 26 Aug 2023 22:11:53 +0800 Subject: [PATCH 01/51] Parquet: Implement skeleton for BloomFilter --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/bloom_filter.h | 6 + cpp/src/parquet/bloom_filter_builder.cc | 117 +++++++++++++++++ cpp/src/parquet/bloom_filter_builder.h | 70 +++++++++++ cpp/src/parquet/column_writer.cc | 161 +++++++++++++++++++++--- cpp/src/parquet/column_writer.h | 4 +- cpp/src/parquet/file_writer.cc | 61 ++++++++- cpp/src/parquet/metadata.cc | 77 ++++++++---- cpp/src/parquet/metadata.h | 29 +++-- cpp/src/parquet/properties.h | 100 ++++++++++++++- 10 files changed, 570 insertions(+), 56 deletions(-) create mode 100644 cpp/src/parquet/bloom_filter_builder.cc create mode 100644 cpp/src/parquet/bloom_filter_builder.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index eb2e2d8fed88f..021e34fc8b09a 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -156,6 +156,7 @@ set(PARQUET_SRCS arrow/writer.cc bloom_filter.cc bloom_filter_reader.cc + Bloom_filter_builder.cc column_reader.cc column_scanner.cc column_writer.cc diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index e8ef5c0bd60db..963f4cc005741 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -167,6 +167,12 @@ class PARQUET_EXPORT BloomFilter { virtual ~BloomFilter() = default; + // Variant of const pointer argument to facilitate template + uint64_t Hash(const int32_t* value) const { return Hash(*value); } + uint64_t Hash(const int64_t* value) const { return Hash(*value); } + uint64_t Hash(const float* value) const { return Hash(*value); } + uint64_t Hash(const double* value) const { return Hash(*value); } + protected: // Hash strategy available for Bloom filter. 
enum class HashStrategy : uint32_t { XXHASH = 0 }; diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc new file mode 100644 index 0000000000000..7ad462c654f71 --- /dev/null +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -0,0 +1,117 @@ +#include "parquet/bloom_filter_builder.h" + +#include + +#include "arrow/io/interfaces.h" + +#include "metadata.h" +#include "parquet/bloom_filter.h" +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/internal_file_encryptor.h" +#include "parquet/exception.h" +#include "parquet/properties.h" + +namespace parquet { + +class BloomFilterBuilderImpl : public BloomFilterBuilder { + public: + explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema, + WriterProperties properties) + : schema_(schema), properties_(std::move(properties)) {} + /// Append a new row group to host all incoming bloom filters. + void AppendRowGroup() override; + + BloomFilter* GetOrCreateBloomFilter( + int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) override; + + /// Serialize all bloom filters with header and bitset in the order of row group and + /// column id. Column encryption is not implemented yet. The side effect is that it + /// deletes all bloom filters after they have been flushed. + void WriteTo(::arrow::io::OutputStream* sink, + BloomFilterLocation* location) override; + + void Finish() override { finished_ = true; } + + private: + /// Make sure column ordinal is not out of bound and the builder is in good state. 
+ void CheckState(int32_t column_ordinal) const { + if (finished_) { + throw ParquetException("PageIndexBuilder is already finished."); + } + if (column_ordinal < 0 || column_ordinal >= schema_->num_columns()) { + throw ParquetException("Invalid column ordinal: ", column_ordinal); + } + if (row_group_bloom_filters_.empty()) { + throw ParquetException("No row group appended to PageIndexBuilder."); + } + } + + const SchemaDescriptor* schema_; + WriterProperties properties_; + bool finished_ = false; + + // vector: row_group_ordinal + // map: column_ordinal -> bloom filter + std::vector>> row_group_bloom_filters_; +}; + +std::unique_ptr BloomFilterBuilder::Make( + const SchemaDescriptor* schema, const WriterProperties& properties) { + return std::unique_ptr( + new BloomFilterBuilderImpl(schema, properties)); +} + +void BloomFilterBuilderImpl::AppendRowGroup() { row_group_bloom_filters_.emplace_back(); } + +BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter( + int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) { + CheckState(column_ordinal); + std::unique_ptr& bloom_filter = + row_group_bloom_filters_.back()[column_ordinal]; + if (bloom_filter == nullptr) { + auto block_split_bloom_filter = std::make_unique(); + block_split_bloom_filter->Init(BlockSplitBloomFilter::OptimalNumOfBytes( + bloom_filter_options.ndv, bloom_filter_options.fpp)); + } + return bloom_filter.get(); +} + +void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, + BloomFilterLocation* location) { + if (!finished_) { + throw ParquetException("Cannot call WriteTo() to unfinished PageIndexBuilder."); + } + if (row_group_bloom_filters_.empty()) { + // Return quickly if there is no bloom filter + return; + } + + for (size_t row_group_ordinal = 0; row_group_ordinal < row_group_bloom_filters_.size(); + ++row_group_ordinal) { + const auto& row_group_bloom_filters = row_group_bloom_filters_[row_group_ordinal]; + // the whole row group has no bloom filter + if 
(row_group_bloom_filters.empty()) { + continue; + } + bool has_valid_bloom_filter = false; + int num_columns = schema_->num_columns(); + std::vector> locations(num_columns, std::nullopt); + + // serialize bloom filter by ascending order of column id + for (int32_t column_id = 0; column_id < num_columns; ++column_id) { + auto iter = row_group_bloom_filters.find(column_id); + if (iter != row_group_bloom_filters.cend()) { + PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); + iter->second->WriteTo(sink); + PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); + has_valid_bloom_filter = true; + locations[column_id] = IndexLocation{offset, static_cast(pos - offset)}; + } + } + if (has_valid_bloom_filter) { + location->bloom_filter_location.emplace(row_group_ordinal, std::move(locations)); + } + } +} + +} // namespace parquet diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h new file mode 100644 index 0000000000000..046c169f4e280 --- /dev/null +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. 
It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#pragma once + +#include "arrow/io/interfaces.h" +#include "parquet/types.h" + +namespace parquet { + +class BloomFilter; +class SchemaDescriptor; +class InternalFileEncryptor; +struct BloomFilterOptions; +struct BloomFilterLocation; + +namespace schema { +class ColumnPath; +} + +/// \brief Interface for collecting bloom filter of a parquet file. +class PARQUET_EXPORT BloomFilterBuilder { + public: + /// \brief API convenience to create a BloomFilterBuilder. + static std::unique_ptr Make(const SchemaDescriptor* schema, + const WriterProperties& properties); + + /// Append a new row group to host all incoming bloom filters. + virtual void AppendRowGroup() = 0; + + /// \brief Get the BloomFilter from column ordinal. + /// + /// \param column_ordinal Column ordinal in schema, which is only for leaf columns. + /// \return ColumnIndexBuilder for the column and its memory ownership belongs to + /// the PageIndexBuilder. + virtual BloomFilter* GetOrCreateBloomFilter( + int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) = 0; + + /// \brief Write the bloom filter to sink. + /// + /// \param[out] sink The output stream to write the bloom filter. + /// \param[out] location The location of all page index to the start of sink. + virtual void WriteTo(::arrow::io::OutputStream* sink, + BloomFilterLocation* location) = 0; + + /// \brief Complete the bloom filter builder and no more write is allowed. 
+ virtual void Finish() = 0; + + virtual ~BloomFilterBuilder() = default; +}; + +} // namespace parquet diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index e34420b9f6e79..22c37c5629119 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -43,6 +43,8 @@ #include "arrow/util/rle_encoding.h" #include "arrow/util/type_traits.h" #include "arrow/visit_array_inline.h" +#include "arrow/visit_data_inline.h" +#include "parquet/bloom_filter.h" #include "parquet/column_page.h" #include "parquet/encoding.h" #include "parquet/encryption/encryption_internal.h" @@ -735,7 +737,8 @@ class ColumnWriterImpl { public: ColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata, std::unique_ptr pager, const bool use_dictionary, - Encoding::type encoding, const WriterProperties* properties) + Encoding::type encoding, const WriterProperties* properties, + BloomFilter* bloom_filter) : metadata_(metadata), descr_(metadata->descr()), level_info_(ComputeLevelInfo(metadata->descr())), @@ -754,7 +757,8 @@ class ColumnWriterImpl { closed_(false), fallback_(false), definition_levels_sink_(allocator_), - repetition_levels_sink_(allocator_) { + repetition_levels_sink_(allocator_), + bloom_filter_(bloom_filter) { definition_levels_rle_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); repetition_levels_rle_ = @@ -885,6 +889,8 @@ class ColumnWriterImpl { std::vector> data_pages_; + BloomFilter* bloom_filter_; + private: void InitSinks() { definition_levels_sink_.Rewind(0); @@ -1203,9 +1209,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< TypedColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata, std::unique_ptr pager, const bool use_dictionary, - Encoding::type encoding, const WriterProperties* properties) - : ColumnWriterImpl(metadata, std::move(pager), use_dictionary, encoding, - properties) { + Encoding::type encoding, const WriterProperties* properties, + BloomFilter* bloom_filter) + : 
ColumnWriterImpl(metadata, std::move(pager), use_dictionary, encoding, properties, + bloom_filter) { current_encoder_ = MakeEncoder(DType::type_num, encoding, use_dictionary, descr_, properties->memory_pool()); // We have to dynamic_cast as some compilers don't want to static_cast @@ -1587,6 +1594,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< if (page_statistics_ != nullptr) { page_statistics_->Update(values, num_values, num_nulls); } + UpdateBloomFilter(values, num_values); } /// \brief Write values with spaces and update page statistics accordingly. @@ -1608,14 +1616,21 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< if (num_values != num_spaced_values) { current_value_encoder_->PutSpaced(values, static_cast(num_spaced_values), valid_bits, valid_bits_offset); + UpdateBloomFilterSpaced(values, num_spaced_values, valid_bits, valid_bits_offset); } else { current_value_encoder_->Put(values, static_cast(num_values)); + UpdateBloomFilter(values, num_values); } if (page_statistics_ != nullptr) { page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_spaced_values, num_values, num_nulls); } } + + void UpdateBloomFilter(const T* values, int64_t num_values); + void UpdateBloomFilterSpaced(const T* values, int64_t num_values, + const uint8_t* valid_bits, int64_t valid_bits_offset); + void UpdateBloomFilterArray(const ::arrow::Array& values); }; template @@ -1689,9 +1704,14 @@ Status TypedColumnWriterImpl::WriteArrowDictionary( } int64_t non_null_count = chunk_indices->length() - chunk_indices->null_count(); - page_statistics_->IncrementNullCount(num_chunk_levels - non_null_count); - page_statistics_->IncrementNumValues(non_null_count); - page_statistics_->Update(*referenced_dictionary, /*update_counts=*/false); + if (page_statistics_ != nullptr) { + page_statistics_->IncrementNullCount(num_chunk_levels - non_null_count); + page_statistics_->IncrementNumValues(non_null_count); + 
page_statistics_->Update(*referenced_dictionary, /*update_counts=*/false); + } + if (bloom_filter_ != nullptr) { + UpdateBloomFilterArray(*referenced_dictionary); + } }; int64_t value_offset = 0; @@ -1708,7 +1728,7 @@ Status TypedColumnWriterImpl::WriteArrowDictionary( AddIfNotNull(rep_levels, offset)); std::shared_ptr writeable_indices = indices->Slice(value_offset, batch_num_spaced_values); - if (page_statistics_) { + if (page_statistics_ || bloom_filter_) { update_stats(/*num_chunk_levels=*/batch_size, writeable_indices); } PARQUET_ASSIGN_OR_THROW( @@ -2163,6 +2183,9 @@ Status TypedColumnWriterImpl::WriteArrowDense( // ---------------------------------------------------------------------- // Write Arrow to BYTE_ARRAY +template <> +void TypedColumnWriterImpl::UpdateBloomFilterArray( + const ::arrow::Array& values); template <> Status TypedColumnWriterImpl::WriteArrowDense( @@ -2195,6 +2218,7 @@ Status TypedColumnWriterImpl::WriteArrowDense( page_statistics_->IncrementNullCount(batch_size - non_null); page_statistics_->IncrementNumValues(non_null); } + UpdateBloomFilterArray(*data_slice); CommitWriteAndCheckPageLimit(batch_size, batch_num_values, batch_size - non_null, check_page); CheckDictionarySizeLimit(); @@ -2319,12 +2343,113 @@ Status TypedColumnWriterImpl::WriteArrowDense( return Status::OK(); } +template +void TypedColumnWriterImpl::UpdateBloomFilter(const T* values, + int64_t num_values) { + if (bloom_filter_) { + // TODO(mwish): Would it allocate too much memory? Would an std::array + // better here? 
+ std::vector hashes(num_values); + bloom_filter_->Hashes(values, static_cast(num_values), hashes.data()); + bloom_filter_->InsertHashes(hashes.data(), static_cast(num_values)); + } +} + +template <> +void TypedColumnWriterImpl::UpdateBloomFilter(const FLBA* values, + int64_t num_values) { + if (bloom_filter_) { + for (int64_t i = 0; i < num_values; ++i) { + bloom_filter_->InsertHash(bloom_filter_->Hash(values + i, descr_->type_length())); + } + } +} + +template <> +void TypedColumnWriterImpl::UpdateBloomFilter(const bool*, int64_t) { + DCHECK(bloom_filter_ == nullptr); +} + +template +void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const T* values, + int64_t num_values, + const uint8_t* valid_bits, + int64_t valid_bits_offset) { + if (bloom_filter_) { + ::arrow::internal::VisitSetBitRunsVoid( + valid_bits, valid_bits_offset, num_values, [&](int64_t position, int64_t length) { + for (int64_t i = 0; i < length; i++) { + bloom_filter_->InsertHash(bloom_filter_->Hash(values + i + position)); + } + }); + } +} + +template <> +void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const bool*, int64_t, + const uint8_t*, + int64_t) { + DCHECK(bloom_filter_ == nullptr); +} + +template <> +void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const FLBA* values, + int64_t num_values, + const uint8_t* valid_bits, + int64_t valid_bits_offset) { + if (bloom_filter_) { + ::arrow::internal::VisitSetBitRunsVoid( + valid_bits, valid_bits_offset, num_values, [&](int64_t position, int64_t length) { + for (int64_t i = 0; i < length; i++) { + bloom_filter_->InsertHash( + bloom_filter_->Hash(values + i + position, descr_->type_length())); + } + }); + } +} + +template +void UpdateBinaryBloomFilter(BloomFilter* bloom_filter, const ArrayType& array) { + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( + *array.data(), + [&](const std::string_view& view) { + const ByteArray value{view}; + bloom_filter->InsertHash(bloom_filter->Hash(&value)); + return Status::OK(); + }, + []() { return 
Status::OK(); })); +} + +template <> +void TypedColumnWriterImpl::UpdateBloomFilterArray( + const ::arrow::Array& values) { + if (bloom_filter_) { + if (!::arrow::is_base_binary_like(values.type_id())) { + throw ParquetException("Only BaseBinaryArray and subclasses supported"); + } + if (!::arrow::is_dictionary(values.type_id())) { + // Dictionary is handled in WriteArrowDense, so this should never happen. + throw ParquetException("UpdateBloomFilterArray not support dictionary"); + } + + if (::arrow::is_binary_like(values.type_id())) { + UpdateBinaryBloomFilter(bloom_filter_, + checked_cast(values)); + } else { + DCHECK(::arrow::is_large_binary_like(values.type_id())); + UpdateBinaryBloomFilter(bloom_filter_, + checked_cast(values)); + } + } +} + // ---------------------------------------------------------------------- // Dynamic column writer constructor std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* metadata, std::unique_ptr pager, - const WriterProperties* properties) { + const WriterProperties* properties, + BloomFilter* bloom_filter) { const ColumnDescriptor* descr = metadata->descr(); const bool use_dictionary = properties->dictionary_enabled(descr->path()) && descr->physical_type() != Type::BOOLEAN; @@ -2335,28 +2460,28 @@ std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* met switch (descr->physical_type()) { case Type::BOOLEAN: return std::make_shared>( - metadata, std::move(pager), use_dictionary, encoding, properties); + metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); case Type::INT32: return std::make_shared>( - metadata, std::move(pager), use_dictionary, encoding, properties); + metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); case Type::INT64: return std::make_shared>( - metadata, std::move(pager), use_dictionary, encoding, properties); + metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); case Type::INT96: return 
std::make_shared>( - metadata, std::move(pager), use_dictionary, encoding, properties); + metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); case Type::FLOAT: return std::make_shared>( - metadata, std::move(pager), use_dictionary, encoding, properties); + metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); case Type::DOUBLE: return std::make_shared>( - metadata, std::move(pager), use_dictionary, encoding, properties); + metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); case Type::BYTE_ARRAY: return std::make_shared>( - metadata, std::move(pager), use_dictionary, encoding, properties); + metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_shared>( - metadata, std::move(pager), use_dictionary, encoding, properties); + metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); default: ParquetException::NYI("type reader not implemented"); } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 88a42acc2f706..625ca14007f73 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -44,6 +44,7 @@ class CodecOptions; namespace parquet { struct ArrowWriteContext; +class BloomFilter; class ColumnChunkMetaDataBuilder; class ColumnDescriptor; class ColumnIndexBuilder; @@ -143,7 +144,8 @@ class PARQUET_EXPORT ColumnWriter { static std::shared_ptr Make(ColumnChunkMetaDataBuilder*, std::unique_ptr, - const WriterProperties* properties); + const WriterProperties* properties, + BloomFilter* bloom_filter = nullptr); /// \brief Closes the ColumnWriter, commits any buffered values to pages. 
/// \return Total size of the column in bytes diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 2a6a88df2dd0a..4a5bd40d1e019 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -25,6 +25,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "parquet/bloom_filter_builder.h" #include "parquet/column_writer.h" #include "parquet/encryption/encryption_internal.h" #include "parquet/encryption/internal_file_encryptor.h" @@ -91,7 +92,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal, const WriterProperties* properties, bool buffered_row_group = false, InternalFileEncryptor* file_encryptor = nullptr, - PageIndexBuilder* page_index_builder = nullptr) + PageIndexBuilder* page_index_builder = nullptr, + BloomFilterBuilder* bloom_filter_builder = nullptr) : sink_(std::move(sink)), metadata_(metadata), properties_(properties), @@ -103,7 +105,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { num_rows_(0), buffered_row_group_(buffered_row_group), file_encryptor_(file_encryptor), - page_index_builder_(page_index_builder) { + page_index_builder_(page_index_builder), + bloom_filter_builder_(bloom_filter_builder) { if (buffered_row_group) { InitColumns(); } else { @@ -156,6 +159,16 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto codec_options = properties_->codec_options(path) ? 
properties_->codec_options(path).get() : nullptr; + BloomFilter* bloom_filter = nullptr; + std::optional bloom_filter_props = + properties_->bloom_filter_options(path); + bool bloom_filter_enabled = bloom_filter_builder_ && + bloom_filter_props.has_value() && + col_meta->descr()->physical_type() != Type::BOOLEAN; + if (bloom_filter_enabled) { + bloom_filter = bloom_filter_builder_->GetOrCreateBloomFilter( + column_ordinal, bloom_filter_props.value()); + } std::unique_ptr pager; if (!codec_options) { @@ -171,7 +184,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { data_encryptor, properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); } - column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_); + column_writers_[0] = + ColumnWriter::Make(col_meta, std::move(pager), properties_, bloom_filter); return column_writers_[0].get(); } @@ -262,6 +276,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { bool buffered_row_group_; InternalFileEncryptor* file_encryptor_; PageIndexBuilder* page_index_builder_; + BloomFilterBuilder* bloom_filter_builder_; void CheckRowsWritten() const { // verify when only one column is written at a time @@ -307,7 +322,17 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto codec_options = properties_->codec_options(path) ? 
(properties_->codec_options(path)).get() : nullptr; - + BloomFilter* bloom_filter = nullptr; + + std::optional bloom_filter_props = + properties_->bloom_filter_options(path); + bool bloom_filter_enabled = bloom_filter_builder_ && + bloom_filter_props.has_value() && + col_meta->descr()->physical_type() != Type::BOOLEAN; + if (bloom_filter_enabled) { + bloom_filter = bloom_filter_builder_->GetOrCreateBloomFilter( + column_ordinal, bloom_filter_props.value()); + } std::unique_ptr pager; if (!codec_options) { pager = PageWriter::Open( @@ -323,7 +348,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); } column_writers_.push_back( - ColumnWriter::Make(col_meta, std::move(pager), properties_)); + ColumnWriter::Make(col_meta, std::move(pager), properties_, bloom_filter)); } } @@ -360,6 +385,10 @@ class FileSerializer : public ParquetFileWriter::Contents { } row_group_writer_.reset(); + // In Parquet standard, the Bloom filter data can be stored before the page indexes + // after all row groups or stored between row groups. We choose to store it before + // the page indexes after all row groups. + WriteBloomFilter(); WritePageIndex(); // Write magic bytes and metadata @@ -484,6 +513,22 @@ class FileSerializer : public ParquetFileWriter::Contents { } } + void WriteBloomFilter() { + if (bloom_filter_builder_ != nullptr) { + if (properties_->file_encryption_properties()) { + throw ParquetException("Encryption is not supported with bloom filter"); + } + // Serialize page index after all row groups have been written and report + // location to the file metadata. + BloomFilterLocation bloom_filter_location; + // TODO(mwish): remove file_encryptor? 
+ bloom_filter_builder_->Finish(); + bloom_filter_builder_->WriteTo(sink_.get(), file_encryptor_.get(), + &bloom_filter_location); + metadata_->SetBloomFilterLocation(bloom_filter_location); + } + } + std::shared_ptr sink_; bool is_open_; const std::shared_ptr properties_; @@ -494,6 +539,8 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr row_group_writer_; std::unique_ptr page_index_builder_; std::unique_ptr file_encryptor_; + // Only one of the bloom filter writer is active at a time + std::unique_ptr bloom_filter_builder_; void StartFile() { auto file_encryption_properties = properties_->file_encryption_properties(); @@ -531,7 +578,9 @@ class FileSerializer : public ParquetFileWriter::Contents { PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4)); } } - + if (properties_->bloom_filter_enabled()) { + bloom_filter_builder_ = BloomFilterBuilder::Make(schema(), *properties_); + } if (properties_->page_index_enabled()) { page_index_builder_ = PageIndexBuilder::Make(&schema_); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 8aedf5b926add..5617221340607 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1781,7 +1781,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { key_value_metadata_(std::move(key_value_metadata)) { if (properties_->file_encryption_properties() != nullptr && properties_->file_encryption_properties()->encrypted_footer()) { - crypto_metadata_.reset(new format::FileCryptoMetaData()); + crypto_metadata_ = std::make_unique(); } } @@ -1793,36 +1793,65 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { } void SetPageIndexLocation(const PageIndexLocation& location) { - auto set_index_location = + auto set_index_location = [this](size_t row_group_ordinal, + const FileIndexLocation& file_index_location, + bool column_index) { + auto& row_group_metadata = this->row_groups_.at(row_group_ordinal); + auto iter = file_index_location.find(row_group_ordinal); + if 
(iter != file_index_location.cend()) { + const auto& row_group_index_location = iter->second; + for (size_t i = 0; i < row_group_index_location.size(); ++i) { + if (i >= row_group_metadata.columns.size()) { + throw ParquetException("Cannot find metadata for column ordinal ", i); + } + auto& column_metadata = row_group_metadata.columns.at(i); + const auto& index_location = row_group_index_location.at(i); + if (index_location.has_value()) { + if (column_index) { + column_metadata.__set_column_index_offset(index_location->offset); + column_metadata.__set_column_index_length(index_location->length); + } else { + column_metadata.__set_offset_index_offset(index_location->offset); + column_metadata.__set_offset_index_length(index_location->length); + } + } + } + } + }; + + for (size_t i = 0; i < row_groups_.size(); ++i) { + set_index_location(i, location.column_index_location, true); + set_index_location(i, location.offset_index_location, false); + } + } + + // Update location to all bloom filters in the parquet file. 
+ void SetBloomFilterLocation(const BloomFilterLocation& location) { + auto set_bloom_filter_location = [this](size_t row_group_ordinal, - const PageIndexLocation::FileIndexLocation& file_index_location, - bool column_index) { + const FileIndexLocation& file_bloom_filter_location) { auto& row_group_metadata = this->row_groups_.at(row_group_ordinal); - auto iter = file_index_location.find(row_group_ordinal); - if (iter != file_index_location.cend()) { - const auto& row_group_index_location = iter->second; - for (size_t i = 0; i < row_group_index_location.size(); ++i) { + auto iter = file_bloom_filter_location.find(row_group_ordinal); + if (iter != file_bloom_filter_location.cend()) { + const auto& row_group_bloom_filter_location = iter->second; + for (size_t i = 0; i < row_group_bloom_filter_location.size(); ++i) { if (i >= row_group_metadata.columns.size()) { throw ParquetException("Cannot find metadata for column ordinal ", i); } - auto& column_metadata = row_group_metadata.columns.at(i); - const auto& index_location = row_group_index_location.at(i); - if (index_location.has_value()) { - if (column_index) { - column_metadata.__set_column_index_offset(index_location->offset); - column_metadata.__set_column_index_length(index_location->length); - } else { - column_metadata.__set_offset_index_offset(index_location->offset); - column_metadata.__set_offset_index_length(index_location->length); - } + auto& column = row_group_metadata.columns.at(i); + auto& column_metadata = column.meta_data; + const auto& bloom_filter_location = row_group_bloom_filter_location.at(i); + if (bloom_filter_location.has_value()) { + column_metadata.__set_bloom_filter_offset(bloom_filter_location->offset); + // TODO(mwish): Allow this in Parquet 2.10. 
+ // column_metadata.__set_bloom_filter_length(bloom_filter_location->length); } } } }; for (size_t i = 0; i < row_groups_.size(); ++i) { - set_index_location(i, location.column_index_location, true); - set_index_location(i, location.offset_index_location, false); + set_bloom_filter_location(i, location.bloom_filter_location); } } @@ -1954,8 +1983,8 @@ std::unique_ptr FileMetaDataBuilder::Make( FileMetaDataBuilder::FileMetaDataBuilder( const SchemaDescriptor* schema, std::shared_ptr props, std::shared_ptr key_value_metadata) - : impl_{std::unique_ptr(new FileMetaDataBuilderImpl( - schema, std::move(props), std::move(key_value_metadata)))} {} + : impl_{std::make_unique(schema, std::move(props), + std::move(key_value_metadata))} {} FileMetaDataBuilder::~FileMetaDataBuilder() = default; @@ -1967,6 +1996,10 @@ void FileMetaDataBuilder::SetPageIndexLocation(const PageIndexLocation& location impl_->SetPageIndexLocation(location); } +void FileMetaDataBuilder::SetBloomFilterLocation(const BloomFilterLocation& location) { + impl_->SetBloomFilterLocation(location); +} + std::unique_ptr FileMetaDataBuilder::Finish( const std::shared_ptr& key_value_metadata) { return impl_->Finish(key_value_metadata); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index e62b2d187a20b..ccaac097b2298 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -157,7 +157,6 @@ class PARQUET_EXPORT ColumnChunkMetaData { const std::string& file_path() const; // column metadata - bool is_metadata_set() const; Type::type type() const; int64_t num_values() const; std::shared_ptr path_in_schema() const; @@ -508,21 +507,32 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { std::unique_ptr impl_; }; +/// Alias type of page index and bloom filter location of a row group. The index +/// location is located by column ordinal. If the column does not have the index, +/// its value is set to std::nullopt. 
+using RowGroupIndexLocation = std::vector>; + +/// Alias type of page index and bloom filter location of a parquet file. The +/// index location is located by the row group ordinal. +using FileIndexLocation = std::map; + /// \brief Public struct for location to all page indexes in a parquet file. struct PageIndexLocation { - /// Alias type of page index location of a row group. The index location - /// is located by column ordinal. If the column does not have the page index, - /// its value is set to std::nullopt. - using RowGroupIndexLocation = std::vector>; - /// Alias type of page index location of a parquet file. The index location - /// is located by the row group ordinal. - using FileIndexLocation = std::map; /// Row group column index locations which uses row group ordinal as the key. FileIndexLocation column_index_location; /// Row group offset index locations which uses row group ordinal as the key. FileIndexLocation offset_index_location; }; +/// \brief Public struct for location to all bloom filters in a parquet file. +struct BloomFilterLocation { + /// Row group bloom filter index locations which uses row group ordinal as the key. + /// + /// Note: Before Parquet 2.10, the bloom filter index only have "offset". But here + /// we use "IndexLocation" with length to support the future extension. + FileIndexLocation bloom_filter_location; +}; + class PARQUET_EXPORT FileMetaDataBuilder { public: ARROW_DEPRECATED("Deprecated in 12.0.0. Use overload without KeyValueMetadata instead.") @@ -542,6 +552,9 @@ class PARQUET_EXPORT FileMetaDataBuilder { // Update location to all page indexes in the parquet file void SetPageIndexLocation(const PageIndexLocation& location); + // Update location to all bloom filters in the parquet file. 
+ void SetBloomFilterLocation(const BloomFilterLocation& location); + // Complete the Thrift structure std::unique_ptr Finish( const std::shared_ptr& key_value_metadata = NULLPTR); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index bd7eb9dc7abd6..d67db86bf4e5a 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -139,6 +139,18 @@ static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN; static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false; +static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024; +static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05; + +/// Note: Currently we don't support bloom filter for boolean columns, +/// so if enable bloom filter for boolean columns, it will be ignored. +struct BloomFilterOptions { + int32_t ndv = DEFAULT_BLOOM_FILTER_NDV; + double fpp = DEFAULT_BLOOM_FILTER_FPP; +}; + +static constexpr std::optional DEFAULT_IS_BLOOM_FILTER_OPTIONS = + std::nullopt; class PARQUET_EXPORT ColumnProperties { public: @@ -147,7 +159,9 @@ class PARQUET_EXPORT ColumnProperties { bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE, - bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED) + bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED, + std::optional bloom_filter_options = + DEFAULT_IS_BLOOM_FILTER_OPTIONS) : encoding_(encoding), codec_(codec), dictionary_enabled_(dictionary_enabled), @@ -186,6 +200,24 @@ class PARQUET_EXPORT ColumnProperties { page_index_enabled_ = page_index_enabled; } + void set_bloom_filter_options(std::optional bloom_filter_options) { + if (bloom_filter_options) { + if (bloom_filter_options->fpp > 1.0 || bloom_filter_options->fpp < 0.0) { + throw ParquetException( + "Bloom 
Filter False positive probability must be between 0.0 and 1.0"); + } + } + bloom_filter_options_ = bloom_filter_options; + } + + void set_bloom_filter_enabled(bool bloom_filter_enabled) { + if (!bloom_filter_enabled) { + bloom_filter_options_ = std::nullopt; + } else if (!bloom_filter_options_) { + bloom_filter_options_ = BloomFilterOptions(); + } + } + Encoding::type encoding() const { return encoding_; } Compression::type compression() const { return codec_; } @@ -202,6 +234,12 @@ class PARQUET_EXPORT ColumnProperties { bool page_index_enabled() const { return page_index_enabled_; } + std::optional bloom_filter_options() const { + return bloom_filter_options_; + } + + bool bloom_filter_enabled() const { return bloom_filter_options_ != std::nullopt; } + private: Encoding::type encoding_; Compression::type codec_; @@ -210,6 +248,7 @@ class PARQUET_EXPORT ColumnProperties { size_t max_stats_size_; std::shared_ptr codec_options_; bool page_index_enabled_; + std::optional bloom_filter_options_; }; class PARQUET_EXPORT WriterProperties { @@ -532,6 +571,44 @@ class PARQUET_EXPORT WriterProperties { return this->disable_statistics(path->ToDotString()); } + /// Enable writing bloom filter in general for all columns. Default disabled. + /// + /// Please check the link below for more details: + /// https://github.com/apache/parquet-format/blob/master/BloomFilter.md + Builder* enable_bloom_filter() { + default_column_properties_.set_bloom_filter_enabled(true); + return this; + } + + /// Enable bloom filter for the column specified by `path`. + /// Default disabled. + Builder* enable_bloom_filter(const std::string& path) { + auto iter = bloom_filter_options_.find(path); + if (iter == bloom_filter_options_.end() || iter->second == std::nullopt) { + bloom_filter_options_[path] = BloomFilterOptions(); + } + return this; + } + + /// Enable bloom filter for the column specified by `path`. + /// Default disabled. 
+ Builder* enable_bloom_filter(const std::shared_ptr& path) { + return this->enable_bloom_filter(path->ToDotString()); + } + + /// Disable bloom filter for the column specified by `path`. + /// Default disabled. + Builder* disable_bloom_filter(const std::string& path) { + bloom_filter_options_[path] = std::nullopt; + return this; + } + + /// Disable bloom filter for the column specified by `path`. + /// Default disabled. + Builder* disable_bloom_filter(const std::shared_ptr& path) { + return this->disable_bloom_filter(path->ToDotString()); + } + /// Allow decimals with 1 <= precision <= 18 to be stored as integers. /// /// In Parquet, DECIMAL can be stored in any of the following physical types: @@ -629,6 +706,8 @@ class PARQUET_EXPORT WriterProperties { get(item.first).set_statistics_enabled(item.second); for (const auto& item : page_index_enabled_) get(item.first).set_page_index_enabled(item.second); + for (const auto& item : bloom_filter_options_) + get(item.first).set_bloom_filter_options(item.second); return std::shared_ptr(new WriterProperties( pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, @@ -663,6 +742,8 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map dictionary_enabled_; std::unordered_map statistics_enabled_; std::unordered_map page_index_enabled_; + std::unordered_map> + bloom_filter_options_; }; inline MemoryPool* memory_pool() const { return pool_; } @@ -757,6 +838,23 @@ class PARQUET_EXPORT WriterProperties { return false; } + bool bloom_filter_enabled() const { + if (default_column_properties_.bloom_filter_enabled()) { + return true; + } + for (const auto& item : column_properties_) { + if (item.second.bloom_filter_enabled()) { + return true; + } + } + return false; + } + + std::optional bloom_filter_options( + const std::shared_ptr& path) const { + return column_properties(path).bloom_filter_options(); + } + inline FileEncryptionProperties* file_encryption_properties() const { return 
file_encryption_properties_.get(); } From 6ebd6dafecc242531d7c7cf3d5705ce5106af043 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 26 Aug 2023 23:14:05 +0800 Subject: [PATCH 02/51] tiny fixing --- cpp/src/parquet/CMakeLists.txt | 4 +-- cpp/src/parquet/bloom_filter_builder.cc | 6 ++-- cpp/src/parquet/bloom_filter_builder.h | 1 - ...r_test.cc => bloom_filter_parquet_test.cc} | 34 +++++++++++++++++++ cpp/src/parquet/file_writer.cc | 4 +-- 5 files changed, 40 insertions(+), 9 deletions(-) rename cpp/src/parquet/{bloom_filter_reader_test.cc => bloom_filter_parquet_test.cc} (66%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 021e34fc8b09a..af5f978ca9647 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -156,7 +156,7 @@ set(PARQUET_SRCS arrow/writer.cc bloom_filter.cc bloom_filter_reader.cc - Bloom_filter_builder.cc + bloom_filter_builder.cc column_reader.cc column_scanner.cc column_writer.cc @@ -336,7 +336,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h" add_parquet_test(internals-test SOURCES bloom_filter_test.cc - bloom_filter_reader_test.cc + bloom_filter_parquet_test.cc properties_test.cc statistics_test.cc encoding_test.cc diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 7ad462c654f71..fafea30ba60c2 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -6,8 +6,6 @@ #include "metadata.h" #include "parquet/bloom_filter.h" -#include "parquet/encryption/encryption.h" -#include "parquet/encryption/internal_file_encryptor.h" #include "parquet/exception.h" #include "parquet/properties.h" @@ -69,9 +67,11 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter( std::unique_ptr& bloom_filter = row_group_bloom_filters_.back()[column_ordinal]; if (bloom_filter == nullptr) { - auto block_split_bloom_filter = std::make_unique(); + auto block_split_bloom_filter = + 
std::make_unique(properties_.memory_pool()); block_split_bloom_filter->Init(BlockSplitBloomFilter::OptimalNumOfBytes( bloom_filter_options.ndv, bloom_filter_options.fpp)); + bloom_filter = std::move(block_split_bloom_filter); } return bloom_filter.get(); } diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index 046c169f4e280..7b90a3c2bb096 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -28,7 +28,6 @@ namespace parquet { class BloomFilter; class SchemaDescriptor; -class InternalFileEncryptor; struct BloomFilterOptions; struct BloomFilterLocation; diff --git a/cpp/src/parquet/bloom_filter_reader_test.cc b/cpp/src/parquet/bloom_filter_parquet_test.cc similarity index 66% rename from cpp/src/parquet/bloom_filter_reader_test.cc rename to cpp/src/parquet/bloom_filter_parquet_test.cc index e297ab7045120..824cdd55498b9 100644 --- a/cpp/src/parquet/bloom_filter_reader_test.cc +++ b/cpp/src/parquet/bloom_filter_parquet_test.cc @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. +#include #include #include "parquet/bloom_filter.h" +#include "parquet/bloom_filter_builder.h" #include "parquet/bloom_filter_reader.h" #include "parquet/file_reader.h" #include "parquet/test_util.h" @@ -69,4 +71,36 @@ TEST(BloomFilterReader, FileNotHaveBloomFilter) { ASSERT_EQ(nullptr, bloom_filter); } +TEST(BloomFilterBuilderTest, Basic) { + SchemaDescriptor schema; + schema::NodePtr root = + schema::GroupNode::Make("schema", Repetition::REPEATED, {schema::ByteArray("c1")}); + schema.Init(root); + auto properties = WriterProperties::Builder().build(); + auto builder = BloomFilterBuilder::Make(&schema, *properties); + // AppendRowGroup() is not called and expect throw. 
+ BloomFilterOptions default_options; + ASSERT_THROW(builder->GetOrCreateBloomFilter(0, default_options), ParquetException); + + builder->AppendRowGroup(); + // GetOrCreateBloomFilter() with wrong column ordinal expect throw. + ASSERT_THROW(builder->GetOrCreateBloomFilter(1, default_options), ParquetException); + auto bloom_filter = builder->GetOrCreateBloomFilter(0, default_options); + bloom_filter->InsertHash(100); + bloom_filter->InsertHash(200); + builder->Finish(); + auto sink = CreateOutputStream(); + BloomFilterLocation location; + builder->WriteTo(sink.get(), &location); + EXPECT_EQ(1, location.bloom_filter_location.size()); + + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + ReaderProperties reader_properties; + ::arrow::io::BufferReader reader(buffer); + auto filter = parquet::BlockSplitBloomFilter::Deserialize(reader_properties, &reader); + EXPECT_TRUE(filter.FindHash(100)); + EXPECT_TRUE(filter.FindHash(200)); + EXPECT_FALSE(filter.FindHash(300)); +} + } // namespace parquet::test diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 4a5bd40d1e019..4b54fcc4fdf42 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -521,10 +521,8 @@ class FileSerializer : public ParquetFileWriter::Contents { // Serialize page index after all row groups have been written and report // location to the file metadata. BloomFilterLocation bloom_filter_location; - // TODO(mwish): remove file_encryptor? 
bloom_filter_builder_->Finish(); - bloom_filter_builder_->WriteTo(sink_.get(), file_encryptor_.get(), - &bloom_filter_location); + bloom_filter_builder_->WriteTo(sink_.get(), &bloom_filter_location); metadata_->SetBloomFilterLocation(bloom_filter_location); } } From 70c9267c63f261700c957897723c575dfb2e0f9f Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 26 Aug 2023 23:32:30 +0800 Subject: [PATCH 03/51] tiny update test --- cpp/src/parquet/bloom_filter_builder.cc | 2 +- cpp/src/parquet/bloom_filter_parquet_test.cc | 55 +++++++++++++++----- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index fafea30ba60c2..b6b3b5f99dd11 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -100,7 +100,7 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, // serialize bloom filter by ascending order of column id for (int32_t column_id = 0; column_id < num_columns; ++column_id) { auto iter = row_group_bloom_filters.find(column_id); - if (iter != row_group_bloom_filters.cend()) { + if (iter != row_group_bloom_filters.cend() && iter->second != nullptr) { PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); iter->second->WriteTo(sink); PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); diff --git a/cpp/src/parquet/bloom_filter_parquet_test.cc b/cpp/src/parquet/bloom_filter_parquet_test.cc index 824cdd55498b9..f598ea437a1a3 100644 --- a/cpp/src/parquet/bloom_filter_parquet_test.cc +++ b/cpp/src/parquet/bloom_filter_parquet_test.cc @@ -71,7 +71,44 @@ TEST(BloomFilterReader, FileNotHaveBloomFilter) { ASSERT_EQ(nullptr, bloom_filter); } -TEST(BloomFilterBuilderTest, Basic) { +// , c1 has bloom filter. 
+TEST(BloomFilterBuilderTest, BasicRoundTrip) { + SchemaDescriptor schema; + schema::NodePtr root = schema::GroupNode::Make( + "schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::ByteArray("c2")}); + schema.Init(root); + auto properties = WriterProperties::Builder().build(); + auto builder = BloomFilterBuilder::Make(&schema, *properties); + builder->AppendRowGroup(); + auto bloom_filter = builder->GetOrCreateBloomFilter(0, BloomFilterOptions()); + std::vector insert_hashes = {100, 200}; + for (uint64_t hash : insert_hashes) { + bloom_filter->InsertHash(hash); + } + builder->Finish(); + auto sink = CreateOutputStream(); + BloomFilterLocation location; + builder->WriteTo(sink.get(), &location); + EXPECT_EQ(1, location.bloom_filter_location.size()); + EXPECT_EQ(2, location.bloom_filter_location[0].size()); + EXPECT_TRUE(location.bloom_filter_location[0][0].has_value()); + EXPECT_FALSE(location.bloom_filter_location[0][1].has_value()); + + int32_t bloom_filter_offset = location.bloom_filter_location[0][0]->offset; + int32_t bloom_filter_length = location.bloom_filter_location[0][0]->length; + + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + ReaderProperties reader_properties; + ::arrow::io::BufferReader reader( + ::arrow::SliceBuffer(buffer, bloom_filter_offset, bloom_filter_length)); + auto filter = parquet::BlockSplitBloomFilter::Deserialize(reader_properties, &reader); + for (uint64_t hash : insert_hashes) { + EXPECT_TRUE(bloom_filter->FindHash(hash)); + } + EXPECT_FALSE(filter.FindHash(300)); +} + +TEST(BloomFilterBuilderTest, InvalidOperations) { SchemaDescriptor schema; schema::NodePtr root = schema::GroupNode::Make("schema", Repetition::REPEATED, {schema::ByteArray("c1")}); @@ -85,22 +122,14 @@ TEST(BloomFilterBuilderTest, Basic) { builder->AppendRowGroup(); // GetOrCreateBloomFilter() with wrong column ordinal expect throw. 
ASSERT_THROW(builder->GetOrCreateBloomFilter(1, default_options), ParquetException); - auto bloom_filter = builder->GetOrCreateBloomFilter(0, default_options); - bloom_filter->InsertHash(100); - bloom_filter->InsertHash(200); - builder->Finish(); + builder->GetOrCreateBloomFilter(0, default_options); auto sink = CreateOutputStream(); BloomFilterLocation location; + // WriteTo() before Finish() expect throw. + ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException); + builder->Finish(); builder->WriteTo(sink.get(), &location); EXPECT_EQ(1, location.bloom_filter_location.size()); - - ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); - ReaderProperties reader_properties; - ::arrow::io::BufferReader reader(buffer); - auto filter = parquet::BlockSplitBloomFilter::Deserialize(reader_properties, &reader); - EXPECT_TRUE(filter.FindHash(100)); - EXPECT_TRUE(filter.FindHash(200)); - EXPECT_FALSE(filter.FindHash(300)); } } // namespace parquet::test From 48350d855ff9d31e46d315930c5ba7f5ba7f3329 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 27 Aug 2023 00:21:56 +0800 Subject: [PATCH 04/51] trying to fix ci --- cpp/src/parquet/bloom_filter_parquet_test.cc | 12 ++- cpp/src/parquet/column_writer.cc | 6 ++ cpp/src/parquet/column_writer_test.cc | 99 +++++++++++++++++++- 3 files changed, 113 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_parquet_test.cc b/cpp/src/parquet/bloom_filter_parquet_test.cc index f598ea437a1a3..67b9b714f02c4 100644 --- a/cpp/src/parquet/bloom_filter_parquet_test.cc +++ b/cpp/src/parquet/bloom_filter_parquet_test.cc @@ -77,10 +77,16 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { schema::NodePtr root = schema::GroupNode::Make( "schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::ByteArray("c2")}); schema.Init(root); - auto properties = WriterProperties::Builder().build(); - auto builder = BloomFilterBuilder::Make(&schema, *properties); + auto writer_properties = default_writer_properties(); + auto 
builder = BloomFilterBuilder::Make(&schema, *writer_properties); builder->AppendRowGroup(); - auto bloom_filter = builder->GetOrCreateBloomFilter(0, BloomFilterOptions()); + BloomFilterOptions bloom_filter_options; + bloom_filter_options.ndv = 100; + auto bloom_filter = builder->GetOrCreateBloomFilter(0, bloom_filter_options); + ASSERT_NE(nullptr, bloom_filter); + ASSERT_EQ(bloom_filter->GetBitsetSize(), + BlockSplitBloomFilter::OptimalNumOfBytes(bloom_filter_options.ndv, + bloom_filter_options.fpp)); std::vector insert_hashes = {100, 200}; for (uint64_t hash : insert_hashes) { bloom_filter->InsertHash(hash); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 22c37c5629119..eca859daf3309 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2420,6 +2420,12 @@ void UpdateBinaryBloomFilter(BloomFilter* bloom_filter, const ArrayType& array) []() { return Status::OK(); })); } +template +void TypedColumnWriterImpl::UpdateBloomFilterArray(const ::arrow::Array& values) { + // Only ByteArray type would write ::arrow::Array directly. 
+ ParquetException::NYI("Unreachable"); +} + template <> void TypedColumnWriterImpl::UpdateBloomFilterArray( const ::arrow::Array& values) { diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 58199c402bd7a..2e19e6c830b00 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -25,6 +25,8 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" +#include "parquet/bloom_filter.h" +#include "parquet/bloom_filter_builder.h" #include "parquet/column_reader.h" #include "parquet/column_writer.h" #include "parquet/file_reader.h" @@ -373,7 +375,6 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { const ColumnDescriptor* descr_; - private: std::unique_ptr metadata_; std::shared_ptr<::arrow::io::BufferOutputStream> sink_; std::shared_ptr writer_properties_; @@ -1591,5 +1592,101 @@ TEST(TestColumnWriter, WriteDataPageV2HeaderNullCount) { } } +template +class TestBloomFilterWriter : public TestPrimitiveWriter { + public: + void SetUp() override { + TestPrimitiveWriter::SetUp(); + builder_ = nullptr; + bloom_filter_ = nullptr; + } + + std::shared_ptr> BuildWriterWithBloomFilter( + int64_t output_size = SMALL_SIZE, + const ColumnProperties& column_properties = ColumnProperties()); + + std::unique_ptr builder_; + BloomFilter* bloom_filter_; +}; + +template +std::shared_ptr> +TestBloomFilterWriter::BuildWriterWithBloomFilter( + int64_t output_size, const ColumnProperties& column_properties) { + this->sink_ = CreateOutputStream(); + WriterProperties::Builder wp_builder; + if (column_properties.encoding() == Encoding::PLAIN_DICTIONARY || + column_properties.encoding() == Encoding::RLE_DICTIONARY) { + wp_builder.enable_dictionary(); + wp_builder.dictionary_pagesize_limit(DICTIONARY_PAGE_SIZE); + } else { + wp_builder.disable_dictionary(); + wp_builder.encoding(column_properties.encoding()); + } + wp_builder.max_statistics_size(column_properties.max_statistics_size()); + 
this->writer_properties_ = wp_builder.build(); + + this->metadata_ = + ColumnChunkMetaDataBuilder::Make(this->writer_properties_, this->descr_); + std::unique_ptr pager = PageWriter::Open( + this->sink_, column_properties.compression(), this->metadata_.get()); + builder_ = BloomFilterBuilder::Make(&this->schema_, *this->writer_properties_); + // Initial RowGroup + builder_->AppendRowGroup(); + BloomFilterOptions options; + options.ndv = output_size; + bloom_filter_ = builder_->GetOrCreateBloomFilter(0, options); + std::shared_ptr writer = + ColumnWriter::Make(this->metadata_.get(), std::move(pager), + this->writer_properties_.get(), bloom_filter_); + return std::static_pointer_cast>(writer); +} + +// Note: BooleanType is Excluded. +using TestBloomFilterTypes = ::testing::Types; + +TYPED_TEST_SUITE(TestBloomFilterWriter, TestBloomFilterTypes); + +TYPED_TEST(TestBloomFilterWriter, Basic) { + this->GenerateData(SMALL_SIZE); + ColumnProperties column_properties; + column_properties.set_bloom_filter_enabled(true); + + auto writer = this->BuildWriterWithBloomFilter(SMALL_SIZE, column_properties); + writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); + writer->Close(); + + // Read all rows so we are sure that also the non-dictionary pages are read correctly + this->SetupValuesOut(SMALL_SIZE); + this->ReadColumnFully(); + ASSERT_EQ(SMALL_SIZE, this->values_read_); + this->values_.resize(SMALL_SIZE); + ASSERT_EQ(this->values_, this->values_out_); + + // Verify bloom filter + for (auto& value : this->values_) { + if constexpr (std::is_same_v) { + EXPECT_TRUE(this->bloom_filter_->FindHash( + this->bloom_filter_->Hash(&value, this->descr_->type_length()))); + } else { + EXPECT_TRUE(this->bloom_filter_->FindHash(this->bloom_filter_->Hash(&value))); + } + } +} + +using BooleanTestBloomFilterWriter = TestBloomFilterWriter; +TEST_F(BooleanTestBloomFilterWriter, BooleanNotSupport) { + this->GenerateData(SMALL_SIZE); + ColumnProperties column_properties; + 
column_properties.set_bloom_filter_enabled(true); + + auto writer = this->BuildWriterWithBloomFilter(SMALL_SIZE, column_properties); + writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); + writer->Close(); + + EXPECT_EQ(nullptr, bloom_filter_); +} + } // namespace test } // namespace parquet From d2a659e502cbbb9149c9a77635fe0c8558098144 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 27 Aug 2023 00:37:37 +0800 Subject: [PATCH 05/51] fix lint --- cpp/src/parquet/bloom_filter_builder.cc | 24 ++++++++++++++++++++++-- cpp/src/parquet/file_writer.cc | 3 +-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index b6b3b5f99dd11..a6e6ad9f9ede2 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -1,3 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module implements BloomFilterBuilder, which creates and accumulates +// bloom filters for the columns of each row group while a Parquet file is +// being written, and serializes them into the output stream. 
+ #include "parquet/bloom_filter_builder.h" #include @@ -25,8 +46,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { /// Serialize all bloom filters with header and bitset in the order of row group and /// column id. Column encryption is not implemented yet. The side effect is that it /// deletes all bloom filters after they have been flushed. - void WriteTo(::arrow::io::OutputStream* sink, - BloomFilterLocation* location) override; + void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) override; void Finish() override { finished_ = true; } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 4b54fcc4fdf42..b855e4752b8d1 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -162,8 +162,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { BloomFilter* bloom_filter = nullptr; std::optional bloom_filter_props = properties_->bloom_filter_options(path); - bool bloom_filter_enabled = bloom_filter_builder_ && - bloom_filter_props.has_value() && + bool bloom_filter_enabled = bloom_filter_builder_ && bloom_filter_props.has_value() && col_meta->descr()->physical_type() != Type::BOOLEAN; if (bloom_filter_enabled) { bloom_filter = bloom_filter_builder_->GetOrCreateBloomFilter( From 41236d87edd3ba468d8b7ccbb4850e90ce59e61c Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 27 Aug 2023 00:44:13 +0800 Subject: [PATCH 06/51] fix some style problem --- cpp/src/parquet/bloom_filter_builder.cc | 2 +- cpp/src/parquet/bloom_filter_builder.h | 6 +++--- cpp/src/parquet/file_writer.cc | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index a6e6ad9f9ede2..deaa689b5eb65 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -25,9 +25,9 @@ #include "arrow/io/interfaces.h" -#include "metadata.h" #include "parquet/bloom_filter.h" #include 
"parquet/exception.h" +#include "parquet/metadata.h" #include "parquet/properties.h" namespace parquet { diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index 7b90a3c2bb096..9806c6220d2c3 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -48,15 +48,15 @@ class PARQUET_EXPORT BloomFilterBuilder { /// \brief Get the BloomFilter from column ordinal. /// /// \param column_ordinal Column ordinal in schema, which is only for leaf columns. - /// \return ColumnIndexBuilder for the column and its memory ownership belongs to - /// the PageIndexBuilder. + /// \return BloomFilter for the column and its memory ownership belongs to + /// the BloomFilterBuilder. virtual BloomFilter* GetOrCreateBloomFilter( int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) = 0; /// \brief Write the bloom filter to sink. /// /// \param[out] sink The output stream to write the bloom filter. - /// \param[out] location The location of all page index to the start of sink. + /// \param[out] location The location of all bloom filter to the start of sink. virtual void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) = 0; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index b855e4752b8d1..e6a40a80495c3 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -523,6 +523,8 @@ class FileSerializer : public ParquetFileWriter::Contents { bloom_filter_builder_->Finish(); bloom_filter_builder_->WriteTo(sink_.get(), &bloom_filter_location); metadata_->SetBloomFilterLocation(bloom_filter_location); + // Release the memory for BloomFilter. 
+ bloom_filter_builder_ = nullptr; } } From 8afba819b3de7f8a501ac8b11c9f9c8b786ffc88 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 27 Aug 2023 02:09:16 +0800 Subject: [PATCH 07/51] add file roundtrip test --- .../parquet/arrow/arrow_reader_writer_test.cc | 113 +++++++++++++++++- cpp/src/parquet/bloom_filter_builder.cc | 7 +- cpp/src/parquet/bloom_filter_parquet_test.cc | 6 +- cpp/src/parquet/column_writer.cc | 2 +- cpp/src/parquet/column_writer.h | 2 +- cpp/src/parquet/column_writer_test.cc | 13 -- cpp/src/parquet/file_writer.cc | 8 +- cpp/src/parquet/properties.h | 6 + 8 files changed, 133 insertions(+), 24 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 8585b1ccf11aa..e24b10f0a7ddb 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -66,6 +66,8 @@ #include "parquet/arrow/schema.h" #include "parquet/arrow/test_util.h" #include "parquet/arrow/writer.h" +#include "parquet/bloom_filter.h" +#include "parquet/bloom_filter_reader.h" #include "parquet/column_writer.h" #include "parquet/file_writer.h" #include "parquet/page_index.h" @@ -5256,7 +5258,7 @@ auto encode_double = [](double value) { } // namespace -class ParquetPageIndexRoundTripTest : public ::testing::Test { +class ParquetIndexRoundTripTest { public: void WriteFile(const std::shared_ptr& writer_properties, const std::shared_ptr<::arrow::Table>& table) { @@ -5280,10 +5282,17 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test { ASSERT_OK_AND_ASSIGN(buffer_, sink->Finish()); } + protected: + std::shared_ptr buffer_; +}; + +class ParquetPageIndexRoundTripTest : public ::testing::Test, + public ParquetIndexRoundTripTest { + public: void ReadPageIndexes(int expect_num_row_groups, int expect_num_pages, const std::set& expect_columns_without_index = {}) { auto read_properties = default_arrow_reader_properties(); - auto reader = 
ParquetFileReader::Open(std::make_shared(buffer_)); + auto reader = ParquetFileReader::Open(std::make_shared(this->buffer_)); auto metadata = reader->metadata(); ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups()); @@ -5348,7 +5357,6 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test { } protected: - std::shared_ptr buffer_; std::vector column_indexes_; }; @@ -5584,5 +5592,104 @@ TEST_F(ParquetPageIndexRoundTripTest, EnablePerColumn) { /*null_counts=*/{0}})); } +class ParquetBloomFilterRoundTripTest : public ::testing::Test, + public ParquetIndexRoundTripTest { + public: + void ReadBloomFilters(int expect_num_row_groups, + const std::set& expect_columns_without_filter = {}) { + auto read_properties = default_arrow_reader_properties(); + auto reader = ParquetFileReader::Open(std::make_shared(buffer_)); + + auto metadata = reader->metadata(); + ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups()); + + auto& bloom_filter_reader = reader->GetBloomFilterReader(); + + for (int rg = 0; rg < metadata->num_row_groups(); ++rg) { + auto row_group_reader = bloom_filter_reader.RowGroup(rg); + ASSERT_NE(row_group_reader, nullptr); + + for (int col = 0; col < metadata->num_columns(); ++col) { + bool expect_no_bloom_filter = expect_columns_without_filter.find(col) != + expect_columns_without_filter.cend(); + + auto bloom_filter = row_group_reader->GetColumnBloomFilter(col); + if (expect_no_bloom_filter) { + ASSERT_EQ(bloom_filter, nullptr); + } else { + bloom_filters_.push_back(std::move(bloom_filter)); + } + } + } + } + + template + void verifyBloomFilter(const BloomFilter* bloom_filter, + const ::arrow::ChunkedArray& chunked_array) { + auto iter = ::arrow::stl::Begin(chunked_array); + auto end = ::arrow::stl::End(chunked_array); + while (iter != end) { + auto value = *iter; + if (value == std::nullopt) { + ++iter; + continue; + } + if constexpr (std::is_same_v) { + ByteArray ba(value.value()); + 
EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); + } else { + EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(value.value()))); + } + ++iter; + } + } + + protected: + std::vector> bloom_filters_; +}; + +TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { + BloomFilterOptions options; + options.ndv = 100; + auto writer_properties = WriterProperties::Builder() + .set_bloom_filter_options(options) + ->max_row_group_length(4) + ->build(); + auto schema = ::arrow::schema( + {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); + auto table = ::arrow::TableFromJSON(schema, {R"([ +[1, "a" ], +[2, "b" ], +[3, "c" ], +[null, "d"], +[5, null], +[6, "f" ] +])"}); + WriteFile(writer_properties, table); + + ReadBloomFilters(/*expect_num_row_groups=*/2); + ASSERT_EQ(4, bloom_filters_.size()); + { + ASSERT_NE(nullptr, bloom_filters_[0]); + auto col = table->column(0)->Slice(0, 4); + verifyBloomFilter<::arrow::Int64Type>(bloom_filters_[0].get(), *col); + } + { + ASSERT_NE(nullptr, bloom_filters_[1]); + auto col = table->column(1)->Slice(0, 4); + verifyBloomFilter<::arrow::StringType>(bloom_filters_[1].get(), *col); + } + { + ASSERT_NE(nullptr, bloom_filters_[2]); + auto col = table->column(0)->Slice(4, 2); + verifyBloomFilter<::arrow::Int64Type>(bloom_filters_[2].get(), *col); + } + { + ASSERT_NE(nullptr, bloom_filters_[3]); + auto col = table->column(1)->Slice(4, 2); + verifyBloomFilter<::arrow::StringType>(bloom_filters_[3].get(), *col); + } +} + } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index deaa689b5eb65..b2161f0343034 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -54,13 +54,16 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { /// Make sure column ordinal is not out of bound and the builder is in good state. 
void CheckState(int32_t column_ordinal) const { if (finished_) { - throw ParquetException("PageIndexBuilder is already finished."); + throw ParquetException("BloomFilterBuilder is already finished."); } if (column_ordinal < 0 || column_ordinal >= schema_->num_columns()) { throw ParquetException("Invalid column ordinal: ", column_ordinal); } if (row_group_bloom_filters_.empty()) { - throw ParquetException("No row group appended to PageIndexBuilder."); + throw ParquetException("No row group appended to BloomFilterBuilder."); + } + if (schema_->Column(column_ordinal)->physical_type() == Type::BOOLEAN) { + throw ParquetException("BloomFilterBuilder not supports Boolean."); } } diff --git a/cpp/src/parquet/bloom_filter_parquet_test.cc b/cpp/src/parquet/bloom_filter_parquet_test.cc index 67b9b714f02c4..38c158ce7b58a 100644 --- a/cpp/src/parquet/bloom_filter_parquet_test.cc +++ b/cpp/src/parquet/bloom_filter_parquet_test.cc @@ -116,8 +116,8 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { TEST(BloomFilterBuilderTest, InvalidOperations) { SchemaDescriptor schema; - schema::NodePtr root = - schema::GroupNode::Make("schema", Repetition::REPEATED, {schema::ByteArray("c1")}); + schema::NodePtr root = schema::GroupNode::Make( + "schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::Boolean("c2")}); schema.Init(root); auto properties = WriterProperties::Builder().build(); auto builder = BloomFilterBuilder::Make(&schema, *properties); @@ -127,6 +127,8 @@ TEST(BloomFilterBuilderTest, InvalidOperations) { builder->AppendRowGroup(); // GetOrCreateBloomFilter() with wrong column ordinal expect throw. + ASSERT_THROW(builder->GetOrCreateBloomFilter(2, default_options), ParquetException); + // GetOrCreateBloomFilter() with boolean expect throw. 
ASSERT_THROW(builder->GetOrCreateBloomFilter(1, default_options), ParquetException); builder->GetOrCreateBloomFilter(0, default_options); auto sink = CreateOutputStream(); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index eca859daf3309..34519f6d210b9 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2433,7 +2433,7 @@ void TypedColumnWriterImpl::UpdateBloomFilterArray( if (!::arrow::is_base_binary_like(values.type_id())) { throw ParquetException("Only BaseBinaryArray and subclasses supported"); } - if (!::arrow::is_dictionary(values.type_id())) { + if (::arrow::is_dictionary(values.type_id())) { // Dictionary is handled in WriteArrowDense, so this should never happen. throw ParquetException("UpdateBloomFilterArray not support dictionary"); } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 625ca14007f73..0070cd5aabece 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -145,7 +145,7 @@ class PARQUET_EXPORT ColumnWriter { static std::shared_ptr Make(ColumnChunkMetaDataBuilder*, std::unique_ptr, const WriterProperties* properties, - BloomFilter* bloom_filter = nullptr); + BloomFilter* bloom_filter = NULLPTR); /// \brief Closes the ColumnWriter, commits any buffered values to pages. 
/// \return Total size of the column in bytes diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 2e19e6c830b00..69463d9060d32 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1675,18 +1675,5 @@ TYPED_TEST(TestBloomFilterWriter, Basic) { } } -using BooleanTestBloomFilterWriter = TestBloomFilterWriter; -TEST_F(BooleanTestBloomFilterWriter, BooleanNotSupport) { - this->GenerateData(SMALL_SIZE); - ColumnProperties column_properties; - column_properties.set_bloom_filter_enabled(true); - - auto writer = this->BuildWriterWithBloomFilter(SMALL_SIZE, column_properties); - writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); - writer->Close(); - - EXPECT_EQ(nullptr, bloom_filter_); -} - } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index e6a40a80495c3..38fd39b6bb82e 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -421,9 +421,13 @@ class FileSerializer : public ParquetFileWriter::Contents { if (page_index_builder_) { page_index_builder_->AppendRowGroup(); } + if (bloom_filter_builder_) { + bloom_filter_builder_->AppendRowGroup(); + } std::unique_ptr contents(new RowGroupSerializer( sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), - buffered_row_group, file_encryptor_.get(), page_index_builder_.get())); + buffered_row_group, file_encryptor_.get(), page_index_builder_.get(), + bloom_filter_builder_.get())); row_group_writer_ = std::make_unique(std::move(contents)); return row_group_writer_.get(); } @@ -524,7 +528,7 @@ class FileSerializer : public ParquetFileWriter::Contents { bloom_filter_builder_->WriteTo(sink_.get(), &bloom_filter_location); metadata_->SetBloomFilterLocation(bloom_filter_location); // Release the memory for BloomFilter. 
- bloom_filter_builder_ = nullptr; + // bloom_filter_builder_ = nullptr; } } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index d67db86bf4e5a..74d3f76db596e 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -609,6 +609,12 @@ class PARQUET_EXPORT WriterProperties { return this->disable_bloom_filter(path->ToDotString()); } + Builder* set_bloom_filter_options( + std::optional bloom_filter_options) { + default_column_properties_.set_bloom_filter_options(bloom_filter_options); + return this; + } + /// Allow decimals with 1 <= precision <= 18 to be stored as integers. /// /// In Parquet, DECIMAL can be stored in any of the following physical types: From 96c6691fe5849b3be95d014b66969b36a845b6c2 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 27 Aug 2023 02:12:18 +0800 Subject: [PATCH 08/51] add file roundtrip test --- cpp/src/parquet/column_writer_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 69463d9060d32..01c62d0c5bbeb 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1634,7 +1634,7 @@ TestBloomFilterWriter::BuildWriterWithBloomFilter( // Initial RowGroup builder_->AppendRowGroup(); BloomFilterOptions options; - options.ndv = output_size; + options.ndv = static_cast(output_size); bloom_filter_ = builder_->GetOrCreateBloomFilter(0, options); std::shared_ptr writer = ColumnWriter::Make(this->metadata_.get(), std::move(pager), From c1313411da4b1a3951c2a5eeec3ffca51cbb0e03 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 27 Aug 2023 02:38:25 +0800 Subject: [PATCH 09/51] fix document and ci --- cpp/src/parquet/bloom_filter_builder.cc | 2 ++ cpp/src/parquet/bloom_filter_builder.h | 7 +++++-- cpp/src/parquet/bloom_filter_parquet_test.cc | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc 
b/cpp/src/parquet/bloom_filter_builder.cc index b2161f0343034..ab6e7fcdd54e8 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -21,7 +21,9 @@ #include "parquet/bloom_filter_builder.h" +#include #include +#include #include "arrow/io/interfaces.h" diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index 9806c6220d2c3..44f706dcdb3df 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -48,8 +48,11 @@ class PARQUET_EXPORT BloomFilterBuilder { /// \brief Get the BloomFilter from column ordinal. /// /// \param column_ordinal Column ordinal in schema, which is only for leaf columns. - /// \return BloomFilter for the column and its memory ownership belongs to - /// the BloomFilterBuilder. + /// \param bloom_filter_options The options(like num distinct values and false positive + /// rate) to create a BloomFilter. + /// + /// \return BloomFilter for the column and its memory ownership belongs to the + /// BloomFilterBuilder. 
virtual BloomFilter* GetOrCreateBloomFilter( int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) = 0; diff --git a/cpp/src/parquet/bloom_filter_parquet_test.cc b/cpp/src/parquet/bloom_filter_parquet_test.cc index 38c158ce7b58a..e0ba45b41f3f1 100644 --- a/cpp/src/parquet/bloom_filter_parquet_test.cc +++ b/cpp/src/parquet/bloom_filter_parquet_test.cc @@ -100,7 +100,7 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { EXPECT_TRUE(location.bloom_filter_location[0][0].has_value()); EXPECT_FALSE(location.bloom_filter_location[0][1].has_value()); - int32_t bloom_filter_offset = location.bloom_filter_location[0][0]->offset; + int64_t bloom_filter_offset = location.bloom_filter_location[0][0]->offset; int32_t bloom_filter_length = location.bloom_filter_location[0][0]->length; ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); From 220b58e52267910659bb666ec9c344e1c9d3a4f3 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 27 Aug 2023 03:26:46 +0800 Subject: [PATCH 10/51] Update: tiny style fix --- cpp/src/parquet/bloom_filter_builder.cc | 3 +-- cpp/src/parquet/file_writer.cc | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index ab6e7fcdd54e8..b18442cfa6ecf 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -80,8 +80,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { std::unique_ptr BloomFilterBuilder::Make( const SchemaDescriptor* schema, const WriterProperties& properties) { - return std::unique_ptr( - new BloomFilterBuilderImpl(schema, properties)); + return std::make_unique(schema, properties); } void BloomFilterBuilderImpl::AppendRowGroup() { row_group_bloom_filters_.emplace_back(); } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 38fd39b6bb82e..9341b3695dd9f 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -528,7 +528,7 
@@ class FileSerializer : public ParquetFileWriter::Contents { bloom_filter_builder_->WriteTo(sink_.get(), &bloom_filter_location); metadata_->SetBloomFilterLocation(bloom_filter_location); // Release the memory for BloomFilter. - // bloom_filter_builder_ = nullptr; + bloom_filter_builder_ = nullptr; } } From b756241c5eb6f949e181126d70b236d4f73caada Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 2 Sep 2023 14:50:27 +0800 Subject: [PATCH 11/51] Bloom Filter Resolve comments: 1. Try to using `std::array` to compute bloom hashing 2. Move Bloom class to type_fwd.h 3. Rename test to `bloom_filter_reader_writer_test` 4. Separate `properties` set options to `enable` and `disable`, and remove the set default --- cpp/src/parquet/CMakeLists.txt | 2 +- .../parquet/arrow/arrow_reader_writer_test.cc | 27 +++----- cpp/src/parquet/bloom_filter.h | 15 +++-- cpp/src/parquet/bloom_filter_builder.cc | 6 +- cpp/src/parquet/bloom_filter_builder.h | 14 ++--- ....cc => bloom_filter_reader_writer_test.cc} | 0 cpp/src/parquet/column_writer.cc | 56 ++++++++++------- cpp/src/parquet/column_writer.h | 1 - cpp/src/parquet/file_writer.cc | 2 +- cpp/src/parquet/metadata.cc | 2 +- cpp/src/parquet/metadata.h | 4 +- cpp/src/parquet/properties.h | 61 +++++++------------ cpp/src/parquet/type_fwd.h | 6 ++ 13 files changed, 94 insertions(+), 102 deletions(-) rename cpp/src/parquet/{bloom_filter_parquet_test.cc => bloom_filter_reader_writer_test.cc} (100%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index af5f978ca9647..5912575c21301 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -336,7 +336,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h" add_parquet_test(internals-test SOURCES bloom_filter_test.cc - bloom_filter_parquet_test.cc + bloom_filter_reader_writer_test.cc properties_test.cc statistics_test.cc encoding_test.cc diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc 
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index e24b10f0a7ddb..d1d4520e93904 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5624,23 +5624,13 @@ class ParquetBloomFilterRoundTripTest : public ::testing::Test, } template - void verifyBloomFilter(const BloomFilter* bloom_filter, + void VerifyBloomFilter(const BloomFilter* bloom_filter, const ::arrow::ChunkedArray& chunked_array) { - auto iter = ::arrow::stl::Begin(chunked_array); - auto end = ::arrow::stl::End(chunked_array); - while (iter != end) { - auto value = *iter; + for (auto value : ::arrow::stl::Iterate(chunked_array)) { if (value == std::nullopt) { - ++iter; continue; } - if constexpr (std::is_same_v) { - ByteArray ba(value.value()); - EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); - } else { - EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(value.value()))); - } - ++iter; + EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(value.value()))); } } @@ -5652,7 +5642,8 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { BloomFilterOptions options; options.ndv = 100; auto writer_properties = WriterProperties::Builder() - .set_bloom_filter_options(options) + .enable_bloom_filter_options(options, "c0") + ->enable_bloom_filter_options(options, "c1") ->max_row_group_length(4) ->build(); auto schema = ::arrow::schema( @@ -5672,22 +5663,22 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { { ASSERT_NE(nullptr, bloom_filters_[0]); auto col = table->column(0)->Slice(0, 4); - verifyBloomFilter<::arrow::Int64Type>(bloom_filters_[0].get(), *col); + VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[0].get(), *col); } { ASSERT_NE(nullptr, bloom_filters_[1]); auto col = table->column(1)->Slice(0, 4); - verifyBloomFilter<::arrow::StringType>(bloom_filters_[1].get(), *col); + VerifyBloomFilter<::arrow::StringType>(bloom_filters_[1].get(), *col); } { ASSERT_NE(nullptr, bloom_filters_[2]); auto col 
= table->column(0)->Slice(4, 2); - verifyBloomFilter<::arrow::Int64Type>(bloom_filters_[2].get(), *col); + VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[2].get(), *col); } { ASSERT_NE(nullptr, bloom_filters_[3]); auto col = table->column(1)->Slice(4, 2); - verifyBloomFilter<::arrow::StringType>(bloom_filters_[3].get(), *col); + VerifyBloomFilter<::arrow::StringType>(bloom_filters_[3].get(), *col); } } diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index 963f4cc005741..155cd778dced2 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -167,11 +167,16 @@ class PARQUET_EXPORT BloomFilter { virtual ~BloomFilter() = default; - // Variant of const pointer argument to facilitate template - uint64_t Hash(const int32_t* value) const { return Hash(*value); } - uint64_t Hash(const int64_t* value) const { return Hash(*value); } - uint64_t Hash(const float* value) const { return Hash(*value); } - uint64_t Hash(const double* value) const { return Hash(*value); } + // Variant of const reference argument to facilitate template + uint64_t Hash(const ByteArray& value) const { return Hash(&value); } + uint64_t Hash(const FLBA& value, uint32_t type_len) const { + return Hash(&value, type_len); + } + uint64_t Hash(const Int96& value) const { return Hash(&value); } + uint64_t Hash(const std::string_view& value) const { + ByteArray ba(value); + return Hash(&ba); + } protected: // Hash strategy available for Bloom filter. 
diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index b18442cfa6ecf..2686b362e9392 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -37,8 +37,8 @@ namespace parquet { class BloomFilterBuilderImpl : public BloomFilterBuilder { public: explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema, - WriterProperties properties) - : schema_(schema), properties_(std::move(properties)) {} + const WriterProperties& properties) + : schema_(schema), properties_(properties) {} /// Append a new row group to host all incoming bloom filters. void AppendRowGroup() override; @@ -65,7 +65,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { throw ParquetException("No row group appended to BloomFilterBuilder."); } if (schema_->Column(column_ordinal)->physical_type() == Type::BOOLEAN) { - throw ParquetException("BloomFilterBuilder not supports Boolean."); + throw ParquetException("BloomFilterBuilder does not support boolean type."); } } diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index 44f706dcdb3df..a61941cc44aa4 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -15,13 +15,9 @@ // specific language governing permissions and limitations // under the License. -// This module defines an abstract interface for iterating through pages in a -// Parquet column chunk within a row group. It could be extended in the future -// to iterate through all data pages in all chunks in a file. - #pragma once -#include "arrow/io/interfaces.h" +#include "arrow/io/type_fwd.h" #include "parquet/types.h" namespace parquet { @@ -31,10 +27,6 @@ class SchemaDescriptor; struct BloomFilterOptions; struct BloomFilterLocation; -namespace schema { -class ColumnPath; -} - /// \brief Interface for collecting bloom filter of a parquet file. 
class PARQUET_EXPORT BloomFilterBuilder { public: @@ -58,12 +50,16 @@ class PARQUET_EXPORT BloomFilterBuilder { /// \brief Write the bloom filter to sink. /// + /// The bloom filter must have been finished first. + /// /// \param[out] sink The output stream to write the bloom filter. /// \param[out] location The location of all bloom filter to the start of sink. virtual void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) = 0; /// \brief Complete the bloom filter builder and no more write is allowed. + /// + /// This method must be called before WriteTo. virtual void Finish() = 0; virtual ~BloomFilterBuilder() = default; diff --git a/cpp/src/parquet/bloom_filter_parquet_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc similarity index 100% rename from cpp/src/parquet/bloom_filter_parquet_test.cc rename to cpp/src/parquet/bloom_filter_reader_writer_test.cc diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 05002e0e422af..92408c0e972c7 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2349,21 +2349,31 @@ Status TypedColumnWriterImpl::WriteArrowDense( template void TypedColumnWriterImpl::UpdateBloomFilter(const T* values, int64_t num_values) { + constexpr int64_t kHashBatchSize = 256; if (bloom_filter_) { - // TODO(mwish): Would it allocate too much memory? Would an std::array - // better here? 
- std::vector hashes(num_values); - bloom_filter_->Hashes(values, static_cast(num_values), hashes.data()); - bloom_filter_->InsertHashes(hashes.data(), static_cast(num_values)); + std::array hashes; + for (int64_t i = 0; i < num_values; i += kHashBatchSize) { + int64_t current_hash_batch_size = std::min(kHashBatchSize, num_values - i); + bloom_filter_->Hashes(values, static_cast(current_hash_batch_size), + hashes.data()); + bloom_filter_->InsertHashes(hashes.data(), + static_cast(current_hash_batch_size)); + } } } template <> void TypedColumnWriterImpl::UpdateBloomFilter(const FLBA* values, int64_t num_values) { + constexpr int64_t kHashBatchSize = 256; if (bloom_filter_) { - for (int64_t i = 0; i < num_values; ++i) { - bloom_filter_->InsertHash(bloom_filter_->Hash(values + i, descr_->type_length())); + std::array hashes; + for (int64_t i = 0; i < num_values; i += kHashBatchSize) { + int64_t current_hash_batch_size = std::min(kHashBatchSize, num_values - i); + bloom_filter_->Hashes(values, descr_->type_length(), + static_cast(current_hash_batch_size), hashes.data()); + bloom_filter_->InsertHashes(hashes.data(), + static_cast(current_hash_batch_size)); } } } @@ -2382,7 +2392,7 @@ void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const T* values, ::arrow::internal::VisitSetBitRunsVoid( valid_bits, valid_bits_offset, num_values, [&](int64_t position, int64_t length) { for (int64_t i = 0; i < length; i++) { - bloom_filter_->InsertHash(bloom_filter_->Hash(values + i + position)); + bloom_filter_->InsertHash(bloom_filter_->Hash(*(values + i + position))); } }); } @@ -2416,19 +2426,12 @@ void UpdateBinaryBloomFilter(BloomFilter* bloom_filter, const ArrayType& array) PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( *array.data(), [&](const std::string_view& view) { - const ByteArray value{view}; - bloom_filter->InsertHash(bloom_filter->Hash(&value)); + bloom_filter->InsertHash(bloom_filter->Hash(view)); return Status::OK(); }, []() { return Status::OK(); })); } 
-template -void TypedColumnWriterImpl::UpdateBloomFilterArray(const ::arrow::Array& values) { - // Only ByteArray type would write ::arrow::Array directly. - ParquetException::NYI("Unreachable"); -} - template <> void TypedColumnWriterImpl::UpdateBloomFilterArray( const ::arrow::Array& values) { @@ -2436,10 +2439,6 @@ void TypedColumnWriterImpl::UpdateBloomFilterArray( if (!::arrow::is_base_binary_like(values.type_id())) { throw ParquetException("Only BaseBinaryArray and subclasses supported"); } - if (::arrow::is_dictionary(values.type_id())) { - // Dictionary is handled in WriteArrowDense, so this should never happen. - throw ParquetException("UpdateBloomFilterArray not support dictionary"); - } if (::arrow::is_binary_like(values.type_id())) { UpdateBinaryBloomFilter(bloom_filter_, @@ -2452,6 +2451,12 @@ void TypedColumnWriterImpl::UpdateBloomFilterArray( } } +template +void TypedColumnWriterImpl::UpdateBloomFilterArray(const ::arrow::Array& values) { + // Only ByteArray type would write ::arrow::Array directly. 
+ ParquetException::NYI("Unreachable"); +} + // ---------------------------------------------------------------------- // Dynamic column writer constructor @@ -2473,9 +2478,14 @@ std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* met encoding = properties->dictionary_index_encoding(); } switch (descr->physical_type()) { - case Type::BOOLEAN: + case Type::BOOLEAN: { + if (bloom_filter != nullptr) { + throw ParquetException("Bloom filter is not supported for boolean type"); + } return std::make_shared>( - metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); + metadata, std::move(pager), use_dictionary, encoding, properties, + /*bloom_filter=*/nullptr); + } case Type::INT32: return std::make_shared>( metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); @@ -2498,7 +2508,7 @@ std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* met return std::make_shared>( metadata, std::move(pager), use_dictionary, encoding, properties, bloom_filter); default: - ParquetException::NYI("type reader not implemented"); + ParquetException::NYI("column writer not implemented for this Parquet type"); } // Unreachable code, but suppress compiler warning return std::shared_ptr(nullptr); diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 0070cd5aabece..a4ba0fd97c73f 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -44,7 +44,6 @@ class CodecOptions; namespace parquet { struct ArrowWriteContext; -class BloomFilter; class ColumnChunkMetaDataBuilder; class ColumnDescriptor; class ColumnIndexBuilder; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 9341b3695dd9f..264d0e46b7de9 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -519,7 +519,7 @@ class FileSerializer : public ParquetFileWriter::Contents { void WriteBloomFilter() { if (bloom_filter_builder_ != nullptr) { if 
(properties_->file_encryption_properties()) { - throw ParquetException("Encryption is not supported with bloom filter"); + ParquetException::NYI("Encryption is not supported with bloom filter"); } // Serialize page index after all row groups have been written and report // location to the file metadata. diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 5617221340607..7dac85a29115a 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1843,7 +1843,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const auto& bloom_filter_location = row_group_bloom_filter_location.at(i); if (bloom_filter_location.has_value()) { column_metadata.__set_bloom_filter_offset(bloom_filter_location->offset); - // TODO(mwish): Allow this in Parquet 2.10. + // TODO(mwish): Allow this after Parquet 2.10 is released. // column_metadata.__set_bloom_filter_length(bloom_filter_location->length); } } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index ccaac097b2298..b5166d9be2873 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -508,8 +508,8 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { }; /// Alias type of page index and bloom filter location of a row group. The index -/// location is located by column ordinal. If the column does not have the index, -/// its value is set to std::nullopt. +/// location is located by column ordinal. If a column does not have a page index +/// (respectively bloom filter), its value is set to std::nullopt. using RowGroupIndexLocation = std::vector>; /// Alias type of page index and bloom filter location of a parquet file. 
The diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 1f47ea33afc8f..1320fddaeb679 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -142,10 +142,10 @@ static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false; static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024; static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05; -/// Note: Currently we don't support bloom filter for boolean columns, -/// so if enable bloom filter for boolean columns, it will be ignored. struct BloomFilterOptions { + /// The number of distinct values to expect to be inserted into the bloom. int32_t ndv = DEFAULT_BLOOM_FILTER_NDV; + /// The false positive probability expected from the bloom. double fpp = DEFAULT_BLOOM_FILTER_FPP; }; @@ -210,14 +210,6 @@ class PARQUET_EXPORT ColumnProperties { bloom_filter_options_ = bloom_filter_options; } - void set_bloom_filter_enabled(bool bloom_filter_enabled) { - if (!bloom_filter_enabled) { - bloom_filter_options_ = std::nullopt; - } else if (!bloom_filter_options_) { - bloom_filter_options_ = BloomFilterOptions(); - } - } - Encoding::type encoding() const { return encoding_; } Compression::type compression() const { return codec_; } @@ -571,31 +563,6 @@ class PARQUET_EXPORT WriterProperties { return this->disable_statistics(path->ToDotString()); } - /// Enable writing bloom filter in general for all columns. Default disabled. - /// - /// Please check the link below for more details: - /// https://github.com/apache/parquet-format/blob/master/BloomFilter.md - Builder* enable_bloom_filter() { - default_column_properties_.set_bloom_filter_enabled(true); - return this; - } - - /// Enable bloom filter for the column specified by `path`. - /// Default disabled. 
- Builder* enable_bloom_filter(const std::string& path) { - auto iter = bloom_filter_options_.find(path); - if (iter == bloom_filter_options_.end() || iter->second == std::nullopt) { - bloom_filter_options_[path] = BloomFilterOptions(); - } - return this; - } - - /// Enable bloom filter for the column specified by `path`. - /// Default disabled. - Builder* enable_bloom_filter(const std::shared_ptr& path) { - return this->enable_bloom_filter(path->ToDotString()); - } - /// Disable bloom filter for the column specified by `path`. /// Default disabled. Builder* disable_bloom_filter(const std::string& path) { @@ -609,12 +576,30 @@ class PARQUET_EXPORT WriterProperties { return this->disable_bloom_filter(path->ToDotString()); } - Builder* set_bloom_filter_options( - std::optional bloom_filter_options) { - default_column_properties_.set_bloom_filter_options(bloom_filter_options); + /// Enable bloom filter options for the column specified by `path`. + /// + /// Default disabled. + /// + /// Note: Currently we don't support bloom filter for boolean columns, + /// so if enable bloom filter for boolean columns, it will be ignored. + Builder* enable_bloom_filter_options(BloomFilterOptions bloom_filter_options, + const std::string& path) { + bloom_filter_options_[path] = bloom_filter_options; return this; } + /// Enable bloom filter options for the column specified by `path`. + /// + /// Default disabled. + /// + /// Note: Currently we don't support bloom filter for boolean columns, + /// so if enable bloom filter for boolean columns, it will be ignored. + Builder* enable_bloom_filter_options( + BloomFilterOptions bloom_filter_options, + const std::shared_ptr& path) { + return this->enable_bloom_filter_options(bloom_filter_options, path->ToDotString()); + } + /// Allow decimals with 1 <= precision <= 18 to be stored as integers. 
/// /// In Parquet, DECIMAL can be stored in any of the following physical types: diff --git a/cpp/src/parquet/type_fwd.h b/cpp/src/parquet/type_fwd.h index 3e66f32fc0322..1982eeac3dd69 100644 --- a/cpp/src/parquet/type_fwd.h +++ b/cpp/src/parquet/type_fwd.h @@ -79,6 +79,12 @@ class WriterPropertiesBuilder; class ArrowWriterProperties; class ArrowWriterPropertiesBuilder; +class BloomFilter; + +namespace schema { +class ColumnPath; +} // namespace schema + namespace arrow { class FileWriter; From f43505bf35b34c1aa355db8afd6ae2ff7c3efd4e Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 2 Sep 2023 15:29:52 +0800 Subject: [PATCH 12/51] make space writing a batched writing --- cpp/src/parquet/bloom_filter_builder.cc | 2 +- cpp/src/parquet/bloom_filter_reader.h | 2 +- cpp/src/parquet/column_writer.cc | 24 +++++++++++++++++------- cpp/src/parquet/page_index.h | 2 +- cpp/src/parquet/properties.h | 13 ++++--------- 5 files changed, 24 insertions(+), 19 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 2686b362e9392..8aae59e3e968c 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -121,7 +121,7 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, int num_columns = schema_->num_columns(); std::vector> locations(num_columns, std::nullopt); - // serialize bloom filter by ascending order of column id + // serialize bloom filter in ascending order of column id for (int32_t column_id = 0; column_id < num_columns; ++column_id) { auto iter = row_group_bloom_filters.find(column_id); if (iter != row_group_bloom_filters.cend() && iter->second != nullptr) { diff --git a/cpp/src/parquet/bloom_filter_reader.h b/cpp/src/parquet/bloom_filter_reader.h index cbd267dd1972d..46e046156da7a 100644 --- a/cpp/src/parquet/bloom_filter_reader.h +++ b/cpp/src/parquet/bloom_filter_reader.h @@ -17,7 +17,7 @@ #pragma once -#include "arrow/io/interfaces.h" +#include 
"arrow/io/type_fwd.h" #include "parquet/properties.h" #include "parquet/type_fwd.h" diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 92408c0e972c7..1b9a6f429a805 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1630,6 +1630,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } } + constexpr static inline int64_t kHashBatchSize = 256; void UpdateBloomFilter(const T* values, int64_t num_values); void UpdateBloomFilterSpaced(const T* values, int64_t num_values, const uint8_t* valid_bits, int64_t valid_bits_offset); @@ -2349,7 +2350,6 @@ Status TypedColumnWriterImpl::WriteArrowDense( template void TypedColumnWriterImpl::UpdateBloomFilter(const T* values, int64_t num_values) { - constexpr int64_t kHashBatchSize = 256; if (bloom_filter_) { std::array hashes; for (int64_t i = 0; i < num_values; i += kHashBatchSize) { @@ -2365,7 +2365,6 @@ void TypedColumnWriterImpl::UpdateBloomFilter(const T* values, template <> void TypedColumnWriterImpl::UpdateBloomFilter(const FLBA* values, int64_t num_values) { - constexpr int64_t kHashBatchSize = 256; if (bloom_filter_) { std::array hashes; for (int64_t i = 0; i < num_values; i += kHashBatchSize) { @@ -2389,10 +2388,16 @@ void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset) { if (bloom_filter_) { + std::array hashes; ::arrow::internal::VisitSetBitRunsVoid( valid_bits, valid_bits_offset, num_values, [&](int64_t position, int64_t length) { - for (int64_t i = 0; i < length; i++) { - bloom_filter_->InsertHash(bloom_filter_->Hash(*(values + i + position))); + for (int64_t i = 0; i < length; i += kHashBatchSize) { + auto current_hash_batch_size = std::min(kHashBatchSize, length - i); + bloom_filter_->Hashes(values + i + position, + static_cast(current_hash_batch_size), + hashes.data()); + bloom_filter_->InsertHashes(hashes.data(), + 
static_cast(current_hash_batch_size)); } }); } @@ -2411,11 +2416,16 @@ void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const FLBA* values const uint8_t* valid_bits, int64_t valid_bits_offset) { if (bloom_filter_) { + std::array hashes; ::arrow::internal::VisitSetBitRunsVoid( valid_bits, valid_bits_offset, num_values, [&](int64_t position, int64_t length) { - for (int64_t i = 0; i < length; i++) { - bloom_filter_->InsertHash( - bloom_filter_->Hash(values + i + position, descr_->type_length())); + for (int64_t i = 0; i < length; i += kHashBatchSize) { + auto current_hash_batch_size = std::min(kHashBatchSize, length - i); + bloom_filter_->Hashes(values + i + position, descr_->type_length(), + static_cast(current_hash_batch_size), + hashes.data()); + bloom_filter_->InsertHashes(hashes.data(), + static_cast(current_hash_batch_size)); } }); } diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index b6ea5fd6abc08..1d7a13d090668 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -17,7 +17,7 @@ #pragma once -#include "arrow/io/interfaces.h" +#include "arrow/io/type_fwd.h" #include "parquet/types.h" #include diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 1320fddaeb679..50048e2eb3f89 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -830,15 +830,10 @@ class PARQUET_EXPORT WriterProperties { } bool bloom_filter_enabled() const { - if (default_column_properties_.bloom_filter_enabled()) { - return true; - } - for (const auto& item : column_properties_) { - if (item.second.bloom_filter_enabled()) { - return true; - } - } - return false; + // Note: we disallow enable bloom filter by default for now. + // So no need to check `default_column_properties_.bloom_filter_enabled()`. 
+ return std::any_of(column_properties_.begin(), column_properties_.end(), + [](auto& p) { return p.second.bloom_filter_enabled(); }); } std::optional bloom_filter_options( From 3497f4a7c7508568fd068d04e93b96bc0fca1a3a Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 2 Sep 2023 15:51:55 +0800 Subject: [PATCH 13/51] update bloom_filter builder interface --- cpp/src/parquet/bloom_filter_builder.cc | 53 ++++++++++++++----------- cpp/src/parquet/bloom_filter_builder.h | 5 +-- cpp/src/parquet/file_writer.cc | 13 ++---- 3 files changed, 34 insertions(+), 37 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 8aae59e3e968c..7951bc95af060 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -37,13 +37,12 @@ namespace parquet { class BloomFilterBuilderImpl : public BloomFilterBuilder { public: explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema, - const WriterProperties& properties) - : schema_(schema), properties_(properties) {} + WriterProperties properties) + : schema_(schema), properties_(std::move(properties)) {} /// Append a new row group to host all incoming bloom filters. void AppendRowGroup() override; - BloomFilter* GetOrCreateBloomFilter( - int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) override; + BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) override; /// Serialize all bloom filters with header and bitset in the order of row group and /// column id. Column encryption is not implemented yet. 
The side effect is that it @@ -61,7 +60,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { if (column_ordinal < 0 || column_ordinal >= schema_->num_columns()) { throw ParquetException("Invalid column ordinal: ", column_ordinal); } - if (row_group_bloom_filters_.empty()) { + if (file_bloom_filters_.empty()) { throw ParquetException("No row group appended to BloomFilterBuilder."); } if (schema_->Column(column_ordinal)->physical_type() == Type::BOOLEAN) { @@ -73,9 +72,8 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { WriterProperties properties_; bool finished_ = false; - // vector: row_group_ordinal - // map: column_ordinal -> bloom filter - std::vector>> row_group_bloom_filters_; + using RowGroupBloomFilters = std::map>; + std::vector file_bloom_filters_; }; std::unique_ptr BloomFilterBuilder::Make( @@ -83,13 +81,20 @@ std::unique_ptr BloomFilterBuilder::Make( return std::make_unique(schema, properties); } -void BloomFilterBuilderImpl::AppendRowGroup() { row_group_bloom_filters_.emplace_back(); } +void BloomFilterBuilderImpl::AppendRowGroup() { file_bloom_filters_.emplace_back(); } -BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter( - int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) { +BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordinal) { CheckState(column_ordinal); - std::unique_ptr& bloom_filter = - row_group_bloom_filters_.back()[column_ordinal]; + if (schema_->Column(column_ordinal)->physical_type() == Type::BOOLEAN) { + return nullptr; + } + auto bloom_filter_options_opt = + properties_.bloom_filter_options(schema_->Column(column_ordinal)->path()); + if (bloom_filter_options_opt == std::nullopt) { + return nullptr; + } + BloomFilterOptions& bloom_filter_options = bloom_filter_options_opt.value(); + std::unique_ptr& bloom_filter = file_bloom_filters_.back()[column_ordinal]; if (bloom_filter == nullptr) { auto block_split_bloom_filter = 
std::make_unique(properties_.memory_pool()); @@ -105,14 +110,14 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, if (!finished_) { throw ParquetException("Cannot call WriteTo() to unfinished PageIndexBuilder."); } - if (row_group_bloom_filters_.empty()) { + if (file_bloom_filters_.empty()) { // Return quickly if there is no bloom filter return; } - for (size_t row_group_ordinal = 0; row_group_ordinal < row_group_bloom_filters_.size(); + for (size_t row_group_ordinal = 0; row_group_ordinal < file_bloom_filters_.size(); ++row_group_ordinal) { - const auto& row_group_bloom_filters = row_group_bloom_filters_[row_group_ordinal]; + const auto& row_group_bloom_filters = file_bloom_filters_[row_group_ordinal]; // the whole row group has no bloom filter if (row_group_bloom_filters.empty()) { continue; @@ -122,15 +127,15 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, std::vector> locations(num_columns, std::nullopt); // serialize bloom filter in ascending order of column id - for (int32_t column_id = 0; column_id < num_columns; ++column_id) { - auto iter = row_group_bloom_filters.find(column_id); - if (iter != row_group_bloom_filters.cend() && iter->second != nullptr) { - PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); - iter->second->WriteTo(sink); - PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); - has_valid_bloom_filter = true; - locations[column_id] = IndexLocation{offset, static_cast(pos - offset)}; + for (auto& [column_id, filter] : row_group_bloom_filters) { + if (filter == nullptr) { + continue; } + PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); + filter->WriteTo(sink); + PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); + has_valid_bloom_filter = true; + locations[column_id] = IndexLocation{offset, static_cast(pos - offset)}; } if (has_valid_bloom_filter) { location->bloom_filter_location.emplace(row_group_ordinal, std::move(locations)); diff --git a/cpp/src/parquet/bloom_filter_builder.h 
b/cpp/src/parquet/bloom_filter_builder.h index a61941cc44aa4..efca0a5aa5aa3 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -40,13 +40,10 @@ class PARQUET_EXPORT BloomFilterBuilder { /// \brief Get the BloomFilter from column ordinal. /// /// \param column_ordinal Column ordinal in schema, which is only for leaf columns. - /// \param bloom_filter_options The options(like num distinct values and false positive - /// rate) to create a BloomFilter. /// /// \return BloomFilter for the column and its memory ownership belongs to the /// BloomFilterBuilder. - virtual BloomFilter* GetOrCreateBloomFilter( - int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) = 0; + virtual BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) = 0; /// \brief Write the bloom filter to sink. /// diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 264d0e46b7de9..3963f07a88a83 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -159,15 +159,10 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto codec_options = properties_->codec_options(path) ? properties_->codec_options(path).get() : nullptr; - BloomFilter* bloom_filter = nullptr; - std::optional bloom_filter_props = - properties_->bloom_filter_options(path); - bool bloom_filter_enabled = bloom_filter_builder_ && bloom_filter_props.has_value() && - col_meta->descr()->physical_type() != Type::BOOLEAN; - if (bloom_filter_enabled) { - bloom_filter = bloom_filter_builder_->GetOrCreateBloomFilter( - column_ordinal, bloom_filter_props.value()); - } + BloomFilter* bloom_filter = + bloom_filter_builder_ + ? 
nullptr + : bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal); std::unique_ptr pager; if (!codec_options) { From fecd0f0f7873c737004f437d3247247f66bd52f3 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 2 Sep 2023 16:09:58 +0800 Subject: [PATCH 14/51] update BloomFilterBuilder arguments --- .../bloom_filter_reader_writer_test.cc | 26 ++++++++++++------- cpp/src/parquet/column_writer_test.cc | 17 ++++++++---- cpp/src/parquet/file_writer.cc | 13 ++-------- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index e0ba45b41f3f1..9b6afc6ad575f 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -77,12 +77,14 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { schema::NodePtr root = schema::GroupNode::Make( "schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::ByteArray("c2")}); schema.Init(root); - auto writer_properties = default_writer_properties(); - auto builder = BloomFilterBuilder::Make(&schema, *writer_properties); - builder->AppendRowGroup(); + WriterProperties::Builder properties_builder; BloomFilterOptions bloom_filter_options; bloom_filter_options.ndv = 100; - auto bloom_filter = builder->GetOrCreateBloomFilter(0, bloom_filter_options); + properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); + auto writer_properties = properties_builder.build(); + auto builder = BloomFilterBuilder::Make(&schema, *writer_properties); + builder->AppendRowGroup(); + auto bloom_filter = builder->GetOrCreateBloomFilter(0); ASSERT_NE(nullptr, bloom_filter); ASSERT_EQ(bloom_filter->GetBitsetSize(), BlockSplitBloomFilter::OptimalNumOfBytes(bloom_filter_options.ndv, @@ -119,18 +121,22 @@ TEST(BloomFilterBuilderTest, InvalidOperations) { schema::NodePtr root = schema::GroupNode::Make( "schema", Repetition::REPEATED, {schema::ByteArray("c1"), 
schema::Boolean("c2")}); schema.Init(root); - auto properties = WriterProperties::Builder().build(); + WriterProperties::Builder properties_builder; + BloomFilterOptions bloom_filter_options; + bloom_filter_options.ndv = 100; + properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); + properties_builder.enable_bloom_filter_options(bloom_filter_options, "c2"); + auto properties = properties_builder.build(); auto builder = BloomFilterBuilder::Make(&schema, *properties); // AppendRowGroup() is not called and expect throw. - BloomFilterOptions default_options; - ASSERT_THROW(builder->GetOrCreateBloomFilter(0, default_options), ParquetException); + ASSERT_THROW(builder->GetOrCreateBloomFilter(0), ParquetException); builder->AppendRowGroup(); // GetOrCreateBloomFilter() with wrong column ordinal expect throw. - ASSERT_THROW(builder->GetOrCreateBloomFilter(2, default_options), ParquetException); + ASSERT_THROW(builder->GetOrCreateBloomFilter(2), ParquetException); // GetOrCreateBloomFilter() with boolean expect throw. - ASSERT_THROW(builder->GetOrCreateBloomFilter(1, default_options), ParquetException); - builder->GetOrCreateBloomFilter(0, default_options); + ASSERT_THROW(builder->GetOrCreateBloomFilter(1), ParquetException); + builder->GetOrCreateBloomFilter(0); auto sink = CreateOutputStream(); BloomFilterLocation location; // WriteTo() before Finish() expect throw. 
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index a057414aa0162..e37cea22ebd9f 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1647,6 +1647,13 @@ TestBloomFilterWriter::BuildWriterWithBloomFilter( wp_builder.encoding(column_properties.encoding()); } wp_builder.max_statistics_size(column_properties.max_statistics_size()); + auto path = this->schema_.Column(0)->path(); + if (column_properties.bloom_filter_enabled()) { + wp_builder.enable_bloom_filter_options( + column_properties.bloom_filter_options().value(), path); + } else { + wp_builder.disable_bloom_filter(path); + } this->writer_properties_ = wp_builder.build(); this->metadata_ = @@ -1656,9 +1663,7 @@ TestBloomFilterWriter::BuildWriterWithBloomFilter( builder_ = BloomFilterBuilder::Make(&this->schema_, *this->writer_properties_); // Initial RowGroup builder_->AppendRowGroup(); - BloomFilterOptions options; - options.ndv = static_cast(output_size); - bloom_filter_ = builder_->GetOrCreateBloomFilter(0, options); + bloom_filter_ = builder_->GetOrCreateBloomFilter(0); std::shared_ptr writer = ColumnWriter::Make(this->metadata_.get(), std::move(pager), this->writer_properties_.get(), bloom_filter_); @@ -1674,7 +1679,9 @@ TYPED_TEST_SUITE(TestBloomFilterWriter, TestBloomFilterTypes); TYPED_TEST(TestBloomFilterWriter, Basic) { this->GenerateData(SMALL_SIZE); ColumnProperties column_properties; - column_properties.set_bloom_filter_enabled(true); + BloomFilterOptions options; + options.ndv = 10; + column_properties.set_bloom_filter_options(options); auto writer = this->BuildWriterWithBloomFilter(SMALL_SIZE, column_properties); writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); @@ -1693,7 +1700,7 @@ TYPED_TEST(TestBloomFilterWriter, Basic) { EXPECT_TRUE(this->bloom_filter_->FindHash( this->bloom_filter_->Hash(&value, this->descr_->type_length()))); } else { - 
EXPECT_TRUE(this->bloom_filter_->FindHash(this->bloom_filter_->Hash(&value))); + EXPECT_TRUE(this->bloom_filter_->FindHash(this->bloom_filter_->Hash(value))); } } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 3963f07a88a83..5e72599ca87ef 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -316,17 +316,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto codec_options = properties_->codec_options(path) ? (properties_->codec_options(path)).get() : nullptr; - BloomFilter* bloom_filter = nullptr; - - std::optional bloom_filter_props = - properties_->bloom_filter_options(path); - bool bloom_filter_enabled = bloom_filter_builder_ && - bloom_filter_props.has_value() && - col_meta->descr()->physical_type() != Type::BOOLEAN; - if (bloom_filter_enabled) { - bloom_filter = bloom_filter_builder_->GetOrCreateBloomFilter( - column_ordinal, bloom_filter_props.value()); - } + BloomFilter* bloom_filter = + bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal); std::unique_ptr pager; if (!codec_options) { pager = PageWriter::Open( From 29cc1c113a964de9f95681547b86f0ef04eba352 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 2 Sep 2023 16:14:09 +0800 Subject: [PATCH 15/51] fix compile --- cpp/src/parquet/column_writer_test.cc | 2 -- cpp/src/parquet/file_writer.cc | 9 +++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index e37cea22ebd9f..db64aa36af7b9 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -688,7 +688,6 @@ TYPED_TEST(TestPrimitiveWriter, RequiredLargeChunk) { // Just read the first SMALL_SIZE rows to ensure we could read it back in this->ReadColumn(); ASSERT_EQ(SMALL_SIZE, this->values_read_); - this->values_.resize(SMALL_SIZE); ASSERT_EQ(this->values_, this->values_out_); } @@ -1691,7 +1690,6 @@ TYPED_TEST(TestBloomFilterWriter, 
Basic) { this->SetupValuesOut(SMALL_SIZE); this->ReadColumnFully(); ASSERT_EQ(SMALL_SIZE, this->values_read_); - this->values_.resize(SMALL_SIZE); ASSERT_EQ(this->values_, this->values_out_); // Verify bloom filter diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 5e72599ca87ef..ca8416159da91 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -161,8 +161,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { : nullptr; BloomFilter* bloom_filter = bloom_filter_builder_ - ? nullptr - : bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal); + ? bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal) + : nullptr; std::unique_ptr pager; if (!codec_options) { @@ -317,7 +317,9 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ? (properties_->codec_options(path)).get() : nullptr; BloomFilter* bloom_filter = - bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal); + bloom_filter_builder_ + ? 
bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal) + : nullptr; std::unique_ptr pager; if (!codec_options) { pager = PageWriter::Open( @@ -528,7 +530,6 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr row_group_writer_; std::unique_ptr page_index_builder_; std::unique_ptr file_encryptor_; - // Only one of the bloom filter writer is active at a time std::unique_ptr bloom_filter_builder_; void StartFile() { From ffbb491ae7eff05a67971253b5b0cd63313c6086 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 2 Sep 2023 16:35:34 +0800 Subject: [PATCH 16/51] try to satisfy win compiler --- cpp/src/parquet/bloom_filter_builder.cc | 9 ++++++++- cpp/src/parquet/bloom_filter_builder.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 7951bc95af060..cd15b82b0ee65 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -81,7 +81,14 @@ std::unique_ptr BloomFilterBuilder::Make( return std::make_unique(schema, properties); } -void BloomFilterBuilderImpl::AppendRowGroup() { file_bloom_filters_.emplace_back(); } +void BloomFilterBuilderImpl::AppendRowGroup() { + if (finished_) { + throw ParquetException( + "Cannot call AppendRowGroup() to finished BloomFilterBuilder."); + } + RowGroupBloomFilters row_group_bloom_filters; + file_bloom_filters_.emplace_back(std::move(row_group_bloom_filters)); +} BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordinal) { CheckState(column_ordinal); diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index efca0a5aa5aa3..f638ce78e870e 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -35,6 +35,8 @@ class PARQUET_EXPORT BloomFilterBuilder { const WriterProperties& properties); /// Append a new row group to host all incoming bloom filters. 
+ /// + /// This method must be called before Finish. virtual void AppendRowGroup() = 0; /// \brief Get the BloomFilter from column ordinal. From 4d63428b80b9553e9b30e63133d03c1d9c26ad12 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 2 Sep 2023 17:16:35 +0800 Subject: [PATCH 17/51] change all to vector --- cpp/src/parquet/bloom_filter_builder.cc | 7 ++++--- cpp/src/parquet/column_writer_test.cc | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index cd15b82b0ee65..3853fe9a1f8b9 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -72,7 +72,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { WriterProperties properties_; bool finished_ = false; - using RowGroupBloomFilters = std::map>; + using RowGroupBloomFilters = std::vector>; std::vector file_bloom_filters_; }; @@ -86,7 +86,7 @@ void BloomFilterBuilderImpl::AppendRowGroup() { throw ParquetException( "Cannot call AppendRowGroup() to finished BloomFilterBuilder."); } - RowGroupBloomFilters row_group_bloom_filters; + RowGroupBloomFilters row_group_bloom_filters(schema_->num_columns()); file_bloom_filters_.emplace_back(std::move(row_group_bloom_filters)); } @@ -134,7 +134,8 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, std::vector> locations(num_columns, std::nullopt); // serialize bloom filter in ascending order of column id - for (auto& [column_id, filter] : row_group_bloom_filters) { + for (int column_id = 0; column_id < num_columns; ++column_id) { + auto& filter = row_group_bloom_filters[column_id]; if (filter == nullptr) { continue; } diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index db64aa36af7b9..5ff5ff1b56aa8 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -688,6 +688,7 @@ TYPED_TEST(TestPrimitiveWriter, RequiredLargeChunk) { 
// Just read the first SMALL_SIZE rows to ensure we could read it back in this->ReadColumn(); ASSERT_EQ(SMALL_SIZE, this->values_read_); + this->values_.resize(SMALL_SIZE); ASSERT_EQ(this->values_, this->values_out_); } From 8e9cb164dac8e2b5e3505a01e793459192d339aa Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 11 Sep 2023 18:40:40 +0800 Subject: [PATCH 18/51] resolve comment --- .../parquet/arrow/arrow_reader_writer_test.cc | 14 +++++++------- cpp/src/parquet/file_writer.cc | 3 +++ cpp/src/parquet/metadata.cc | 4 +--- cpp/src/parquet/properties.h | 17 +++++++---------- 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index d1d4520e93904..5d7df21222105 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5649,13 +5649,13 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { auto schema = ::arrow::schema( {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); auto table = ::arrow::TableFromJSON(schema, {R"([ -[1, "a" ], -[2, "b" ], -[3, "c" ], -[null, "d"], -[5, null], -[6, "f" ] -])"}); + [1, "a"], + [2, "b"], + [3, "c"], + [null, "d"], + [5, null], + [6, "f"] + ])"}); WriteFile(writer_properties, table); ReadBloomFilters(/*expect_num_row_groups=*/2); diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index ca8416159da91..1820819d2fe6e 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -375,6 +375,9 @@ class FileSerializer : public ParquetFileWriter::Contents { // In Parquet standard, the Bloom filter data can be stored before the page indexes // after all row groups or stored between row groups. We choose to store it before // the page indexes after all row groups. + // Also, Putting all bloom filters together may provide a good chance to coalesce + // I/Os of different bloom filters. 
Especially when only one column has enabled it, + // which is the common case. WriteBloomFilter(); WritePageIndex(); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 7dac85a29115a..b4f1d944991b9 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1835,9 +1835,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { if (iter != file_bloom_filter_location.cend()) { const auto& row_group_bloom_filter_location = iter->second; for (size_t i = 0; i < row_group_bloom_filter_location.size(); ++i) { - if (i >= row_group_metadata.columns.size()) { - throw ParquetException("Cannot find metadata for column ordinal ", i); - } + DCHECK(i < row_group_metadata.columns.size()); auto& column = row_group_metadata.columns.at(i); auto& column_metadata = column.meta_data; const auto& bloom_filter_location = row_group_bloom_filter_location.at(i); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 50048e2eb3f89..d727c91dfc2d0 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -142,16 +142,13 @@ static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false; static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024; static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05; -struct BloomFilterOptions { +struct PARQUET_EXPORT BloomFilterOptions { /// The number of distinct values to expect to be inserted into the bloom. int32_t ndv = DEFAULT_BLOOM_FILTER_NDV; /// The false positive probability expected from the bloom. 
double fpp = DEFAULT_BLOOM_FILTER_FPP; }; -static constexpr std::optional DEFAULT_IS_BLOOM_FILTER_OPTIONS = - std::nullopt; - class PARQUET_EXPORT ColumnProperties { public: ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING, @@ -159,9 +156,7 @@ class PARQUET_EXPORT ColumnProperties { bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE, - bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED, - std::optional bloom_filter_options = - DEFAULT_IS_BLOOM_FILTER_OPTIONS) + bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED) : encoding_(encoding), codec_(codec), dictionary_enabled_(dictionary_enabled), @@ -204,7 +199,8 @@ class PARQUET_EXPORT ColumnProperties { if (bloom_filter_options) { if (bloom_filter_options->fpp > 1.0 || bloom_filter_options->fpp < 0.0) { throw ParquetException( - "Bloom Filter False positive probability must be between 0.0 and 1.0"); + "Bloom Filter False positive probability must be between 0.0 and 1.0, got " + + std::to_string(bloom_filter_options->fpp)); } } bloom_filter_options_ = bloom_filter_options; @@ -830,8 +826,9 @@ class PARQUET_EXPORT WriterProperties { } bool bloom_filter_enabled() const { - // Note: we disallow enable bloom filter by default for now. - // So no need to check `default_column_properties_.bloom_filter_enabled()`. + // Note: We do not encourage enabling bloom filter for all columns. So + // default_column_properties_.bloom_filter_enabled is always false and + // cannot be altered by user. Thus we can safely skip checking it here. 
return std::any_of(column_properties_.begin(), column_properties_.end(), [](auto& p) { return p.second.bloom_filter_enabled(); }); } From feccee9b38b58de7667e499b625801133c0d6af8 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 10 Oct 2023 19:38:57 +0800 Subject: [PATCH 19/51] fix some comment --- cpp/src/parquet/bloom_filter_builder.cc | 13 ++++------- cpp/src/parquet/bloom_filter_builder.h | 23 ++++++++++++------- .../bloom_filter_reader_writer_test.cc | 6 ++--- cpp/src/parquet/file_writer.cc | 1 - cpp/src/parquet/metadata.cc | 6 ++--- cpp/src/parquet/properties.h | 12 +++++----- 6 files changed, 30 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 3853fe9a1f8b9..6d7397af4ac7f 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -49,8 +49,6 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { /// deletes all bloom filters after they have been flushed. void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) override; - void Finish() override { finished_ = true; } - private: /// Make sure column ordinal is not out of bound and the builder is in good state. 
void CheckState(int32_t column_ordinal) const { @@ -100,7 +98,7 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin if (bloom_filter_options_opt == std::nullopt) { return nullptr; } - BloomFilterOptions& bloom_filter_options = bloom_filter_options_opt.value(); + BloomFilterOptions& bloom_filter_options = *bloom_filter_options_opt; std::unique_ptr& bloom_filter = file_bloom_filters_.back()[column_ordinal]; if (bloom_filter == nullptr) { auto block_split_bloom_filter = @@ -114,13 +112,10 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) { - if (!finished_) { - throw ParquetException("Cannot call WriteTo() to unfinished PageIndexBuilder."); - } - if (file_bloom_filters_.empty()) { - // Return quickly if there is no bloom filter - return; + if (finished_) { + throw ParquetException("Cannot call WriteTo() multiple times."); } + finished_ = true; for (size_t row_group_ordinal = 0; row_group_ordinal < file_bloom_filters_.size(); ++row_group_ordinal) { diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index f638ce78e870e..4853e5edd5d22 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -28,6 +28,18 @@ struct BloomFilterOptions; struct BloomFilterLocation; /// \brief Interface for collecting bloom filter of a parquet file. +/// +/// ``` +/// auto bloom_filter_builder = BloomFilterBuilder::Make(schema, properties); +/// for (int i = 0; i < num_row_groups; i++) { +/// bloom_filter_builder->AppendRowGroup(); +/// auto* bloom_filter = +/// bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column); +/// // Add bloom filter entries. +/// // ... +/// } +/// bloom_filter_builder->WriteTo(sink, location); +/// ``` class PARQUET_EXPORT BloomFilterBuilder { public: /// \brief API convenience to create a BloomFilterBuilder. 
@@ -36,7 +48,7 @@ class PARQUET_EXPORT BloomFilterBuilder { /// Append a new row group to host all incoming bloom filters. /// - /// This method must be called before Finish. + /// This method must be called before WriteTo. virtual void AppendRowGroup() = 0; /// \brief Get the BloomFilter from column ordinal. @@ -49,18 +61,13 @@ class PARQUET_EXPORT BloomFilterBuilder { /// \brief Write the bloom filter to sink. /// - /// The bloom filter must have been finished first. + /// The bloom filter cannot be modified after this method is called. /// /// \param[out] sink The output stream to write the bloom filter. - /// \param[out] location The location of all bloom filter to the start of sink. + /// \param[out] location The location of all bloom filter relative to the start of sink. virtual void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) = 0; - /// \brief Complete the bloom filter builder and no more write is allowed. - /// - /// This method must be called before WriteTo. - virtual void Finish() = 0; - virtual ~BloomFilterBuilder() = default; }; diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index 94d5c215a2c16..042968aed7ae2 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -94,7 +94,6 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { for (uint64_t hash : insert_hashes) { bloom_filter->InsertHash(hash); } - builder->Finish(); auto sink = CreateOutputStream(); BloomFilterLocation location; builder->WriteTo(sink.get(), &location); @@ -140,11 +139,10 @@ TEST(BloomFilterBuilderTest, InvalidOperations) { builder->GetOrCreateBloomFilter(0); auto sink = CreateOutputStream(); BloomFilterLocation location; - // WriteTo() before Finish() expect throw. 
- ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException); - builder->Finish(); builder->WriteTo(sink.get(), &location); EXPECT_EQ(1, location.bloom_filter_location.size()); + // Multiple WriteTo() expect throw. + ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException); } } // namespace parquet::test diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 6eed86ee1efa1..ba461e27298ad 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -511,7 +511,6 @@ class FileSerializer : public ParquetFileWriter::Contents { // Serialize page index after all row groups have been written and report // location to the file metadata. BloomFilterLocation bloom_filter_location; - bloom_filter_builder_->Finish(); bloom_filter_builder_->WriteTo(sink_.get(), &bloom_filter_location); metadata_->SetBloomFilterLocation(bloom_filter_location); // Release the memory for BloomFilter. diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 29b0ce0c457c9..9b0a85dfd05e0 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1837,12 +1837,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const auto& row_group_bloom_filter_location = iter->second; for (size_t i = 0; i < row_group_bloom_filter_location.size(); ++i) { DCHECK(i < row_group_metadata.columns.size()); - auto& column = row_group_metadata.columns.at(i); + auto& column = row_group_metadata.columns[i]; auto& column_metadata = column.meta_data; - const auto& bloom_filter_location = row_group_bloom_filter_location.at(i); + const auto& bloom_filter_location = row_group_bloom_filter_location[i]; if (bloom_filter_location.has_value()) { column_metadata.__set_bloom_filter_offset(bloom_filter_location->offset); - // TODO(mwish): Allow this after Parquet 2.10 is released. + // TODO(mwish): GH-38181: Allow this after Parquet 2.10 is released. 
// column_metadata.__set_bloom_filter_length(bloom_filter_location->length); } } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 5b18cf1a1f23b..61555e5e98675 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -143,9 +143,9 @@ static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024; static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05; struct PARQUET_EXPORT BloomFilterOptions { - /// The number of distinct values to expect to be inserted into the bloom. + /// Expected number of distinct values to be inserted into the bloom filter. int32_t ndv = DEFAULT_BLOOM_FILTER_NDV; - /// The false positive probability expected from the bloom. + /// False-positive probability of the bloom filter. double fpp = DEFAULT_BLOOM_FILTER_FPP; }; @@ -199,7 +199,7 @@ class PARQUET_EXPORT ColumnProperties { if (bloom_filter_options) { if (bloom_filter_options->fpp > 1.0 || bloom_filter_options->fpp < 0.0) { throw ParquetException( - "Bloom Filter False positive probability must be between 0.0 and 1.0, got " + + "Bloom filter false-positive probability must fall in [0.0, 1.0], got " + std::to_string(bloom_filter_options->fpp)); } } @@ -582,7 +582,7 @@ class PARQUET_EXPORT WriterProperties { /// Default disabled. /// /// Note: Currently we don't support bloom filter for boolean columns, - /// so if enable bloom filter for boolean columns, it will be ignored. + /// so it will be ignored if the column is of boolean type. Builder* enable_bloom_filter_options(BloomFilterOptions bloom_filter_options, const std::string& path) { bloom_filter_options_[path] = bloom_filter_options; @@ -594,7 +594,7 @@ class PARQUET_EXPORT WriterProperties { /// Default disabled. /// /// Note: Currently we don't support bloom filter for boolean columns, - /// so if enable bloom filter for boolean columns, it will be ignored. + /// so it will be ignored if the column is of boolean type. 
Builder* enable_bloom_filter_options( BloomFilterOptions bloom_filter_options, const std::shared_ptr& path) { @@ -835,7 +835,7 @@ class PARQUET_EXPORT WriterProperties { // default_column_properties_.bloom_filter_enabled is always false and // cannot be altered by user. Thus we can safely skip checking it here. return std::any_of(column_properties_.begin(), column_properties_.end(), - [](auto& p) { return p.second.bloom_filter_enabled(); }); + [](const auto& p) { return p.second.bloom_filter_enabled(); }); } std::optional bloom_filter_options( From 90245e7261f8fd57dd3f709f128325b5e9210098 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 10 Oct 2023 19:49:26 +0800 Subject: [PATCH 20/51] add cached version test --- .../bloom_filter_reader_writer_test.cc | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index 042968aed7ae2..9bc1ca84bd565 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -131,6 +131,37 @@ TEST(BloomFilterBuilderTest, InvalidOperations) { // AppendRowGroup() is not called and expect throw. ASSERT_THROW(builder->GetOrCreateBloomFilter(0), ParquetException); + builder->AppendRowGroup(); + // GetOrCreateBloomFilter() with wrong column ordinal expect throw. + ASSERT_THROW(builder->GetOrCreateBloomFilter(2), ParquetException); + // GetOrCreateBloomFilter() with boolean expect throw. + ASSERT_THROW(builder->GetOrCreateBloomFilter(1), ParquetException); + auto filter = builder->GetOrCreateBloomFilter(0); + // Call GetOrCreateBloomFilter the second time it is actually a cached version. + EXPECT_EQ(filter, builder->GetOrCreateBloomFilter(0)); + auto sink = CreateOutputStream(); + BloomFilterLocation location; + builder->WriteTo(sink.get(), &location); + EXPECT_EQ(1, location.bloom_filter_location.size()); + // Multiple WriteTo() expect throw. 
+ ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException); +} + +TEST(BloomFilterBuilderTest, GetOrCreate) { + SchemaDescriptor schema; + schema::NodePtr root = schema::GroupNode::Make( + "schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::Boolean("c2")}); + schema.Init(root); + WriterProperties::Builder properties_builder; + BloomFilterOptions bloom_filter_options; + bloom_filter_options.ndv = 100; + properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); + properties_builder.enable_bloom_filter_options(bloom_filter_options, "c2"); + auto properties = properties_builder.build(); + auto builder = BloomFilterBuilder::Make(&schema, *properties); + // AppendRowGroup() is not called and expect throw. + ASSERT_THROW(builder->GetOrCreateBloomFilter(0), ParquetException); + builder->AppendRowGroup(); // GetOrCreateBloomFilter() with wrong column ordinal expect throw. ASSERT_THROW(builder->GetOrCreateBloomFilter(2), ParquetException); From d924e368771a100ff69384edd2747b28c4672360 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 10 Oct 2023 20:05:52 +0800 Subject: [PATCH 21/51] cleaning the code for column-props --- cpp/src/parquet/bloom_filter_builder.cc | 6 ++--- cpp/src/parquet/bloom_filter_builder.h | 4 ++- cpp/src/parquet/file_writer.cc | 33 +++++++++++++------------ 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 6d7397af4ac7f..e74c6c04e48da 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -90,11 +90,11 @@ void BloomFilterBuilderImpl::AppendRowGroup() { BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordinal) { CheckState(column_ordinal); - if (schema_->Column(column_ordinal)->physical_type() == Type::BOOLEAN) { + const ColumnDescriptor* column_descr = schema_->Column(column_ordinal); + if (column_descr->physical_type() == Type::BOOLEAN) { 
return nullptr; } - auto bloom_filter_options_opt = - properties_.bloom_filter_options(schema_->Column(column_ordinal)->path()); + auto bloom_filter_options_opt = properties_.bloom_filter_options(column_descr->path()); if (bloom_filter_options_opt == std::nullopt) { return nullptr; } diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index 4853e5edd5d22..289731f4d489c 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -56,7 +56,9 @@ class PARQUET_EXPORT BloomFilterBuilder { /// \param column_ordinal Column ordinal in schema, which is only for leaf columns. /// /// \return BloomFilter for the column and its memory ownership belongs to the - /// BloomFilterBuilder. + /// BloomFilterBuilder. It will throw an exception if the BloomFilter is already + /// Finished or column_ordinal is out of bound. It will return nullptr if bloom filter + /// is not enabled for the column. virtual BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) = 0; /// \brief Write the bloom filter to sink. diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index ba461e27298ad..71233df337fe5 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -143,22 +143,23 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const int32_t column_ordinal = next_column_index_++; const auto& path = col_meta->descr()->path(); + const ColumnProperties& column_properties = properties_->column_properties(path); auto meta_encryptor = file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString()) : nullptr; auto data_encryptor = file_encryptor_ ? 
file_encryptor_->GetColumnDataEncryptor(path->ToDotString()) : nullptr; - auto ci_builder = page_index_builder_ && properties_->page_index_enabled(path) && - properties_->statistics_enabled(path) + auto ci_builder = page_index_builder_ && column_properties.page_index_enabled() && + column_properties.statistics_enabled() ? page_index_builder_->GetColumnIndexBuilder(column_ordinal) : nullptr; - auto oi_builder = page_index_builder_ && properties_->page_index_enabled(path) + auto oi_builder = page_index_builder_ && column_properties.page_index_enabled() ? page_index_builder_->GetOffsetIndexBuilder(column_ordinal) : nullptr; - auto codec_options = properties_->codec_options(path) - ? properties_->codec_options(path).get() - : nullptr; + const CodecOptions* codec_options = column_properties.codec_options() + ? column_properties.codec_options().get() + : nullptr; BloomFilter* bloom_filter = bloom_filter_builder_ ? bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal) @@ -166,17 +167,17 @@ class RowGroupSerializer : public RowGroupWriter::Contents { std::unique_ptr pager; if (!codec_options) { - pager = PageWriter::Open(sink_, properties_->compression(path), col_meta, - row_group_ordinal_, static_cast(column_ordinal), - properties_->memory_pool(), false, meta_encryptor, - data_encryptor, properties_->page_checksum_enabled(), - ci_builder, oi_builder, CodecOptions()); + pager = PageWriter::Open( + sink_, column_properties.compression(), col_meta, row_group_ordinal_, + static_cast(column_ordinal), properties_->memory_pool(), + /*buffered_row_group=*/false, meta_encryptor, data_encryptor, + properties_->page_checksum_enabled(), ci_builder, oi_builder, CodecOptions()); } else { - pager = PageWriter::Open(sink_, properties_->compression(path), col_meta, - row_group_ordinal_, static_cast(column_ordinal), - properties_->memory_pool(), false, meta_encryptor, - data_encryptor, properties_->page_checksum_enabled(), - ci_builder, oi_builder, *codec_options); + pager = 
PageWriter::Open( + sink_, column_properties.compression(), col_meta, row_group_ordinal_, + static_cast(column_ordinal), properties_->memory_pool(), + /*buffered_row_group=*/false, meta_encryptor, data_encryptor, + properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); } column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_, bloom_filter); From 03401933034bba1d91bd10f46aaf4ec74c595201 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 10 Oct 2023 20:45:02 +0800 Subject: [PATCH 22/51] optimize get bf --- cpp/src/parquet/file_writer.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 71233df337fe5..e8f1d621741ad 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -161,7 +161,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ? column_properties.codec_options().get() : nullptr; BloomFilter* bloom_filter = - bloom_filter_builder_ + bloom_filter_builder_ && column_properties.page_index_enabled() ? bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal) : nullptr; @@ -301,6 +301,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { for (int i = 0; i < num_columns(); i++) { auto col_meta = metadata_->NextColumnChunk(); const auto& path = col_meta->descr()->path(); + const ColumnProperties& column_properties = properties_->column_properties(path); const int32_t column_ordinal = next_column_index_++; auto meta_encryptor = file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString()) @@ -308,17 +309,17 @@ class RowGroupSerializer : public RowGroupWriter::Contents { auto data_encryptor = file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString()) : nullptr; - auto ci_builder = page_index_builder_ && properties_->page_index_enabled(path) + auto ci_builder = page_index_builder_ && column_properties.page_index_enabled() ? 
page_index_builder_->GetColumnIndexBuilder(column_ordinal) : nullptr; - auto oi_builder = page_index_builder_ && properties_->page_index_enabled(path) + auto oi_builder = page_index_builder_ && column_properties.page_index_enabled() ? page_index_builder_->GetOffsetIndexBuilder(column_ordinal) : nullptr; - auto codec_options = properties_->codec_options(path) - ? (properties_->codec_options(path)).get() + auto codec_options = column_properties.codec_options() + ? column_properties.codec_options().get() : nullptr; BloomFilter* bloom_filter = - bloom_filter_builder_ + bloom_filter_builder_ && column_properties.bloom_filter_enabled() ? bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal) : nullptr; std::unique_ptr pager; From 23828e16d74c784f872d62dcdcabf752abb98103 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 17 Mar 2024 04:17:26 +0800 Subject: [PATCH 23/51] comment minor fix --- cpp/src/parquet/bloom_filter_reader_writer_test.cc | 4 ++-- cpp/src/parquet/column_writer_test.cc | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index 9bc1ca84bd565..9d508c6320cdb 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -28,7 +28,7 @@ namespace parquet::test { TEST(BloomFilterReader, ReadBloomFilter) { - std::string dir_string(parquet::test::get_data_dir()); + std::string dir_string(get_data_dir()); std::string path = dir_string + "/data_index_bloom_encoding_stats.parquet"; auto reader = ParquetFileReader::OpenFile(path, false); auto file_metadata = reader->metadata(); @@ -59,7 +59,7 @@ TEST(BloomFilterReader, ReadBloomFilter) { TEST(BloomFilterReader, FileNotHaveBloomFilter) { // Can still get a BloomFilterReader and a RowGroupBloomFilter // reader, but cannot get a non-null BloomFilter. 
- std::string dir_string(parquet::test::get_data_dir()); + std::string dir_string(get_data_dir()); std::string path = dir_string + "/alltypes_plain.parquet"; auto reader = ParquetFileReader::OpenFile(path, false); auto file_metadata = reader->metadata(); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 4ec8ded16a7b0..c14b29f95c6a6 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1706,8 +1706,7 @@ class TestBloomFilterWriter : public TestPrimitiveWriter { } std::shared_ptr> BuildWriterWithBloomFilter( - int64_t output_size = SMALL_SIZE, - const ColumnProperties& column_properties = ColumnProperties()); + int64_t output_size, const ColumnProperties& column_properties); std::unique_ptr builder_; BloomFilter* bloom_filter_; @@ -1761,7 +1760,7 @@ TYPED_TEST(TestBloomFilterWriter, Basic) { this->GenerateData(SMALL_SIZE); ColumnProperties column_properties; BloomFilterOptions options; - options.ndv = 10; + options.ndv = SMALL_SIZE; column_properties.set_bloom_filter_options(options); auto writer = this->BuildWriterWithBloomFilter(SMALL_SIZE, column_properties); From 6fd57dc91ece3ec53f2d01b3da8f082d64d39e9c Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 17 Mar 2024 04:54:05 +0800 Subject: [PATCH 24/51] fix comment and add bloom-filter-length --- cpp/src/parquet/bloom_filter_builder.cc | 4 +--- cpp/src/parquet/bloom_filter_builder.h | 16 +++++++++++----- cpp/src/parquet/metadata.cc | 4 ++-- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index e74c6c04e48da..8bfa7643e8f68 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -91,9 +91,7 @@ void BloomFilterBuilderImpl::AppendRowGroup() { BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordinal) { CheckState(column_ordinal); const ColumnDescriptor* 
column_descr = schema_->Column(column_ordinal); - if (column_descr->physical_type() == Type::BOOLEAN) { - return nullptr; - } + DCHECK_NE(column_descr->physical_type(), Type::BOOLEAN); auto bloom_filter_options_opt = properties_.bloom_filter_options(column_descr->path()); if (bloom_filter_options_opt == std::nullopt) { return nullptr; diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index 289731f4d489c..8ed09b8767771 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -35,20 +35,21 @@ struct BloomFilterLocation; /// bloom_filter_builder->AppendRowGroup(); /// auto* bloom_filter = /// bloom_filter_builder->GetOrCreateBloomFilter(bloom_filter_column); -/// // Add bloom filter entries. +/// // Add bloom filter entries in `bloom_filter`. /// // ... /// } /// bloom_filter_builder->WriteTo(sink, location); /// ``` class PARQUET_EXPORT BloomFilterBuilder { public: - /// \brief API convenience to create a BloomFilterBuilder. + /// \brief API to create a BloomFilterBuilder. static std::unique_ptr Make(const SchemaDescriptor* schema, const WriterProperties& properties); /// Append a new row group to host all incoming bloom filters. /// - /// This method must be called before WriteTo. + /// This method must be called before `GetOrCreateBloomFilter` + /// in a row group. virtual void AppendRowGroup() = 0; /// \brief Get the BloomFilter from column ordinal. @@ -57,8 +58,13 @@ class PARQUET_EXPORT BloomFilterBuilder { /// /// \return BloomFilter for the column and its memory ownership belongs to the /// BloomFilterBuilder. It will throw an exception if the BloomFilter is already - /// Finished or column_ordinal is out of bound. It will return nullptr if bloom filter - /// is not enabled for the column. + /// Finished or column_ordinal is out of bound. + /// + /// It will return nullptr if bloom filter is not enabled for the column. 
+ /// + /// It will throw exception when BloomFilter BloomFilterBuilder is called with + /// out-of-order `column_ordinal`, without calling `AppendRowGroup` before + /// `GetOrCreateBloomFilter`. virtual BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) = 0; /// \brief Write the bloom filter to sink. diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index da6cd539e92cb..fa2930baa36c8 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1853,8 +1853,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const auto& bloom_filter_location = row_group_bloom_filter_location[i]; if (bloom_filter_location.has_value()) { column_metadata.__set_bloom_filter_offset(bloom_filter_location->offset); - // TODO(mwish): GH-38181: Allow this after Parquet 2.10 is released. - // column_metadata.__set_bloom_filter_length(bloom_filter_location->length); + // bloom_filter_length is added in Parquet 2.10 + column_metadata.__set_bloom_filter_length(bloom_filter_location->length); } } } From 86a8760fa5a97731a0c59a98d9d20cd3f4785a67 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 17 Mar 2024 05:28:18 +0800 Subject: [PATCH 25/51] Fix a bf bug --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 3 ++- cpp/src/parquet/file_writer.cc | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index bae0d813771d0..0caf824fa77e7 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5754,8 +5754,9 @@ class ParquetBloomFilterRoundTripTest : public ::testing::Test, auto bloom_filter = row_group_reader->GetColumnBloomFilter(col); if (expect_no_bloom_filter) { - ASSERT_EQ(bloom_filter, nullptr); + ASSERT_EQ(nullptr, bloom_filter); } else { + ASSERT_NE(nullptr, bloom_filter); bloom_filters_.push_back(std::move(bloom_filter)); } } diff --git 
a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 0f9fb8b21d4bf..d485da0ccfa33 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -161,7 +161,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ? column_properties.codec_options().get() : nullptr; BloomFilter* bloom_filter = - bloom_filter_builder_ && column_properties.page_index_enabled() + bloom_filter_builder_ && column_properties.bloom_filter_enabled() ? bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal) : nullptr; From f8e724cc341cf679ca28e3604ff6a6e28763514c Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 17 Mar 2024 13:53:34 +0800 Subject: [PATCH 26/51] trying to use std::map for RowGroup filter --- cpp/src/parquet/bloom_filter_builder.cc | 30 ++++++++++++++----------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 8bfa7643e8f68..0294d14f9c899 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -49,6 +49,9 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { /// deletes all bloom filters after they have been flushed. void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) override; + BloomFilterBuilderImpl(const BloomFilterBuilderImpl&) = delete; + BloomFilterBuilderImpl(BloomFilterBuilderImpl&&) = default; + private: /// Make sure column ordinal is not out of bound and the builder is in good state. 
void CheckState(int32_t column_ordinal) const { @@ -70,7 +73,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { WriterProperties properties_; bool finished_ = false; - using RowGroupBloomFilters = std::vector>; + using RowGroupBloomFilters = std::map>; std::vector file_bloom_filters_; }; @@ -84,8 +87,7 @@ void BloomFilterBuilderImpl::AppendRowGroup() { throw ParquetException( "Cannot call AppendRowGroup() to finished BloomFilterBuilder."); } - RowGroupBloomFilters row_group_bloom_filters(schema_->num_columns()); - file_bloom_filters_.emplace_back(std::move(row_group_bloom_filters)); + file_bloom_filters_.emplace_back(); } BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordinal) { @@ -96,16 +98,21 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin if (bloom_filter_options_opt == std::nullopt) { return nullptr; } - BloomFilterOptions& bloom_filter_options = *bloom_filter_options_opt; - std::unique_ptr& bloom_filter = file_bloom_filters_.back()[column_ordinal]; - if (bloom_filter == nullptr) { + BloomFilterOptions bloom_filter_options = *bloom_filter_options_opt; + auto& row_group_bloom_filter = file_bloom_filters_.back(); + auto iter = row_group_bloom_filter.find(column_ordinal); + if (iter == row_group_bloom_filter.end()) { auto block_split_bloom_filter = std::make_unique(properties_.memory_pool()); block_split_bloom_filter->Init(BlockSplitBloomFilter::OptimalNumOfBytes( bloom_filter_options.ndv, bloom_filter_options.fpp)); - bloom_filter = std::move(block_split_bloom_filter); + auto insert_result = row_group_bloom_filter.emplace( + column_ordinal, std::move(block_split_bloom_filter)); + ARROW_CHECK(insert_result.second); + iter = insert_result.first; } - return bloom_filter.get(); + ARROW_CHECK(iter->second != nullptr); + return iter->second.get(); } void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, @@ -127,11 +134,8 @@ void 
BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, std::vector> locations(num_columns, std::nullopt); // serialize bloom filter in ascending order of column id - for (int column_id = 0; column_id < num_columns; ++column_id) { - auto& filter = row_group_bloom_filters[column_id]; - if (filter == nullptr) { - continue; - } + for (auto& [column_id, filter] : row_group_bloom_filters) { + ARROW_CHECK(filter != nullptr); PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); filter->WriteTo(sink); PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); From 447badf29a8ac987e08291ed9c4a7db8a7fabd9a Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 17 Mar 2024 14:45:40 +0800 Subject: [PATCH 27/51] trying to fix msvc compile --- cpp/src/parquet/bloom_filter_builder.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 0294d14f9c899..97b3500c62288 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -74,7 +74,10 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { bool finished_ = false; using RowGroupBloomFilters = std::map>; - std::vector file_bloom_filters_; + // Using unique_ptr because the `std::unique_ptr` is not copyable. + // MSVC has the issue below: https://github.com/microsoft/STL/issues/1036 + // So we use `std::unique_ptr>` to avoid the issue. 
+ std::vector> file_bloom_filters_; }; std::unique_ptr BloomFilterBuilder::Make( @@ -87,7 +90,7 @@ void BloomFilterBuilderImpl::AppendRowGroup() { throw ParquetException( "Cannot call AppendRowGroup() to finished BloomFilterBuilder."); } - file_bloom_filters_.emplace_back(); + file_bloom_filters_.emplace_back(std::make_unique()); } BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordinal) { @@ -99,7 +102,7 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin return nullptr; } BloomFilterOptions bloom_filter_options = *bloom_filter_options_opt; - auto& row_group_bloom_filter = file_bloom_filters_.back(); + RowGroupBloomFilters& row_group_bloom_filter = *file_bloom_filters_.back(); auto iter = row_group_bloom_filter.find(column_ordinal); if (iter == row_group_bloom_filter.end()) { auto block_split_bloom_filter = @@ -124,7 +127,8 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, for (size_t row_group_ordinal = 0; row_group_ordinal < file_bloom_filters_.size(); ++row_group_ordinal) { - const auto& row_group_bloom_filters = file_bloom_filters_[row_group_ordinal]; + RowGroupBloomFilters& row_group_bloom_filters = + *file_bloom_filters_[row_group_ordinal]; // the whole row group has no bloom filter if (row_group_bloom_filters.empty()) { continue; From 0c1065c8924d98baf21b29b8ed0dfed0831d58d5 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 17 Mar 2024 21:49:20 +0800 Subject: [PATCH 28/51] fix comment --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 0caf824fa77e7..e3b86ceb48cb6 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5737,7 +5737,8 @@ class ParquetBloomFilterRoundTripTest : public ::testing::Test, void ReadBloomFilters(int 
expect_num_row_groups, const std::set& expect_columns_without_filter = {}) { auto read_properties = default_arrow_reader_properties(); - auto reader = ParquetFileReader::Open(std::make_shared(buffer_)); + auto reader = + ParquetFileReader::Open(std::make_shared(buffer_), read_properties); auto metadata = reader->metadata(); ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups()); From 5225e08d388db7397ae3a1a582273976f9c85ff0 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 17 Mar 2024 23:03:52 +0800 Subject: [PATCH 29/51] add test case for 2 row-groups --- .../bloom_filter_reader_writer_test.cc | 133 +++++++++++------- 1 file changed, 85 insertions(+), 48 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index 9d508c6320cdb..5fed99589f2dd 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+#include #include #include "arrow/testing/gtest_util.h" @@ -28,31 +29,41 @@ namespace parquet::test { TEST(BloomFilterReader, ReadBloomFilter) { - std::string dir_string(get_data_dir()); - std::string path = dir_string + "/data_index_bloom_encoding_stats.parquet"; - auto reader = ParquetFileReader::OpenFile(path, false); - auto file_metadata = reader->metadata(); - EXPECT_FALSE(file_metadata->is_encryption_algorithm_set()); - auto& bloom_filter_reader = reader->GetBloomFilterReader(); - auto row_group_0 = bloom_filter_reader.RowGroup(0); - ASSERT_NE(nullptr, row_group_0); - EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException); - auto bloom_filter = row_group_0->GetColumnBloomFilter(0); - ASSERT_NE(nullptr, bloom_filter); - EXPECT_THROW(row_group_0->GetColumnBloomFilter(1), ParquetException); - - // assert exists - { - std::string_view sv = "Hello"; - ByteArray ba{sv}; - EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); - } - - // no exists - { - std::string_view sv = "NOT_EXISTS"; - ByteArray ba{sv}; - EXPECT_FALSE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); + std::vector files = {"data_index_bloom_encoding_stats.parquet", + "data_index_bloom_encoding_with_length.parquet"}; + for (const auto& test_file : files) { + std::string dir_string(get_data_dir()); + std::string path = dir_string + "/" + test_file; + auto reader = ParquetFileReader::OpenFile(path, /*memory_map=*/false); + auto file_metadata = reader->metadata(); + EXPECT_FALSE(file_metadata->is_encryption_algorithm_set()); + auto& bloom_filter_reader = reader->GetBloomFilterReader(); + auto row_group_0 = bloom_filter_reader.RowGroup(0); + ASSERT_NE(nullptr, row_group_0); + EXPECT_THROW_THAT( + [&]() { bloom_filter_reader.RowGroup(1); }, ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("Invalid row group ordinal"))); + auto bloom_filter = row_group_0->GetColumnBloomFilter(0); + ASSERT_NE(nullptr, bloom_filter); + EXPECT_THROW_THAT([&]() { 
row_group_0->GetColumnBloomFilter(1); }, ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr( + "Invalid column index at column ordinal"))); + + // assert exists + { + std::string_view sv = "Hello"; + ByteArray ba{sv}; + EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); + } + + // no exists + { + std::string_view sv = "NOT_EXISTS"; + ByteArray ba{sv}; + EXPECT_FALSE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); + } } } @@ -84,36 +95,62 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); auto writer_properties = properties_builder.build(); auto builder = BloomFilterBuilder::Make(&schema, *writer_properties); - builder->AppendRowGroup(); - auto bloom_filter = builder->GetOrCreateBloomFilter(0); - ASSERT_NE(nullptr, bloom_filter); - ASSERT_EQ(bloom_filter->GetBitsetSize(), - BlockSplitBloomFilter::OptimalNumOfBytes(bloom_filter_options.ndv, - bloom_filter_options.fpp)); - std::vector insert_hashes = {100, 200}; - for (uint64_t hash : insert_hashes) { - bloom_filter->InsertHash(hash); - } + + auto append_values_to_bloom_filter = [&](const std::vector& insert_hashes) { + builder->AppendRowGroup(); + auto bloom_filter = builder->GetOrCreateBloomFilter(0); + ASSERT_NE(nullptr, bloom_filter); + ASSERT_EQ(bloom_filter->GetBitsetSize(), + BlockSplitBloomFilter::OptimalNumOfBytes(bloom_filter_options.ndv, + bloom_filter_options.fpp)); + for (uint64_t hash : insert_hashes) { + bloom_filter->InsertHash(hash); + } + }; + // First row-group + append_values_to_bloom_filter({100, 200}); + // Second row-group + append_values_to_bloom_filter({300, 400}); auto sink = CreateOutputStream(); BloomFilterLocation location; builder->WriteTo(sink.get(), &location); - EXPECT_EQ(1, location.bloom_filter_location.size()); - EXPECT_EQ(2, location.bloom_filter_location[0].size()); - EXPECT_TRUE(location.bloom_filter_location[0][0].has_value()); - 
EXPECT_FALSE(location.bloom_filter_location[0][1].has_value()); + EXPECT_EQ(2, location.bloom_filter_location.size()); + for (auto& [row_group_id, row_group_bloom_filter] : location.bloom_filter_location) { + EXPECT_EQ(2, row_group_bloom_filter.size()); + EXPECT_TRUE(row_group_bloom_filter[0].has_value()); + EXPECT_FALSE(row_group_bloom_filter[1].has_value()); + } - int64_t bloom_filter_offset = location.bloom_filter_location[0][0]->offset; - int32_t bloom_filter_length = location.bloom_filter_location[0][0]->length; + struct RowGroupBloomFilterCase { + int32_t row_group_id; + std::vector exists_hashes; + std::vector unexists_hashes; + }; ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); - ReaderProperties reader_properties; - ::arrow::io::BufferReader reader( - ::arrow::SliceBuffer(buffer, bloom_filter_offset, bloom_filter_length)); - auto filter = parquet::BlockSplitBloomFilter::Deserialize(reader_properties, &reader); - for (uint64_t hash : insert_hashes) { - EXPECT_TRUE(bloom_filter->FindHash(hash)); + + std::vector cases = { + RowGroupBloomFilterCase{/*row_group_id=*/0, /*exists_hashes=*/{100, 200}, + /*unexists_hashes=*/{300, 400}}, + RowGroupBloomFilterCase{/*row_group_id=*/1, /*exists_hashes=*/{300, 400}, + /*unexists_hashes=*/{100, 200}}}; + for (const auto& c : cases) { + int64_t bloom_filter_offset = + location.bloom_filter_location[c.row_group_id][0]->offset; + int32_t bloom_filter_length = + location.bloom_filter_location[c.row_group_id][0]->length; + + ReaderProperties reader_properties; + ::arrow::io::BufferReader reader( + ::arrow::SliceBuffer(buffer, bloom_filter_offset, bloom_filter_length)); + auto filter = parquet::BlockSplitBloomFilter::Deserialize(reader_properties, &reader); + for (uint64_t hash : c.exists_hashes) { + EXPECT_TRUE(filter.FindHash(hash)); + } + for (uint64_t hash : c.unexists_hashes) { + EXPECT_FALSE(filter.FindHash(hash)); + } } - EXPECT_FALSE(filter.FindHash(300)); } TEST(BloomFilterBuilderTest, InvalidOperations) { From 
a779982211807455e20d1aa6ea3e695117ad7f0e Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 17 Mar 2024 23:23:53 +0800 Subject: [PATCH 30/51] add test case for dictionary --- .../parquet/arrow/arrow_reader_writer_test.cc | 92 ++++++++++++++----- 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index e3b86ceb48cb6..769622c488420 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5736,9 +5736,7 @@ class ParquetBloomFilterRoundTripTest : public ::testing::Test, public: void ReadBloomFilters(int expect_num_row_groups, const std::set& expect_columns_without_filter = {}) { - auto read_properties = default_arrow_reader_properties(); - auto reader = - ParquetFileReader::Open(std::make_shared(buffer_), read_properties); + auto reader = ParquetFileReader::Open(std::make_shared(buffer_)); auto metadata = reader->metadata(); ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups()); @@ -5780,6 +5778,8 @@ class ParquetBloomFilterRoundTripTest : public ::testing::Test, }; TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { + auto schema = ::arrow::schema( + {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); BloomFilterOptions options; options.ndv = 100; auto writer_properties = WriterProperties::Builder() @@ -5787,8 +5787,6 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { ->enable_bloom_filter_options(options, "c1") ->max_row_group_length(4) ->build(); - auto schema = ::arrow::schema( - {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); auto table = ::arrow::TableFromJSON(schema, {R"([ [1, "a"], [2, "b"], @@ -5801,25 +5799,73 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { ReadBloomFilters(/*expect_num_row_groups=*/2); ASSERT_EQ(4, bloom_filters_.size()); - { - ASSERT_NE(nullptr, bloom_filters_[0]); 
- auto col = table->column(0)->Slice(0, 4); - VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[0].get(), *col); - } - { - ASSERT_NE(nullptr, bloom_filters_[1]); - auto col = table->column(1)->Slice(0, 4); - VerifyBloomFilter<::arrow::StringType>(bloom_filters_[1].get(), *col); - } - { - ASSERT_NE(nullptr, bloom_filters_[2]); - auto col = table->column(0)->Slice(4, 2); - VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[2].get(), *col); + std::vector row_group_row_count{4, 2}; + int64_t current_row = 0; + int64_t bloom_filter_idx = 0; // current index in `bloom_filters_` + for (int64_t row_group_id = 0; row_group_id < 2; ++row_group_id) { + { + ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); + auto col = table->column(0)->Slice(current_row, row_group_row_count[row_group_id]); + VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[bloom_filter_idx].get(), *col); + ++bloom_filter_idx; + } + { + ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); + auto col = table->column(1)->Slice(current_row, row_group_row_count[row_group_id]); + VerifyBloomFilter<::arrow::StringType>(bloom_filters_[bloom_filter_idx].get(), + *col); + ++bloom_filter_idx; + } + current_row += row_group_row_count[row_group_id]; } - { - ASSERT_NE(nullptr, bloom_filters_[3]); - auto col = table->column(1)->Slice(4, 2); - VerifyBloomFilter<::arrow::StringType>(bloom_filters_[3].get(), *col); +} + +TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { + auto origin_schema = ::arrow::schema( + {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); + auto schema = ::arrow::schema( + {::arrow::field("c0", ::arrow::dictionary(::arrow::int64(), ::arrow::utf8())), + ::arrow::field("c1", ::arrow::dictionary(::arrow::int64(), ::arrow::utf8()))}); + bloom_filters_.clear(); + BloomFilterOptions options; + options.ndv = 100; + auto writer_properties = WriterProperties::Builder() + .enable_bloom_filter_options(options, "c0") + 
->enable_bloom_filter_options(options, "c1") + ->max_row_group_length(4) + ->build(); + std::vector contents = {R"([ + [1, "a"], + [2, "a"], + [1, "c"], + [null, "d"], + [5, null], + [6, "d"] + ])"}; + auto table = ::arrow::TableFromJSON(schema, contents); + auto non_dict_table = ::arrow::TableFromJSON(origin_schema, contents); + WriteFile(writer_properties, table); + + ReadBloomFilters(/*expect_num_row_groups=*/2); + ASSERT_EQ(4, bloom_filters_.size()); + std::vector row_group_row_count{4, 2}; + int64_t current_row = 0; + int64_t bloom_filter_idx = 0; // current index in `bloom_filters_` + for (int64_t row_group_id = 0; row_group_id < 2; ++row_group_id) { + { + ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); + auto col = table->column(0)->Slice(current_row, row_group_row_count[row_group_id]); + VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[bloom_filter_idx].get(), *col); + ++bloom_filter_idx; + } + { + ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); + auto col = table->column(1)->Slice(current_row, row_group_row_count[row_group_id]); + VerifyBloomFilter<::arrow::StringType>(bloom_filters_[bloom_filter_idx].get(), + *col); + ++bloom_filter_idx; + } + current_row += row_group_row_count[row_group_id]; } } From 41954064d831ad56527b48993b3bc9afaad71e80 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 18 Mar 2024 00:18:16 +0800 Subject: [PATCH 31/51] minor update style for file_writer.cc --- .../parquet/arrow/arrow_reader_writer_test.cc | 14 ++++--- cpp/src/parquet/file_writer.cc | 38 +++++++------------ 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 769622c488420..9d1c3afa03dbd 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5824,7 +5824,7 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { auto origin_schema = ::arrow::schema( 
{::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); auto schema = ::arrow::schema( - {::arrow::field("c0", ::arrow::dictionary(::arrow::int64(), ::arrow::utf8())), + {::arrow::field("c0", ::arrow::dictionary(::arrow::int64(), ::arrow::int64())), ::arrow::field("c1", ::arrow::dictionary(::arrow::int64(), ::arrow::utf8()))}); bloom_filters_.clear(); BloomFilterOptions options; @@ -5836,11 +5836,11 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { ->build(); std::vector contents = {R"([ [1, "a"], - [2, "a"], - [1, "c"], + [2, "b"], + [3, "c"], [null, "d"], [5, null], - [6, "d"] + [6, "f"] ])"}; auto table = ::arrow::TableFromJSON(schema, contents); auto non_dict_table = ::arrow::TableFromJSON(origin_schema, contents); @@ -5854,13 +5854,15 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { for (int64_t row_group_id = 0; row_group_id < 2; ++row_group_id) { { ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); - auto col = table->column(0)->Slice(current_row, row_group_row_count[row_group_id]); + auto col = non_dict_table->column(0)->Slice(current_row, + row_group_row_count[row_group_id]); VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[bloom_filter_idx].get(), *col); ++bloom_filter_idx; } { ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); - auto col = table->column(1)->Slice(current_row, row_group_row_count[row_group_id]); + auto col = non_dict_table->column(1)->Slice(current_row, + row_group_row_count[row_group_id]); VerifyBloomFilter<::arrow::StringType>(bloom_filters_[bloom_filter_idx].get(), *col); ++bloom_filter_idx; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index d485da0ccfa33..e4cd9f91aa8b8 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -165,20 +165,15 @@ class RowGroupSerializer : public RowGroupWriter::Contents { ? 
bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal) : nullptr; - std::unique_ptr pager; + CodecOptions default_codec_options; if (!codec_options) { - pager = PageWriter::Open( - sink_, column_properties.compression(), col_meta, row_group_ordinal_, - static_cast(column_ordinal), properties_->memory_pool(), - /*buffered_row_group=*/false, meta_encryptor, data_encryptor, - properties_->page_checksum_enabled(), ci_builder, oi_builder, CodecOptions()); - } else { - pager = PageWriter::Open( - sink_, column_properties.compression(), col_meta, row_group_ordinal_, - static_cast(column_ordinal), properties_->memory_pool(), - /*buffered_row_group=*/false, meta_encryptor, data_encryptor, - properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); + codec_options = &default_codec_options; } + std::unique_ptr pager = PageWriter::Open( + sink_, column_properties.compression(), col_meta, row_group_ordinal_, + static_cast(column_ordinal), properties_->memory_pool(), + /*buffered_row_group=*/false, meta_encryptor, data_encryptor, + properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_, bloom_filter); return column_writers_[0].get(); @@ -322,20 +317,15 @@ class RowGroupSerializer : public RowGroupWriter::Contents { bloom_filter_builder_ && column_properties.bloom_filter_enabled() ? 
bloom_filter_builder_->GetOrCreateBloomFilter(column_ordinal) : nullptr; - std::unique_ptr pager; + CodecOptions default_codec_options; if (!codec_options) { - pager = PageWriter::Open( - sink_, properties_->compression(path), col_meta, row_group_ordinal_, - static_cast(column_ordinal), properties_->memory_pool(), - buffered_row_group_, meta_encryptor, data_encryptor, - properties_->page_checksum_enabled(), ci_builder, oi_builder, CodecOptions()); - } else { - pager = PageWriter::Open( - sink_, properties_->compression(path), col_meta, row_group_ordinal_, - static_cast(column_ordinal), properties_->memory_pool(), - buffered_row_group_, meta_encryptor, data_encryptor, - properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); + codec_options = &default_codec_options; } + std::unique_ptr pager = PageWriter::Open( + sink_, properties_->compression(path), col_meta, row_group_ordinal_, + static_cast(column_ordinal), properties_->memory_pool(), + buffered_row_group_, meta_encryptor, data_encryptor, + properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); column_writers_.push_back( ColumnWriter::Make(col_meta, std::move(pager), properties_, bloom_filter)); } From 478889ddc65756f2a0ca9f658379d1bf7c580442 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 26 Mar 2024 22:04:27 +0800 Subject: [PATCH 32/51] resolve comment --- .../parquet/arrow/arrow_reader_writer_test.cc | 36 +++++++++++++++++++ cpp/src/parquet/bloom_filter.h | 13 +++++++ cpp/src/parquet/bloom_filter_builder.h | 18 ++++++---- cpp/src/parquet/column_writer.cc | 18 +++++----- cpp/src/parquet/file_writer.cc | 2 +- cpp/src/parquet/metadata.cc | 2 +- 6 files changed, 72 insertions(+), 17 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 9d1c3afa03dbd..f499ac8e9465e 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ 
-5871,5 +5871,41 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { } } +TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripWithOneFilter) { + auto schema = ::arrow::schema( + {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); + BloomFilterOptions options; + options.ndv = 100; + auto writer_properties = WriterProperties::Builder() + .enable_bloom_filter_options(options, "c0") + ->disable_bloom_filter("c1") + ->max_row_group_length(4) + ->build(); + auto table = ::arrow::TableFromJSON(schema, {R"([ + [1, "a"], + [2, "b"], + [3, "c"], + [null, "d"], + [5, null], + [6, "f"] + ])"}); + WriteFile(writer_properties, table); + + ReadBloomFilters(/*expect_num_row_groups=*/2, /*expect_columns_without_filter=*/{1}); + ASSERT_EQ(2, bloom_filters_.size()); + std::vector row_group_row_count{4, 2}; + int64_t current_row = 0; + int64_t bloom_filter_idx = 0; // current index in `bloom_filters_` + for (int64_t row_group_id = 0; row_group_id < 2; ++row_group_id) { + { + ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); + auto col = table->column(0)->Slice(current_row, row_group_row_count[row_group_id]); + VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[bloom_filter_idx].get(), *col); + ++bloom_filter_idx; + } + current_row += row_group_row_count[row_group_id]; + } +} + } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index dd669607e28bd..191e9559ce5b3 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -168,11 +168,24 @@ class PARQUET_EXPORT BloomFilter { virtual ~BloomFilter() = default; // Variant of const reference argument to facilitate template + + /// Compute hash for ByteArray value by using its plain encoding result. + /// + /// @param value the value to hash. uint64_t Hash(const ByteArray& value) const { return Hash(&value); } + /// Compute hash for fixed byte array value by using its plain encoding result. 
+ /// + /// @param value the value to hash. uint64_t Hash(const FLBA& value, uint32_t type_len) const { return Hash(&value, type_len); } + /// Compute hash for Int96 value by using its plain encoding result. + /// + /// @param value the value to hash. uint64_t Hash(const Int96& value) const { return Hash(&value); } + /// Compute hash for std::string_view value by using its plain encoding result. + /// + /// @param value the value to hash. uint64_t Hash(const std::string_view& value) const { ByteArray ba(value); return Hash(&ba); diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index 8ed09b8767771..02de29d38cddc 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -50,6 +50,9 @@ class PARQUET_EXPORT BloomFilterBuilder { /// /// This method must be called before `GetOrCreateBloomFilter` /// in a row group. + /// + /// \throws ParquetException It will throw an exception if the BloomFilter already + /// called `WriteTo`. virtual void AppendRowGroup() = 0; /// \brief Get the BloomFilter from column ordinal. @@ -57,14 +60,12 @@ class PARQUET_EXPORT BloomFilterBuilder { /// \param column_ordinal Column ordinal in schema, which is only for leaf columns. /// /// \return BloomFilter for the column and its memory ownership belongs to the - /// BloomFilterBuilder. It will throw an exception if the BloomFilter is already - /// Finished or column_ordinal is out of bound. - /// - /// It will return nullptr if bloom filter is not enabled for the column. + /// BloomFilterBuilder. It will return nullptr if bloom filter is not enabled for the + /// column. /// - /// It will throw exception when BloomFilter BloomFilterBuilder is called with - /// out-of-order `column_ordinal`, without calling `AppendRowGroup` before - /// `GetOrCreateBloomFilter`. 
+ /// \throws ParquetException It will throw an exception if the BloomFilter already + /// called `WriteTo`, column_ordinal is out of bound, or without calling + /// `AppendRowGroup` before `GetOrCreateBloomFilter`. virtual BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) = 0; /// \brief Write the bloom filter to sink. @@ -73,6 +74,9 @@ class PARQUET_EXPORT BloomFilterBuilder { /// /// \param[out] sink The output stream to write the bloom filter. /// \param[out] location The location of all bloom filter relative to the start of sink. + /// + /// \throws ParquetException It will throw an exception if the BloomFilter already + /// called `WriteTo`. virtual void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) = 0; diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 3862a2b9e71e0..a6ffaad4c065a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -157,6 +157,8 @@ inline const T* AddIfNotNull(const T* base, int64_t offset) { return nullptr; } +constexpr int64_t kHashBatchSize = 256; + } // namespace LevelEncoder::LevelEncoder() {} @@ -1644,7 +1646,6 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } } - constexpr static inline int64_t kHashBatchSize = 256; void UpdateBloomFilter(const T* values, int64_t num_values); void UpdateBloomFilterSpaced(const T* values, int64_t num_values, const uint8_t* valid_bits, int64_t valid_bits_offset); @@ -2475,13 +2476,14 @@ void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const FLBA* values template void UpdateBinaryBloomFilter(BloomFilter* bloom_filter, const ArrayType& array) { - PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( - *array.data(), - [&](const std::string_view& view) { - bloom_filter->InsertHash(bloom_filter->Hash(view)); - return Status::OK(); - }, - []() { return Status::OK(); })); + std::array hashes; + for (int64_t i = 0; i < array.length(); i += kHashBatchSize) { + int64_t 
current_hash_batch_size = std::min(kHashBatchSize, array.length() - i); + bloom_filter->Hashes(array.raw_values() + i, current_hash_batch_size, hashes.data()); + // current_hash_batch_size is less or equal than kHashBatchSize, so it's safe to cast + // to int. + bloom_filter->InsertHashes(hashes.data(), static_cast(current_hash_batch_size)); + } } template <> diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index a0d7dbd26408b..49274a8ba8117 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -300,7 +300,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { static_cast(column_ordinal), properties_->memory_pool(), buffered_row_group_, meta_encryptor, data_encryptor, properties_->page_checksum_enabled(), ci_builder, oi_builder, *codec_options); - return ColumnWriter::Make(col_meta, std::move(pager), properties_); + return ColumnWriter::Make(col_meta, std::move(pager), properties_, bloom_filter); } // If buffered_row_group_ is false, only column_writers_[0] is used as current writer. 
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index fa2930baa36c8..e6cc9378b9e8f 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1853,7 +1853,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { const auto& bloom_filter_location = row_group_bloom_filter_location[i]; if (bloom_filter_location.has_value()) { column_metadata.__set_bloom_filter_offset(bloom_filter_location->offset); - // bloom_filter_length is added in Parquet 2.10 + // bloom_filter_length is added by Parquet format 2.10.0 column_metadata.__set_bloom_filter_length(bloom_filter_location->length); } } From 2992072bcdec59eed83eb10568952b46209ff5ce Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 26 Mar 2024 22:21:20 +0800 Subject: [PATCH 33/51] fix comment for boolean col, and add test --- .../parquet/arrow/arrow_reader_writer_test.cc | 21 +++++++++++++++++++ cpp/src/parquet/properties.h | 4 ++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index f499ac8e9465e..a51b632db3107 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5907,5 +5907,26 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripWithOneFilter) { } } +TEST_F(ParquetBloomFilterRoundTripTest, ThrowForBoolean) { + auto schema = ::arrow::schema( + {::arrow::field("boolean_col", ::arrow::boolean())}; + BloomFilterOptions options; + options.ndv = 100; + auto writer_properties = WriterProperties::Builder() + .enable_bloom_filter_options(options, "boolean_col") + ->max_row_group_length(4) + ->build(); + auto table = ::arrow::TableFromJSON(schema, {R"([ + [true], + [null], + [false] + ])"}); + EXPECT_THROW_THAT([&]() { + WriteFile(writer_properties, table); }, ParquetException, + ::testing::Property( + &ParquetException::what, + ::testing::HasSubstr("BloomFilterBuilder does not support boolean 
type"))); +} + } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 5080a3cea6f0a..aa7395f819d1d 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -609,7 +609,7 @@ class PARQUET_EXPORT WriterProperties { /// Default disabled. /// /// Note: Currently we don't support bloom filter for boolean columns, - /// so it will be ignored if the column is of boolean type. + /// ParquetException will be thrown during write if the column is of boolean type. Builder* enable_bloom_filter_options(BloomFilterOptions bloom_filter_options, const std::string& path) { bloom_filter_options_[path] = bloom_filter_options; @@ -621,7 +621,7 @@ class PARQUET_EXPORT WriterProperties { /// Default disabled. /// /// Note: Currently we don't support bloom filter for boolean columns, - /// so it will be ignored if the column is of boolean type. + /// ParquetException will be thrown during write if the column is of boolean type. 
Builder* enable_bloom_filter_options( BloomFilterOptions bloom_filter_options, const std::shared_ptr& path) { From 48522615666b983ac5f0c933a4a817405ac15ae3 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 26 Mar 2024 23:04:19 +0800 Subject: [PATCH 34/51] trying to add bloom boolean test --- .../parquet/arrow/arrow_reader_writer_test.cc | 26 +++++++++++----- cpp/src/parquet/column_writer.cc | 31 ++++++++++++++----- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index a51b632db3107..76dda082332b0 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5908,8 +5908,7 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripWithOneFilter) { } TEST_F(ParquetBloomFilterRoundTripTest, ThrowForBoolean) { - auto schema = ::arrow::schema( - {::arrow::field("boolean_col", ::arrow::boolean())}; + auto schema = ::arrow::schema({::arrow::field("boolean_col", ::arrow::boolean())}); BloomFilterOptions options; options.ndv = 100; auto writer_properties = WriterProperties::Builder() @@ -5921,11 +5920,24 @@ TEST_F(ParquetBloomFilterRoundTripTest, ThrowForBoolean) { [null], [false] ])"}); - EXPECT_THROW_THAT([&]() { - WriteFile(writer_properties, table); }, ParquetException, - ::testing::Property( - &ParquetException::what, - ::testing::HasSubstr("BloomFilterBuilder does not support boolean type"))); + std::shared_ptr parquet_schema; + auto arrow_writer_properties = default_arrow_writer_properties(); + ASSERT_OK_NO_THROW(ToParquetSchema(schema.get(), *writer_properties, + *arrow_writer_properties, &parquet_schema)); + auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); + + // Write table to buffer. 
+ auto sink = CreateOutputStream(); + auto pool = ::arrow::default_memory_pool(); + auto writer = ParquetFileWriter::Open(sink, schema_node, writer_properties); + std::unique_ptr arrow_writer; + ASSERT_OK(FileWriter::Make(pool, std::move(writer), schema, arrow_writer_properties, + &arrow_writer)); + EXPECT_THROW_THAT( + [&]() { ARROW_IGNORE_EXPR(arrow_writer->WriteTable(*table)); }, ParquetException, + ::testing::Property( + &ParquetException::what, + ::testing::HasSubstr("BloomFilterBuilder does not support boolean type"))); } } // namespace arrow diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index a6ffaad4c065a..27066e7834bf9 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2476,13 +2476,30 @@ void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const FLBA* values template void UpdateBinaryBloomFilter(BloomFilter* bloom_filter, const ArrayType& array) { - std::array hashes; - for (int64_t i = 0; i < array.length(); i += kHashBatchSize) { - int64_t current_hash_batch_size = std::min(kHashBatchSize, array.length() - i); - bloom_filter->Hashes(array.raw_values() + i, current_hash_batch_size, hashes.data()); - // current_hash_batch_size is less or equal than kHashBatchSize, so it's safe to cast - // to int. - bloom_filter->InsertHashes(hashes.data(), static_cast(current_hash_batch_size)); + // Using a smaller size because an extra `byte_arrays` are used. 
+ constexpr int64_t kBinaryHashBatchSize = 64; + std::array byte_arrays; + std::array hashes; + int hashes_idx = 0; + auto flush_hashes = [&]() { + DCHECK(hashes_idx != 0); + bloom_filter->Hashes(byte_arrays.data(), static_cast(hashes_idx), hashes.data()); + bloom_filter->InsertHashes(hashes.data(), static_cast(hashes_idx)); + hashes_idx = 0; + }; + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( + *array.data(), + [&](const std::string_view& view) { + if (hashes_idx == kHashBatchSize) { + flush_hashes(); + } + byte_arrays[hashes_idx] = view; + ++hashes_idx; + return Status::OK(); + }, + []() { return Status::OK(); })); + if (hashes_idx != 0) { + flush_hashes(); } } From add1afdf32f5a191d849dfbe35f10c72ed52c4fd Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 26 Mar 2024 23:21:59 +0800 Subject: [PATCH 35/51] fix test --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 76dda082332b0..eefd823dfb385 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5933,11 +5933,10 @@ TEST_F(ParquetBloomFilterRoundTripTest, ThrowForBoolean) { std::unique_ptr arrow_writer; ASSERT_OK(FileWriter::Make(pool, std::move(writer), schema, arrow_writer_properties, &arrow_writer)); - EXPECT_THROW_THAT( - [&]() { ARROW_IGNORE_EXPR(arrow_writer->WriteTable(*table)); }, ParquetException, - ::testing::Property( - &ParquetException::what, - ::testing::HasSubstr("BloomFilterBuilder does not support boolean type"))); + auto s = arrow_writer->WriteTable(*table); + EXPECT_TRUE(s.IsIOError()); + EXPECT_THAT(s.message(), + ::testing::HasSubstr("BloomFilterBuilder does not support boolean type")); } } // namespace arrow From bb8d4a57dafcb99679c1201d087be9815812375f Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 8 Apr 2024 23:18:26 +0800 Subject: 
[PATCH 36/51] fix some comments --- cpp/src/parquet/bloom_filter.h | 3 ++- cpp/src/parquet/bloom_filter_builder.cc | 19 ++++++------- cpp/src/parquet/bloom_filter_builder.h | 18 ++++++------- .../bloom_filter_reader_writer_test.cc | 27 ++++++++++++++----- cpp/src/parquet/column_writer_test.cc | 3 +-- cpp/src/parquet/file_writer.cc | 4 +-- cpp/src/parquet/type_fwd.h | 4 --- 7 files changed, 43 insertions(+), 35 deletions(-) diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index 191e9559ce5b3..f2207587c1371 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -173,6 +173,7 @@ class PARQUET_EXPORT BloomFilter { /// /// @param value the value to hash. uint64_t Hash(const ByteArray& value) const { return Hash(&value); } + /// Compute hash for fixed byte array value by using its plain encoding result. /// /// @param value the value to hash. @@ -186,7 +187,7 @@ class PARQUET_EXPORT BloomFilter { /// Compute hash for std::string_view value by using its plain encoding result. /// /// @param value the value to hash. - uint64_t Hash(const std::string_view& value) const { + uint64_t Hash(std::string_view value) const { ByteArray ba(value); return Hash(&ba); } diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 97b3500c62288..4be23358e0255 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -34,19 +34,20 @@ namespace parquet { +/// Column encryption for bloom filter is not implemented yet. class BloomFilterBuilderImpl : public BloomFilterBuilder { public: explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema, - WriterProperties properties) - : schema_(schema), properties_(std::move(properties)) {} + const WriterProperties* properties) + : schema_(schema), properties_(properties) {} /// Append a new row group to host all incoming bloom filters. 
void AppendRowGroup() override; BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) override; /// Serialize all bloom filters with header and bitset in the order of row group and - /// column id. Column encryption is not implemented yet. The side effect is that it - /// deletes all bloom filters after they have been flushed. + /// column id. The side effect is that it deletes all bloom filters after they have + /// been flushed. void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) override; BloomFilterBuilderImpl(const BloomFilterBuilderImpl&) = delete; @@ -70,7 +71,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { } const SchemaDescriptor* schema_; - WriterProperties properties_; + const WriterProperties* properties_; bool finished_ = false; using RowGroupBloomFilters = std::map>; @@ -81,7 +82,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { }; std::unique_ptr BloomFilterBuilder::Make( - const SchemaDescriptor* schema, const WriterProperties& properties) { + const SchemaDescriptor* schema, const WriterProperties* properties) { return std::make_unique(schema, properties); } @@ -97,7 +98,7 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin CheckState(column_ordinal); const ColumnDescriptor* column_descr = schema_->Column(column_ordinal); DCHECK_NE(column_descr->physical_type(), Type::BOOLEAN); - auto bloom_filter_options_opt = properties_.bloom_filter_options(column_descr->path()); + auto bloom_filter_options_opt = properties_->bloom_filter_options(column_descr->path()); if (bloom_filter_options_opt == std::nullopt) { return nullptr; } @@ -106,12 +107,12 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin auto iter = row_group_bloom_filter.find(column_ordinal); if (iter == row_group_bloom_filter.end()) { auto block_split_bloom_filter = - std::make_unique(properties_.memory_pool()); + std::make_unique(properties_->memory_pool()); 
block_split_bloom_filter->Init(BlockSplitBloomFilter::OptimalNumOfBytes( bloom_filter_options.ndv, bloom_filter_options.fpp)); auto insert_result = row_group_bloom_filter.emplace( column_ordinal, std::move(block_split_bloom_filter)); - ARROW_CHECK(insert_result.second); + DCHECK(insert_result.second); iter = insert_result.first; } ARROW_CHECK(iter->second != nullptr); diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index 02de29d38cddc..dcfdbfbd9d025 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -44,15 +44,13 @@ class PARQUET_EXPORT BloomFilterBuilder { public: /// \brief API to create a BloomFilterBuilder. static std::unique_ptr Make(const SchemaDescriptor* schema, - const WriterProperties& properties); + const WriterProperties* properties); /// Append a new row group to host all incoming bloom filters. /// - /// This method must be called before `GetOrCreateBloomFilter` - /// in a row group. + /// This method must be called before `GetOrCreateBloomFilter` for a new row group. /// - /// \throws ParquetException It will throw an exception if the BloomFilter already - /// called `WriteTo`. + /// \throws ParquetException if WriteTo() has been called to flush bloom filters. virtual void AppendRowGroup() = 0; /// \brief Get the BloomFilter from column ordinal. @@ -63,9 +61,10 @@ class PARQUET_EXPORT BloomFilterBuilder { /// BloomFilterBuilder. It will return nullptr if bloom filter is not enabled for the /// column. /// - /// \throws ParquetException It will throw an exception if the BloomFilter already - /// called `WriteTo`, column_ordinal is out of bound, or without calling - /// `AppendRowGroup` before `GetOrCreateBloomFilter`. + /// \throws ParquetException if any of following conditions applies: + /// 1) column_ordinal is out of bound. + /// 2) `WriteTo()` has been called already. + /// 3) `AppendRowGroup()` is not called before `GetOrCreateBloomFilter()`. 
virtual BloomFilter* GetOrCreateBloomFilter(int32_t column_ordinal) = 0; /// \brief Write the bloom filter to sink. @@ -75,8 +74,7 @@ class PARQUET_EXPORT BloomFilterBuilder { /// \param[out] sink The output stream to write the bloom filter. /// \param[out] location The location of all bloom filter relative to the start of sink. /// - /// \throws ParquetException It will throw an exception if the BloomFilter already - /// called `WriteTo`. + /// \throws ParquetException if WriteTo() has been called to flush bloom filters. virtual void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) = 0; diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index 5fed99589f2dd..90871ceaa9bd9 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -94,7 +94,7 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { bloom_filter_options.ndv = 100; properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); auto writer_properties = properties_builder.build(); - auto builder = BloomFilterBuilder::Make(&schema, *writer_properties); + auto builder = BloomFilterBuilder::Make(&schema, writer_properties.get()); auto append_values_to_bloom_filter = [&](const std::vector& insert_hashes) { builder->AppendRowGroup(); @@ -164,15 +164,25 @@ TEST(BloomFilterBuilderTest, InvalidOperations) { properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); properties_builder.enable_bloom_filter_options(bloom_filter_options, "c2"); auto properties = properties_builder.build(); - auto builder = BloomFilterBuilder::Make(&schema, *properties); + auto builder = BloomFilterBuilder::Make(&schema, properties.get()); // AppendRowGroup() is not called and expect throw. 
- ASSERT_THROW(builder->GetOrCreateBloomFilter(0), ParquetException); + EXPECT_THROW_THAT( + [&]() { builder->GetOrCreateBloomFilter(0); }, ParquetException, + ::testing::Property( + &ParquetException::what, + ::testing::HasSubstr("No row group appended to BloomFilterBuilder"))); builder->AppendRowGroup(); // GetOrCreateBloomFilter() with wrong column ordinal expect throw. - ASSERT_THROW(builder->GetOrCreateBloomFilter(2), ParquetException); + EXPECT_THROW_THAT([&]() { builder->GetOrCreateBloomFilter(2); }, ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("Invalid column ordinal"))); // GetOrCreateBloomFilter() with boolean expect throw. - ASSERT_THROW(builder->GetOrCreateBloomFilter(1), ParquetException); + EXPECT_THROW_THAT( + [&]() { builder->GetOrCreateBloomFilter(1); }, ParquetException, + ::testing::Property( + &ParquetException::what, + ::testing::HasSubstr("BloomFilterBuilder does not support boolean type"))); auto filter = builder->GetOrCreateBloomFilter(0); // Call GetOrCreateBloomFilter the second time it is actually a cached version. EXPECT_EQ(filter, builder->GetOrCreateBloomFilter(0)); @@ -181,7 +191,10 @@ TEST(BloomFilterBuilderTest, InvalidOperations) { builder->WriteTo(sink.get(), &location); EXPECT_EQ(1, location.bloom_filter_location.size()); // Multiple WriteTo() expect throw. 
- ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException); + EXPECT_THROW_THAT( + [&]() { builder->WriteTo(sink.get(), &location); }, ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("Cannot call WriteTo() multiple times"))); } TEST(BloomFilterBuilderTest, GetOrCreate) { @@ -195,7 +208,7 @@ TEST(BloomFilterBuilderTest, GetOrCreate) { properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); properties_builder.enable_bloom_filter_options(bloom_filter_options, "c2"); auto properties = properties_builder.build(); - auto builder = BloomFilterBuilder::Make(&schema, *properties); + auto builder = BloomFilterBuilder::Make(&schema, properties.get()); // AppendRowGroup() is not called and expect throw. ASSERT_THROW(builder->GetOrCreateBloomFilter(0), ParquetException); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index d7277aff72194..b216c89c2fa7d 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1736,7 +1736,6 @@ TestBloomFilterWriter::BuildWriterWithBloomFilter( wp_builder.disable_dictionary(); wp_builder.encoding(column_properties.encoding()); } - wp_builder.max_statistics_size(column_properties.max_statistics_size()); auto path = this->schema_.Column(0)->path(); if (column_properties.bloom_filter_enabled()) { wp_builder.enable_bloom_filter_options( @@ -1750,7 +1749,7 @@ TestBloomFilterWriter::BuildWriterWithBloomFilter( ColumnChunkMetaDataBuilder::Make(this->writer_properties_, this->descr_); std::unique_ptr pager = PageWriter::Open( this->sink_, column_properties.compression(), this->metadata_.get()); - builder_ = BloomFilterBuilder::Make(&this->schema_, *this->writer_properties_); + builder_ = BloomFilterBuilder::Make(&this->schema_, this->writer_properties_.get()); // Initial RowGroup builder_->AppendRowGroup(); bloom_filter_ = builder_->GetOrCreateBloomFilter(0); diff --git 
a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 49274a8ba8117..c3c00dbc07534 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -474,7 +474,7 @@ class FileSerializer : public ParquetFileWriter::Contents { if (properties_->file_encryption_properties()) { ParquetException::NYI("Encryption is not supported with bloom filter"); } - // Serialize page index after all row groups have been written and report + // Serialize bloom filter after all row groups have been written and report // location to the file metadata. BloomFilterLocation bloom_filter_location; bloom_filter_builder_->WriteTo(sink_.get(), &bloom_filter_location); @@ -533,7 +533,7 @@ class FileSerializer : public ParquetFileWriter::Contents { } } if (properties_->bloom_filter_enabled()) { - bloom_filter_builder_ = BloomFilterBuilder::Make(schema(), *properties_); + bloom_filter_builder_ = BloomFilterBuilder::Make(schema(), properties_.get()); } if (properties_->page_index_enabled()) { page_index_builder_ = PageIndexBuilder::Make(&schema_, file_encryptor_.get()); diff --git a/cpp/src/parquet/type_fwd.h b/cpp/src/parquet/type_fwd.h index 33c79bfc744c7..857c7c4f15e71 100644 --- a/cpp/src/parquet/type_fwd.h +++ b/cpp/src/parquet/type_fwd.h @@ -84,10 +84,6 @@ class ArrowWriterPropertiesBuilder; class BloomFilter; -namespace schema { -class ColumnPath; -} // namespace schema - namespace arrow { class FileWriter; From e1de5bc6e8529f68da2f2ba6bd8c059ccc09ded8 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 9 Apr 2024 01:04:23 +0800 Subject: [PATCH 37/51] fix lint --- cpp/src/parquet/column_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 27066e7834bf9..0948b91da8109 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2482,7 +2482,7 @@ void UpdateBinaryBloomFilter(BloomFilter* bloom_filter, const ArrayType& array) std::array 
hashes; int hashes_idx = 0; auto flush_hashes = [&]() { - DCHECK(hashes_idx != 0); + DCHECK_NE(0, hashes_idx); bloom_filter->Hashes(byte_arrays.data(), static_cast(hashes_idx), hashes.data()); bloom_filter->InsertHashes(hashes.data(), static_cast(hashes_idx)); hashes_idx = 0; From 430742a150bfd359861404bd5f5d459ef72b4b26 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 10 Apr 2024 00:00:47 +0800 Subject: [PATCH 38/51] switch to anonymous namespace --- cpp/src/parquet/bloom_filter_builder.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 4be23358e0255..fad704a30f8d7 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -34,6 +34,7 @@ namespace parquet { +namespace { /// Column encryption for bloom filter is not implemented yet. class BloomFilterBuilderImpl : public BloomFilterBuilder { public: @@ -81,11 +82,6 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { std::vector> file_bloom_filters_; }; -std::unique_ptr BloomFilterBuilder::Make( - const SchemaDescriptor* schema, const WriterProperties* properties) { - return std::make_unique(schema, properties); -} - void BloomFilterBuilderImpl::AppendRowGroup() { if (finished_) { throw ParquetException( @@ -152,5 +148,11 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, } } } +} // namespace + +std::unique_ptr BloomFilterBuilder::Make( + const SchemaDescriptor* schema, const WriterProperties* properties) { + return std::make_unique(schema, properties); +} } // namespace parquet From 00f176e12c71444a0cd7d437ad8486ff2e0de7b9 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 26 Apr 2024 13:22:59 +0800 Subject: [PATCH 39/51] fix comment for column_writer.cc --- cpp/src/parquet/column_writer.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc 
index 0948b91da8109..6ea542e44acbb 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2489,7 +2489,7 @@ void UpdateBinaryBloomFilter(BloomFilter* bloom_filter, const ArrayType& array) }; PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline( *array.data(), - [&](const std::string_view& view) { + [&](std::string_view view) { if (hashes_idx == kHashBatchSize) { flush_hashes(); } @@ -2507,6 +2507,8 @@ template <> void TypedColumnWriterImpl::UpdateBloomFilterArray( const ::arrow::Array& values) { if (bloom_filter_) { + // TODO(mwish): GH-37832 currently we don't support write StringView/BinaryView to + // parquet file. We can support if (!::arrow::is_base_binary_like(values.type_id())) { throw ParquetException("Only BaseBinaryArray and subclasses supported"); } From 17f495123f0b7636b3013e853d1e0b4941308f4e Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 26 Apr 2024 13:25:13 +0800 Subject: [PATCH 40/51] fix comment in other parts --- cpp/src/parquet/bloom_filter_builder.cc | 4 ++-- cpp/src/parquet/bloom_filter_builder.h | 4 ++++ cpp/src/parquet/column_writer_test.cc | 2 +- cpp/src/parquet/file_writer.cc | 9 +++++---- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index fad704a30f8d7..997fd5b4c6caf 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -32,7 +32,7 @@ #include "parquet/metadata.h" #include "parquet/properties.h" -namespace parquet { +namespace parquet::internal { namespace { /// Column encryption for bloom filter is not implemented yet. 
@@ -155,4 +155,4 @@ std::unique_ptr BloomFilterBuilder::Make( return std::make_unique(schema, properties); } -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index dcfdbfbd9d025..f899a3f32fac2 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -27,6 +27,8 @@ class SchemaDescriptor; struct BloomFilterOptions; struct BloomFilterLocation; +namespace internal { + /// \brief Interface for collecting bloom filter of a parquet file. /// /// ``` @@ -81,4 +83,6 @@ class PARQUET_EXPORT BloomFilterBuilder { virtual ~BloomFilterBuilder() = default; }; +} // namespace internal + } // namespace parquet diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index b216c89c2fa7d..c027b2be23766 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1718,7 +1718,7 @@ class TestBloomFilterWriter : public TestPrimitiveWriter { std::shared_ptr> BuildWriterWithBloomFilter( int64_t output_size, const ColumnProperties& column_properties); - std::unique_ptr builder_; + std::unique_ptr builder_; BloomFilter* bloom_filter_; }; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index c3c00dbc07534..3f9f7f9de5578 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -93,7 +93,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { const WriterProperties* properties, bool buffered_row_group = false, InternalFileEncryptor* file_encryptor = nullptr, PageIndexBuilder* page_index_builder = nullptr, - BloomFilterBuilder* bloom_filter_builder = nullptr) + internal::BloomFilterBuilder* bloom_filter_builder = nullptr) : sink_(std::move(sink)), metadata_(metadata), properties_(properties), @@ -233,7 +233,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { bool buffered_row_group_; 
InternalFileEncryptor* file_encryptor_; PageIndexBuilder* page_index_builder_; - BloomFilterBuilder* bloom_filter_builder_; + internal::BloomFilterBuilder* bloom_filter_builder_; void CheckRowsWritten() const { // verify when only one column is written at a time @@ -494,7 +494,7 @@ class FileSerializer : public ParquetFileWriter::Contents { std::unique_ptr row_group_writer_; std::unique_ptr page_index_builder_; std::unique_ptr file_encryptor_; - std::unique_ptr bloom_filter_builder_; + std::unique_ptr bloom_filter_builder_; void StartFile() { auto file_encryption_properties = properties_->file_encryption_properties(); @@ -533,7 +533,8 @@ class FileSerializer : public ParquetFileWriter::Contents { } } if (properties_->bloom_filter_enabled()) { - bloom_filter_builder_ = BloomFilterBuilder::Make(schema(), properties_.get()); + bloom_filter_builder_ = + internal::BloomFilterBuilder::Make(schema(), properties_.get()); } if (properties_->page_index_enabled()) { page_index_builder_ = PageIndexBuilder::Make(&schema_, file_encryptor_.get()); From 34a4c287e7febaf1b71dbc8546ed673e9802efc8 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 10 Jun 2024 15:53:36 +0800 Subject: [PATCH 41/51] trying to fix the ci build --- cpp/src/parquet/bloom_filter_reader_writer_test.cc | 6 +++--- cpp/src/parquet/column_writer_test.cc | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index 90871ceaa9bd9..014484c1d77a8 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -94,7 +94,7 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { bloom_filter_options.ndv = 100; properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); auto writer_properties = properties_builder.build(); - auto builder = BloomFilterBuilder::Make(&schema, writer_properties.get()); + auto builder = 
internal::BloomFilterBuilder::Make(&schema, writer_properties.get()); auto append_values_to_bloom_filter = [&](const std::vector& insert_hashes) { builder->AppendRowGroup(); @@ -164,7 +164,7 @@ TEST(BloomFilterBuilderTest, InvalidOperations) { properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); properties_builder.enable_bloom_filter_options(bloom_filter_options, "c2"); auto properties = properties_builder.build(); - auto builder = BloomFilterBuilder::Make(&schema, properties.get()); + auto builder = internal::BloomFilterBuilder::Make(&schema, properties.get()); // AppendRowGroup() is not called and expect throw. EXPECT_THROW_THAT( [&]() { builder->GetOrCreateBloomFilter(0); }, ParquetException, @@ -208,7 +208,7 @@ TEST(BloomFilterBuilderTest, GetOrCreate) { properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); properties_builder.enable_bloom_filter_options(bloom_filter_options, "c2"); auto properties = properties_builder.build(); - auto builder = BloomFilterBuilder::Make(&schema, properties.get()); + auto builder = internal::BloomFilterBuilder::Make(&schema, properties.get()); // AppendRowGroup() is not called and expect throw. 
ASSERT_THROW(builder->GetOrCreateBloomFilter(0), ParquetException); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index c027b2be23766..a916caa442495 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1749,7 +1749,8 @@ TestBloomFilterWriter::BuildWriterWithBloomFilter( ColumnChunkMetaDataBuilder::Make(this->writer_properties_, this->descr_); std::unique_ptr pager = PageWriter::Open( this->sink_, column_properties.compression(), this->metadata_.get()); - builder_ = BloomFilterBuilder::Make(&this->schema_, this->writer_properties_.get()); + builder_ = + internal::BloomFilterBuilder::Make(&this->schema_, this->writer_properties_.get()); // Initial RowGroup builder_->AppendRowGroup(); bloom_filter_ = builder_->GetOrCreateBloomFilter(0); From c587568c5dd2aae1cd0f65b87261950914aa5ca6 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 3 Jul 2024 23:17:45 +0800 Subject: [PATCH 42/51] resolve comments --- .../parquet/arrow/arrow_reader_writer_test.cc | 59 +++++++++++++++---- cpp/src/parquet/bloom_filter_builder.cc | 29 +++++---- cpp/src/parquet/column_writer.cc | 10 +++- 3 files changed, 71 insertions(+), 27 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index f71adb380ba2f..f5935661cda5e 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5763,8 +5763,8 @@ class ParquetBloomFilterRoundTripTest : public ::testing::Test, } template - void VerifyBloomFilter(const BloomFilter* bloom_filter, - const ::arrow::ChunkedArray& chunked_array) { + void VerifyBloomFilterContains(const BloomFilter* bloom_filter, + const ::arrow::ChunkedArray& chunked_array) { for (auto value : ::arrow::stl::Iterate(chunked_array)) { if (value == std::nullopt) { continue; @@ -5773,6 +5773,17 @@ class ParquetBloomFilterRoundTripTest : public ::testing::Test, } } + 
template + void VerifyBloomFilterNotContains(const BloomFilter* bloom_filter, + const ::arrow::ChunkedArray& chunked_array) { + for (auto value : ::arrow::stl::Iterate(chunked_array)) { + if (value == std::nullopt) { + continue; + } + EXPECT_FALSE(bloom_filter->FindHash(bloom_filter->Hash(value.value()))); + } + } + protected: std::vector> bloom_filters_; }; @@ -5781,7 +5792,7 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { auto schema = ::arrow::schema( {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); BloomFilterOptions options; - options.ndv = 100; + options.ndv = 10; auto writer_properties = WriterProperties::Builder() .enable_bloom_filter_options(options, "c0") ->enable_bloom_filter_options(options, "c1") @@ -5804,16 +5815,26 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { int64_t bloom_filter_idx = 0; // current index in `bloom_filters_` for (int64_t row_group_id = 0; row_group_id < 2; ++row_group_id) { { + // The bloom filter for same column in another row-group. + int64_t bloom_filter_idx_another_rg = + row_group_id == 0 ? bloom_filter_idx + 2 : bloom_filter_idx - 2; ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); auto col = table->column(0)->Slice(current_row, row_group_row_count[row_group_id]); - VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[bloom_filter_idx].get(), *col); + VerifyBloomFilterContains<::arrow::Int64Type>( + bloom_filters_[bloom_filter_idx].get(), *col); + VerifyBloomFilterNotContains<::arrow::Int64Type>( + bloom_filters_[bloom_filter_idx_another_rg].get(), *col); ++bloom_filter_idx; } { + int64_t bloom_filter_idx_another_rg = + row_group_id == 0 ? 
bloom_filter_idx + 2 : bloom_filter_idx - 2; ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); auto col = table->column(1)->Slice(current_row, row_group_row_count[row_group_id]); - VerifyBloomFilter<::arrow::StringType>(bloom_filters_[bloom_filter_idx].get(), - *col); + VerifyBloomFilterContains<::arrow::StringType>( + bloom_filters_[bloom_filter_idx].get(), *col); + VerifyBloomFilterNotContains<::arrow::StringType>( + bloom_filters_[bloom_filter_idx_another_rg].get(), *col); ++bloom_filter_idx; } current_row += row_group_row_count[row_group_id]; @@ -5828,7 +5849,7 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { ::arrow::field("c1", ::arrow::dictionary(::arrow::int64(), ::arrow::utf8()))}); bloom_filters_.clear(); BloomFilterOptions options; - options.ndv = 100; + options.ndv = 10; auto writer_properties = WriterProperties::Builder() .enable_bloom_filter_options(options, "c0") ->enable_bloom_filter_options(options, "c1") @@ -5843,6 +5864,7 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { [6, "f"] ])"}; auto table = ::arrow::TableFromJSON(schema, contents); + // using non_dict_table to adapt some interface which doesn't support dictionary. auto non_dict_table = ::arrow::TableFromJSON(origin_schema, contents); WriteFile(writer_properties, table); @@ -5853,18 +5875,28 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { int64_t bloom_filter_idx = 0; // current index in `bloom_filters_` for (int64_t row_group_id = 0; row_group_id < 2; ++row_group_id) { { + // The bloom filter for same column in another row-group. + int64_t bloom_filter_idx_another_rg = + row_group_id == 0 ? 
bloom_filter_idx + 2 : bloom_filter_idx - 2; ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); auto col = non_dict_table->column(0)->Slice(current_row, row_group_row_count[row_group_id]); - VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[bloom_filter_idx].get(), *col); + VerifyBloomFilterContains<::arrow::Int64Type>( + bloom_filters_[bloom_filter_idx].get(), *col); + VerifyBloomFilterNotContains<::arrow::Int64Type>( + bloom_filters_[bloom_filter_idx_another_rg].get(), *col); ++bloom_filter_idx; } { + int64_t bloom_filter_idx_another_rg = + row_group_id == 0 ? bloom_filter_idx + 2 : bloom_filter_idx - 2; ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); auto col = non_dict_table->column(1)->Slice(current_row, row_group_row_count[row_group_id]); - VerifyBloomFilter<::arrow::StringType>(bloom_filters_[bloom_filter_idx].get(), - *col); + VerifyBloomFilterContains<::arrow::StringType>( + bloom_filters_[bloom_filter_idx].get(), *col); + VerifyBloomFilterNotContains<::arrow::StringType>( + bloom_filters_[bloom_filter_idx_another_rg].get(), *col); ++bloom_filter_idx; } current_row += row_group_row_count[row_group_id]; @@ -5875,7 +5907,7 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripWithOneFilter) { auto schema = ::arrow::schema( {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); BloomFilterOptions options; - options.ndv = 100; + options.ndv = 10; auto writer_properties = WriterProperties::Builder() .enable_bloom_filter_options(options, "c0") ->disable_bloom_filter("c1") @@ -5900,7 +5932,8 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripWithOneFilter) { { ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); auto col = table->column(0)->Slice(current_row, row_group_row_count[row_group_id]); - VerifyBloomFilter<::arrow::Int64Type>(bloom_filters_[bloom_filter_idx].get(), *col); + VerifyBloomFilterContains<::arrow::Int64Type>( + bloom_filters_[bloom_filter_idx].get(), *col); ++bloom_filter_idx; } current_row += 
row_group_row_count[row_group_id]; @@ -5910,7 +5943,7 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripWithOneFilter) { TEST_F(ParquetBloomFilterRoundTripTest, ThrowForBoolean) { auto schema = ::arrow::schema({::arrow::field("boolean_col", ::arrow::boolean())}); BloomFilterOptions options; - options.ndv = 100; + options.ndv = 10; auto writer_properties = WriterProperties::Builder() .enable_bloom_filter_options(options, "boolean_col") ->max_row_group_length(4) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 997fd5b4c6caf..651ca89ba266b 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -41,6 +41,9 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema, const WriterProperties* properties) : schema_(schema), properties_(properties) {} + BloomFilterBuilderImpl(const BloomFilterBuilderImpl&) = delete; + BloomFilterBuilderImpl(BloomFilterBuilderImpl&&) = default; + /// Append a new row group to host all incoming bloom filters. void AppendRowGroup() override; @@ -51,14 +54,11 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { /// been flushed. void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) override; - BloomFilterBuilderImpl(const BloomFilterBuilderImpl&) = delete; - BloomFilterBuilderImpl(BloomFilterBuilderImpl&&) = default; - private: /// Make sure column ordinal is not out of bound and the builder is in good state. 
void CheckState(int32_t column_ordinal) const { if (finished_) { - throw ParquetException("BloomFilterBuilder is already finished."); + throw ParquetException("Cannot call WriteTo() twice on BloomFilterBuilder."); } if (column_ordinal < 0 || column_ordinal >= schema_->num_columns()) { throw ParquetException("Invalid column ordinal: ", column_ordinal); @@ -85,7 +85,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { void BloomFilterBuilderImpl::AppendRowGroup() { if (finished_) { throw ParquetException( - "Cannot call AppendRowGroup() to finished BloomFilterBuilder."); + "Cannot call AppendRowGroup() to BloomFilterBuilder::WriteTo is called"); } file_bloom_filters_.emplace_back(std::make_unique()); } @@ -93,12 +93,16 @@ void BloomFilterBuilderImpl::AppendRowGroup() { BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordinal) { CheckState(column_ordinal); const ColumnDescriptor* column_descr = schema_->Column(column_ordinal); + // Bloom filter does not support boolean type, and this should be checked in + // CheckState() already. DCHECK_NE(column_descr->physical_type(), Type::BOOLEAN); auto bloom_filter_options_opt = properties_->bloom_filter_options(column_descr->path()); if (bloom_filter_options_opt == std::nullopt) { return nullptr; } BloomFilterOptions bloom_filter_options = *bloom_filter_options_opt; + // CheckState() should have checked that file_bloom_filters_ is not empty. 
+ DCHECK(!file_bloom_filters_.empty()); RowGroupBloomFilters& row_group_bloom_filter = *file_bloom_filters_.back(); auto iter = row_group_bloom_filter.find(column_ordinal); if (iter == row_group_bloom_filter.end()) { @@ -111,7 +115,10 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin DCHECK(insert_result.second); iter = insert_result.first; } - ARROW_CHECK(iter->second != nullptr); + if (iter->second == nullptr) { + throw ParquetException("Bloom filter state is invalid for column ", + column_descr->path()); + } return iter->second.get(); } @@ -130,22 +137,20 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, if (row_group_bloom_filters.empty()) { continue; } - bool has_valid_bloom_filter = false; int num_columns = schema_->num_columns(); std::vector> locations(num_columns, std::nullopt); // serialize bloom filter in ascending order of column id for (auto& [column_id, filter] : row_group_bloom_filters) { - ARROW_CHECK(filter != nullptr); + if (ARROW_PREDICT_FALSE(filter == nullptr)) { + throw ParquetException("Bloom filter state is invalid for column ", column_id); + } PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); filter->WriteTo(sink); PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); - has_valid_bloom_filter = true; locations[column_id] = IndexLocation{offset, static_cast(pos - offset)}; } - if (has_valid_bloom_filter) { - location->bloom_filter_location.emplace(row_group_ordinal, std::move(locations)); - } + location->bloom_filter_location.emplace(row_group_ordinal, std::move(locations)); } } } // namespace diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 571c954dcd916..325073be44453 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2446,6 +2446,8 @@ template <> void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const bool*, int64_t, const uint8_t*, int64_t) { + // BooleanType does not have a bloom filter currently, + // 
so bloom_filter_ should always be nullptr. DCHECK(bloom_filter_ == nullptr); } @@ -2504,7 +2506,7 @@ void TypedColumnWriterImpl::UpdateBloomFilterArray( const ::arrow::Array& values) { if (bloom_filter_) { // TODO(mwish): GH-37832 currently we don't support write StringView/BinaryView to - // parquet file. We can support + // parquet file. if (!::arrow::is_base_binary_like(values.type_id())) { throw ParquetException("Only BaseBinaryArray and subclasses supported"); } @@ -2513,7 +2515,11 @@ void TypedColumnWriterImpl::UpdateBloomFilterArray( UpdateBinaryBloomFilter(bloom_filter_, checked_cast(values)); } else { - DCHECK(::arrow::is_large_binary_like(values.type_id())); + // TODO(mwish): GH-37832 currently we don't support write StringView/BinaryView to + // parquet file. + if (!::arrow::is_large_binary_like(values.type_id())) { + throw ParquetException("Only LargeBinaryArray and subclasses supported"); + } UpdateBinaryBloomFilter(bloom_filter_, checked_cast(values)); } From 22030dbc08e87ae0dc6224d09f05bb2cc2768bf6 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 11 Nov 2024 12:22:45 +0800 Subject: [PATCH 43/51] change the bloom filter from vector to map --- .../parquet/arrow/arrow_reader_writer_test.cc | 14 +++--- cpp/src/parquet/bloom_filter.h | 50 +++++++++---------- cpp/src/parquet/bloom_filter_builder.cc | 5 +- .../bloom_filter_reader_writer_test.cc | 13 +++-- cpp/src/parquet/metadata.cc | 18 +++---- cpp/src/parquet/metadata.h | 20 ++++++-- 6 files changed, 64 insertions(+), 56 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index edb75f776ee30..d46b45adce60f 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5491,7 +5491,7 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test, void ReadPageIndexes(int expect_num_row_groups, int expect_num_pages, const std::set& expect_columns_without_index = {}) { auto 
read_properties = default_arrow_reader_properties(); - auto reader = ParquetFileReader::Open(std::make_shared(this->buffer_)); + auto reader = ParquetFileReader::Open(std::make_shared(buffer_)); auto metadata = reader->metadata(); ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups()); @@ -5923,10 +5923,10 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { [5, null], [6, "f"] ])"}; - auto table = ::arrow::TableFromJSON(schema, contents); + auto dict_encoded_table = ::arrow::TableFromJSON(schema, contents); // using non_dict_table to adapt some interface which doesn't support dictionary. - auto non_dict_table = ::arrow::TableFromJSON(origin_schema, contents); - WriteFile(writer_properties, table); + auto table = ::arrow::TableFromJSON(origin_schema, contents); + WriteFile(writer_properties, dict_encoded_table); ReadBloomFilters(/*expect_num_row_groups=*/2); ASSERT_EQ(4, bloom_filters_.size()); @@ -5939,8 +5939,7 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { int64_t bloom_filter_idx_another_rg = row_group_id == 0 ? bloom_filter_idx + 2 : bloom_filter_idx - 2; ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); - auto col = non_dict_table->column(0)->Slice(current_row, - row_group_row_count[row_group_id]); + auto col = table->column(0)->Slice(current_row, row_group_row_count[row_group_id]); VerifyBloomFilterContains<::arrow::Int64Type>( bloom_filters_[bloom_filter_idx].get(), *col); VerifyBloomFilterNotContains<::arrow::Int64Type>( @@ -5951,8 +5950,7 @@ TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTripDictionary) { int64_t bloom_filter_idx_another_rg = row_group_id == 0 ? 
bloom_filter_idx + 2 : bloom_filter_idx - 2; ASSERT_NE(nullptr, bloom_filters_[bloom_filter_idx]); - auto col = non_dict_table->column(1)->Slice(current_row, - row_group_row_count[row_group_id]); + auto col = table->column(1)->Slice(current_row, row_group_row_count[row_group_id]); VerifyBloomFilterContains<::arrow::StringType>( bloom_filters_[bloom_filter_idx].get(), *col); VerifyBloomFilterNotContains<::arrow::StringType>( diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index 6bfd099484549..5a3df7dad5130 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -106,6 +106,31 @@ class PARQUET_EXPORT BloomFilter { /// @return hash result. virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0; + // Variant of const reference argument to facilitate template + + /// Compute hash for ByteArray value by using its plain encoding result. + /// + /// @param value the value to hash. + uint64_t Hash(const ByteArray& value) const { return Hash(&value); } + + /// Compute hash for fixed byte array value by using its plain encoding result. + /// + /// @param value the value to hash. + uint64_t Hash(const FLBA& value, uint32_t type_len) const { + return Hash(&value, type_len); + } + /// Compute hash for Int96 value by using its plain encoding result. + /// + /// @param value the value to hash. + uint64_t Hash(const Int96& value) const { return Hash(&value); } + /// Compute hash for std::string_view value by using its plain encoding result. + /// + /// @param value the value to hash. + uint64_t Hash(std::string_view value) const { + ByteArray ba(value); + return Hash(&ba); + } + /// Batch compute hashes for 32 bits values by using its plain encoding result. /// /// @param values values a pointer to the values to hash. 
@@ -167,31 +192,6 @@ class PARQUET_EXPORT BloomFilter { virtual ~BloomFilter() = default; - // Variant of const reference argument to facilitate template - - /// Compute hash for ByteArray value by using its plain encoding result. - /// - /// @param value the value to hash. - uint64_t Hash(const ByteArray& value) const { return Hash(&value); } - - /// Compute hash for fixed byte array value by using its plain encoding result. - /// - /// @param value the value to hash. - uint64_t Hash(const FLBA& value, uint32_t type_len) const { - return Hash(&value, type_len); - } - /// Compute hash for Int96 value by using its plain encoding result. - /// - /// @param value the value to hash. - uint64_t Hash(const Int96& value) const { return Hash(&value); } - /// Compute hash for std::string_view value by using its plain encoding result. - /// - /// @param value the value to hash. - uint64_t Hash(std::string_view value) const { - ByteArray ba(value); - return Hash(&ba); - } - protected: // Hash strategy available for Bloom filter. 
enum class HashStrategy : uint32_t { XXHASH = 0 }; diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 651ca89ba266b..6b06845a63227 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -138,7 +138,7 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, continue; } int num_columns = schema_->num_columns(); - std::vector> locations(num_columns, std::nullopt); + RowGroupBloomFilterLocation locations; // serialize bloom filter in ascending order of column id for (auto& [column_id, filter] : row_group_bloom_filters) { @@ -148,6 +148,9 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); filter->WriteTo(sink); PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); + if (pos - offset > std::numeric_limits::max()) { + throw ParquetException("Bloom filter is too large to be serialized."); + } locations[column_id] = IndexLocation{offset, static_cast(pos - offset)}; } location->bloom_filter_location.emplace(row_group_ordinal, std::move(locations)); diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index 014484c1d77a8..ed23d5de56976 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -116,9 +116,9 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { builder->WriteTo(sink.get(), &location); EXPECT_EQ(2, location.bloom_filter_location.size()); for (auto& [row_group_id, row_group_bloom_filter] : location.bloom_filter_location) { - EXPECT_EQ(2, row_group_bloom_filter.size()); - EXPECT_TRUE(row_group_bloom_filter[0].has_value()); - EXPECT_FALSE(row_group_bloom_filter[1].has_value()); + EXPECT_EQ(1, row_group_bloom_filter.size()); + EXPECT_TRUE(row_group_bloom_filter.find(0) != row_group_bloom_filter.end()); + EXPECT_FALSE(row_group_bloom_filter.find(1) != 
row_group_bloom_filter.end()); } struct RowGroupBloomFilterCase { @@ -135,10 +135,9 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { RowGroupBloomFilterCase{/*row_group_id=*/1, /*exists_hashes=*/{300, 400}, /*unexists_hashes=*/{100, 200}}}; for (const auto& c : cases) { - int64_t bloom_filter_offset = - location.bloom_filter_location[c.row_group_id][0]->offset; - int32_t bloom_filter_length = - location.bloom_filter_location[c.row_group_id][0]->length; + auto& bloom_filter_location = location.bloom_filter_location[c.row_group_id]; + int64_t bloom_filter_offset = bloom_filter_location[0].offset; + int32_t bloom_filter_length = bloom_filter_location[0].length; ReaderProperties reader_properties; ::arrow::io::BufferReader reader( diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 709ed72eae08c..14af2d59b7f0a 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1941,21 +1941,19 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { void SetBloomFilterLocation(const BloomFilterLocation& location) { auto set_bloom_filter_location = [this](size_t row_group_ordinal, - const FileIndexLocation& file_bloom_filter_location) { + const FileBloomFilterLocation& file_bloom_filter_location) { auto& row_group_metadata = this->row_groups_.at(row_group_ordinal); auto iter = file_bloom_filter_location.find(row_group_ordinal); if (iter != file_bloom_filter_location.cend()) { const auto& row_group_bloom_filter_location = iter->second; - for (size_t i = 0; i < row_group_bloom_filter_location.size(); ++i) { - DCHECK(i < row_group_metadata.columns.size()); - auto& column = row_group_metadata.columns[i]; + for (auto& [column_id, bloom_filter_location] : + row_group_bloom_filter_location) { + DCHECK(column_id < row_group_metadata.columns.size()); + auto& column = row_group_metadata.columns[column_id]; auto& column_metadata = column.meta_data; - const auto& bloom_filter_location = row_group_bloom_filter_location[i]; - if 
(bloom_filter_location.has_value()) { - column_metadata.__set_bloom_filter_offset(bloom_filter_location->offset); - // bloom_filter_length is added by Parquet format 2.10.0 - column_metadata.__set_bloom_filter_length(bloom_filter_location->length); - } + column_metadata.__set_bloom_filter_offset(bloom_filter_location.offset); + // bloom_filter_length is added by Parquet format 2.10.0 + column_metadata.__set_bloom_filter_length(bloom_filter_location.length); } } }; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index b7ac58e5620d0..5c178dda4732e 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -510,15 +510,25 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { std::unique_ptr impl_; }; -/// Alias type of page index and bloom filter location of a row group. The index -/// location is located by column ordinal. If a column does not have a page index -/// (respectively bloom filter), its value is set to std::nullopt. +/// Alias type of page index location of a row group. The index location +/// is located by column ordinal. If a column does not have a page index, +/// its value is set to std::nullopt. using RowGroupIndexLocation = std::vector>; -/// Alias type of page index and bloom filter location of a parquet file. The +/// Alias type of bloom filter location of a row group. The filter location +/// is located by column ordinal. Number of columns with a bloom filter to +/// be relatively small compared to the number of overall columns, so +/// map is used. +using RowGroupBloomFilterLocation = std::map; + +/// Alias type of page index and location of a parquet file. The /// index location is located by the row group ordinal. using FileIndexLocation = std::map; +/// Alias type of bloom filter and location of a parquet file. The +/// index location is located by the row group ordinal. +using FileBloomFilterLocation = std::map; + /// \brief Public struct for location to all page indexes in a parquet file. 
struct PageIndexLocation { /// Row group column index locations which uses row group ordinal as the key. @@ -533,7 +543,7 @@ struct BloomFilterLocation { /// /// Note: Before Parquet 2.10, the bloom filter index only have "offset". But here /// we use "IndexLocation" with length to support the future extension. - FileIndexLocation bloom_filter_location; + FileBloomFilterLocation bloom_filter_location; }; class PARQUET_EXPORT FileMetaDataBuilder { From e9c550aa928b687c9cee68c5560f76fc800f65af Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 11 Nov 2024 19:59:41 +0800 Subject: [PATCH 44/51] fix lint --- cpp/src/parquet/bloom_filter_builder.cc | 6 +++++- cpp/src/parquet/metadata.cc | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 6b06845a63227..8c315bac46621 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -145,11 +145,15 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, if (ARROW_PREDICT_FALSE(filter == nullptr)) { throw ParquetException("Bloom filter state is invalid for column ", column_id); } + if (ARROW_PREDICT_FALSE(column_id < 0 || column_id >= num_columns)) { + throw ParquetException("Invalid column ordinal when serailizing: ", column_id); + } PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); filter->WriteTo(sink); PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); if (pos - offset > std::numeric_limits::max()) { - throw ParquetException("Bloom filter is too large to be serialized."); + throw ParquetException("Bloom filter is too large to be serialized, size: ", + pos - offset); } locations[column_id] = IndexLocation{offset, static_cast(pos - offset)}; } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 14af2d59b7f0a..aece8a14f5f0a 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1948,7 +1948,8 @@ class 
FileMetaDataBuilder::FileMetaDataBuilderImpl { const auto& row_group_bloom_filter_location = iter->second; for (auto& [column_id, bloom_filter_location] : row_group_bloom_filter_location) { - DCHECK(column_id < row_group_metadata.columns.size()); + DCHECK_LT(static_cast(column_id), + row_group_metadata.columns.size()); auto& column = row_group_metadata.columns[column_id]; auto& column_metadata = column.meta_data; column_metadata.__set_bloom_filter_offset(bloom_filter_location.offset); From 23fb3fad81a632b19729251518202114b66589f2 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 14 Nov 2024 11:11:12 +0800 Subject: [PATCH 45/51] fix lint --- cpp/src/parquet/bloom_filter.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index 5a3df7dad5130..ca21fec848eec 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -116,6 +116,7 @@ class PARQUET_EXPORT BloomFilter { /// Compute hash for fixed byte array value by using its plain encoding result. /// /// @param value the value to hash. + /// @param type_len the value length. 
uint64_t Hash(const FLBA& value, uint32_t type_len) const { return Hash(&value, type_len); } From d8928193fb99453fb2279183c73d1677bf16a880 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 15 Nov 2024 11:49:18 +0800 Subject: [PATCH 46/51] fix comment --- cpp/src/parquet/bloom_filter_reader.h | 2 +- cpp/src/parquet/column_writer.cc | 8 +++++--- cpp/src/parquet/metadata.h | 7 ++++--- cpp/src/parquet/page_index.h | 2 +- cpp/src/parquet/properties.h | 2 +- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_reader.h b/cpp/src/parquet/bloom_filter_reader.h index 46e046156da7a..cbd267dd1972d 100644 --- a/cpp/src/parquet/bloom_filter_reader.h +++ b/cpp/src/parquet/bloom_filter_reader.h @@ -17,7 +17,7 @@ #pragma once -#include "arrow/io/type_fwd.h" +#include "arrow/io/interfaces.h" #include "parquet/properties.h" #include "parquet/type_fwd.h" diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index b1534aeaa9969..68e5ee34d1fdc 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1701,6 +1701,7 @@ Status TypedColumnWriterImpl::WriteArrowDictionary( auto update_stats = [&](int64_t num_chunk_levels, const std::shared_ptr& chunk_indices) { + DCHECK(page_statistics_ != nullptr || bloom_filter_ != nullptr); // TODO(PARQUET-2068) This approach may make two copies. First, a copy of the // indices array to a (hopefully smaller) referenced indices array. Second, a copy // of the values array to a (probably not smaller) referenced values array. 
@@ -1725,9 +1726,8 @@ Status TypedColumnWriterImpl::WriteArrowDictionary( &exec_ctx)); referenced_dictionary = referenced_dictionary_datum.make_array(); } - - int64_t non_null_count = chunk_indices->length() - chunk_indices->null_count(); if (page_statistics_ != nullptr) { + int64_t non_null_count = chunk_indices->length() - chunk_indices->null_count(); page_statistics_->IncrementNullCount(num_chunk_levels - non_null_count); page_statistics_->IncrementNumValues(non_null_count); page_statistics_->Update(*referenced_dictionary, /*update_counts=*/false); @@ -2426,7 +2426,9 @@ void TypedColumnWriterImpl::UpdateBloomFilter(const FLBA* values, template <> void TypedColumnWriterImpl::UpdateBloomFilter(const bool*, int64_t) { - DCHECK(bloom_filter_ == nullptr); + if (ARROW_PREDICT_FALSE(bloom_filter_ != nullptr)) { + throw ParquetException("BooleanType does not support bloom filters"); + } } template diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 5c178dda4732e..6fa2e16311532 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -516,9 +516,10 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder { using RowGroupIndexLocation = std::vector>; /// Alias type of bloom filter location of a row group. The filter location -/// is located by column ordinal. Number of columns with a bloom filter to -/// be relatively small compared to the number of overall columns, so -/// map is used. +/// is located by column ordinal. +/// +/// Number of columns with a bloom filter to be relatively small compared to +/// the number of overall columns, so map is used. using RowGroupBloomFilterLocation = std::map; /// Alias type of page index and location of a parquet file. 
The diff --git a/cpp/src/parquet/page_index.h b/cpp/src/parquet/page_index.h index 89c49cf7a896f..d45c59cab223f 100644 --- a/cpp/src/parquet/page_index.h +++ b/cpp/src/parquet/page_index.h @@ -17,7 +17,7 @@ #pragma once -#include "arrow/io/type_fwd.h" +#include "arrow/io/interfaces.h" #include "parquet/encryption/type_fwd.h" #include "parquet/types.h" diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 01ba164c8f9ee..ad88b4fc2f970 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -599,7 +599,7 @@ class PARQUET_EXPORT WriterProperties { } /// Disable bloom filter for the column specified by `path`. - /// Default enabled. + /// Default disabled. Builder* disable_bloom_filter(const std::shared_ptr& path) { return this->disable_bloom_filter(path->ToDotString()); } From c5b1fb1255feac765d22e7a0365a51c221288769 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 13 Jan 2025 15:00:20 +0800 Subject: [PATCH 47/51] Resolve comments --- cpp/src/parquet/CMakeLists.txt | 2 +- .../parquet/arrow/arrow_reader_writer_test.cc | 6 +++--- cpp/src/parquet/bloom_filter.h | 2 ++ cpp/src/parquet/bloom_filter_builder.cc | 18 +++++++++--------- .../parquet/bloom_filter_reader_writer_test.cc | 6 +++--- cpp/src/parquet/column_writer_test.cc | 4 ++-- cpp/src/parquet/metadata.cc | 1 - cpp/src/parquet/metadata.h | 3 --- 8 files changed, 20 insertions(+), 22 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index b677d2ccdba17..fec7b59937dae 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -159,8 +159,8 @@ set(PARQUET_SRCS arrow/schema_internal.cc arrow/writer.cc bloom_filter.cc - bloom_filter_reader.cc bloom_filter_builder.cc + bloom_filter_reader.cc column_reader.cc column_scanner.cc column_writer.cc diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 4e5b55e04c615..10d075024d5ab 100644 --- 
a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5543,7 +5543,7 @@ auto encode_double = [](double value) { } // namespace -class ParquetIndexRoundTripTest { +class TestingWithPageIndex { public: void WriteFile(const std::shared_ptr& writer_properties, const std::shared_ptr<::arrow::Table>& table) { @@ -5572,7 +5572,7 @@ class ParquetIndexRoundTripTest { }; class ParquetPageIndexRoundTripTest : public ::testing::Test, - public ParquetIndexRoundTripTest { + public TestingWithPageIndex { public: void ReadPageIndexes(int expect_num_row_groups, int expect_num_pages, const std::set& expect_columns_without_index = {}) { @@ -5878,7 +5878,7 @@ TEST_F(ParquetPageIndexRoundTripTest, EnablePerColumn) { } class ParquetBloomFilterRoundTripTest : public ::testing::Test, - public ParquetIndexRoundTripTest { + public TestingWithPageIndex { public: void ReadBloomFilters(int expect_num_row_groups, const std::set& expect_columns_without_filter = {}) { diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index ca21fec848eec..804940f294d61 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -120,10 +120,12 @@ class PARQUET_EXPORT BloomFilter { uint64_t Hash(const FLBA& value, uint32_t type_len) const { return Hash(&value, type_len); } + /// Compute hash for Int96 value by using its plain encoding result. /// /// @param value the value to hash. uint64_t Hash(const Int96& value) const { return Hash(&value); } + /// Compute hash for std::string_view value by using its plain encoding result. /// /// @param value the value to hash. 
diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 8c315bac46621..40cd848488064 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -58,7 +58,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { /// Make sure column ordinal is not out of bound and the builder is in good state. void CheckState(int32_t column_ordinal) const { if (finished_) { - throw ParquetException("Cannot call WriteTo() twice on BloomFilterBuilder."); + throw ParquetException("BloomFilterBuilder is already finished."); } if (column_ordinal < 0 || column_ordinal >= schema_->num_columns()) { throw ParquetException("Invalid column ordinal: ", column_ordinal); @@ -84,8 +84,7 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder { void BloomFilterBuilderImpl::AppendRowGroup() { if (finished_) { - throw ParquetException( - "Cannot call AppendRowGroup() to BloomFilterBuilder::WriteTo is called"); + throw ParquetException("Cannot append to a finished BloomFilterBuilder"); } file_bloom_filters_.emplace_back(std::make_unique()); } @@ -112,11 +111,10 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin bloom_filter_options.ndv, bloom_filter_options.fpp)); auto insert_result = row_group_bloom_filter.emplace( column_ordinal, std::move(block_split_bloom_filter)); - DCHECK(insert_result.second); iter = insert_result.first; } if (iter->second == nullptr) { - throw ParquetException("Bloom filter state is invalid for column ", + throw ParquetException("Bloom filter should not be null for column ", column_descr->path()); } return iter->second.get(); @@ -125,7 +123,7 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) { if (finished_) { - throw ParquetException("Cannot call WriteTo() multiple times."); + throw ParquetException("Cannot write 
a finished BloomFilterBuilder"); } finished_ = true; @@ -143,17 +141,19 @@ void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, // serialize bloom filter in ascending order of column id for (auto& [column_id, filter] : row_group_bloom_filters) { if (ARROW_PREDICT_FALSE(filter == nullptr)) { - throw ParquetException("Bloom filter state is invalid for column ", column_id); + throw ParquetException("Bloom filter is null for column ", column_id); } if (ARROW_PREDICT_FALSE(column_id < 0 || column_id >= num_columns)) { - throw ParquetException("Invalid column ordinal when serailizing: ", column_id); + throw ParquetException("Invalid column ordinal when serializing bloom filter: ", + column_id); } PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); + // TODO(GH-43138): Estimate the quality of the bloom filter before writing it. filter->WriteTo(sink); PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); if (pos - offset > std::numeric_limits::max()) { throw ParquetException("Bloom filter is too large to be serialized, size: ", - pos - offset); + pos - offset, " for column ", column_id); } locations[column_id] = IndexLocation{offset, static_cast(pos - offset)}; } diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index ed23d5de56976..25b5530bd149c 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -84,7 +84,7 @@ TEST(BloomFilterReader, FileNotHaveBloomFilter) { } // , c1 has bloom filter. 
-TEST(BloomFilterBuilderTest, BasicRoundTrip) { +TEST(BloomFilterBuilder, BasicRoundTrip) { SchemaDescriptor schema; schema::NodePtr root = schema::GroupNode::Make( "schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::ByteArray("c2")}); @@ -152,7 +152,7 @@ TEST(BloomFilterBuilderTest, BasicRoundTrip) { } } -TEST(BloomFilterBuilderTest, InvalidOperations) { +TEST(BloomFilterBuilder, InvalidOperations) { SchemaDescriptor schema; schema::NodePtr root = schema::GroupNode::Make( "schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::Boolean("c2")}); @@ -196,7 +196,7 @@ TEST(BloomFilterBuilderTest, InvalidOperations) { ::testing::HasSubstr("Cannot call WriteTo() multiple times"))); } -TEST(BloomFilterBuilderTest, GetOrCreate) { +TEST(BloomFilterBuilder, GetOrCreate) { SchemaDescriptor schema; schema::NodePtr root = schema::GroupNode::Make( "schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::Boolean("c2")}); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 6408dc7315674..bbb3b024d98ea 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1765,7 +1765,7 @@ TestBloomFilterWriter::BuildWriterWithBloomFilter( this->sink_, column_properties.compression(), this->metadata_.get()); builder_ = internal::BloomFilterBuilder::Make(&this->schema_, this->writer_properties_.get()); - // Initial RowGroup + // Initialize RowGroup builder_->AppendRowGroup(); bloom_filter_ = builder_->GetOrCreateBloomFilter(0); std::shared_ptr writer = @@ -1791,7 +1791,7 @@ TYPED_TEST(TestBloomFilterWriter, Basic) { writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); writer->Close(); - // Read all rows so we are sure that also the non-dictionary pages are read correctly + // Make sure that column values are read correctly this->SetupValuesOut(SMALL_SIZE); this->ReadColumnFully(); ASSERT_EQ(SMALL_SIZE, this->values_read_); diff --git 
a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 223555e0f6168..36496b8fd1851 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1976,7 +1976,6 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { auto& column = row_group_metadata.columns[column_id]; auto& column_metadata = column.meta_data; column_metadata.__set_bloom_filter_offset(bloom_filter_location.offset); - // bloom_filter_length is added by Parquet format 2.10.0 column_metadata.__set_bloom_filter_length(bloom_filter_location.length); } } diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 25c17e5b6da5d..cd1da0725ca48 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -529,9 +529,6 @@ struct PageIndexLocation { /// \brief Public struct for location to all bloom filters in a parquet file. struct BloomFilterLocation { /// Row group bloom filter index locations which uses row group ordinal as the key. - /// - /// Note: Before Parquet 2.10, the bloom filter index only have "offset". But here - /// we use "IndexLocation" with length to support the future extension. 
FileBloomFilterLocation bloom_filter_location; }; From d57ceea991ab6a5b0b149b51d99069567d3c0f14 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 7 Feb 2025 14:40:10 +0800 Subject: [PATCH 48/51] minor fix --- cpp/src/parquet/file_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index c0a420a34e652..36e336fce5e74 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -393,7 +393,7 @@ class FileSerializer : public ParquetFileWriter::Contents { bloom_filter_builder_->AppendRowGroup(); } std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, static_cast(num_row_groups_ - 1), properties_.get(), + sink_, rg_metadata, row_group_ordinal, properties_.get(), buffered_row_group, file_encryptor_.get(), page_index_builder_.get(), bloom_filter_builder_.get())); row_group_writer_ = std::make_unique(std::move(contents)); From 26c2d07807b51c406f1e8c5a01fb5688d0f91cc8 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 7 Feb 2025 14:50:51 +0800 Subject: [PATCH 49/51] address some comments --- cpp/src/parquet/bloom_filter_builder.cc | 2 +- cpp/src/parquet/bloom_filter_builder.h | 13 ++----------- cpp/src/parquet/column_writer.cc | 4 +++- cpp/src/parquet/type_fwd.h | 3 +++ 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_builder.cc b/cpp/src/parquet/bloom_filter_builder.cc index 40cd848488064..ba8814b2540b7 100644 --- a/cpp/src/parquet/bloom_filter_builder.cc +++ b/cpp/src/parquet/bloom_filter_builder.cc @@ -99,7 +99,7 @@ BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(int32_t column_ordin if (bloom_filter_options_opt == std::nullopt) { return nullptr; } - BloomFilterOptions bloom_filter_options = *bloom_filter_options_opt; + const BloomFilterOptions& bloom_filter_options = *bloom_filter_options_opt; // CheckState() should have checked that file_bloom_filters_ is not empty. 
DCHECK(!file_bloom_filters_.empty()); RowGroupBloomFilters& row_group_bloom_filter = *file_bloom_filters_.back(); diff --git a/cpp/src/parquet/bloom_filter_builder.h b/cpp/src/parquet/bloom_filter_builder.h index f899a3f32fac2..90bc18a186def 100644 --- a/cpp/src/parquet/bloom_filter_builder.h +++ b/cpp/src/parquet/bloom_filter_builder.h @@ -20,14 +20,7 @@ #include "arrow/io/type_fwd.h" #include "parquet/types.h" -namespace parquet { - -class BloomFilter; -class SchemaDescriptor; -struct BloomFilterOptions; -struct BloomFilterLocation; - -namespace internal { +namespace parquet::internal { /// \brief Interface for collecting bloom filter of a parquet file. /// @@ -83,6 +76,4 @@ class PARQUET_EXPORT BloomFilterBuilder { virtual ~BloomFilterBuilder() = default; }; -} // namespace internal - -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index da8e476adcc19..8af1ec65e921e 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2541,7 +2541,9 @@ void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const bool*, in int64_t) { // BooleanType does not have a bloom filter currently, // so bloom_filter_ should always be nullptr. 
- DCHECK(bloom_filter_ == nullptr); + if (UNLIKELY(bloom_filter_ == nullptr)) { + throw ParquetException("BooleanType does not support bloom filters"); + } } template <> diff --git a/cpp/src/parquet/type_fwd.h b/cpp/src/parquet/type_fwd.h index f5f2f337a872c..0ba6d264cb325 100644 --- a/cpp/src/parquet/type_fwd.h +++ b/cpp/src/parquet/type_fwd.h @@ -90,6 +90,9 @@ class Statistics; struct SizeStatistics; class BloomFilter; +struct BloomFilterOptions; +struct BloomFilterLocation; + class ColumnIndex; class OffsetIndex; From e6bc6e16f4e496da63f35be6736fc2c0f0d99a3b Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 10 Mar 2025 11:16:27 +0800 Subject: [PATCH 50/51] Minor fix --- .../bloom_filter_reader_writer_test.cc | 34 ++----------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/bloom_filter_reader_writer_test.cc b/cpp/src/parquet/bloom_filter_reader_writer_test.cc index 25b5530bd149c..b3cfd9d474abb 100644 --- a/cpp/src/parquet/bloom_filter_reader_writer_test.cc +++ b/cpp/src/parquet/bloom_filter_reader_writer_test.cc @@ -192,37 +192,9 @@ TEST(BloomFilterBuilder, InvalidOperations) { // Multiple WriteTo() expect throw. 
EXPECT_THROW_THAT( [&]() { builder->WriteTo(sink.get(), &location); }, ParquetException, - ::testing::Property(&ParquetException::what, - ::testing::HasSubstr("Cannot call WriteTo() multiple times"))); -} - -TEST(BloomFilterBuilder, GetOrCreate) { - SchemaDescriptor schema; - schema::NodePtr root = schema::GroupNode::Make( - "schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::Boolean("c2")}); - schema.Init(root); - WriterProperties::Builder properties_builder; - BloomFilterOptions bloom_filter_options; - bloom_filter_options.ndv = 100; - properties_builder.enable_bloom_filter_options(bloom_filter_options, "c1"); - properties_builder.enable_bloom_filter_options(bloom_filter_options, "c2"); - auto properties = properties_builder.build(); - auto builder = internal::BloomFilterBuilder::Make(&schema, properties.get()); - // AppendRowGroup() is not called and expect throw. - ASSERT_THROW(builder->GetOrCreateBloomFilter(0), ParquetException); - - builder->AppendRowGroup(); - // GetOrCreateBloomFilter() with wrong column ordinal expect throw. - ASSERT_THROW(builder->GetOrCreateBloomFilter(2), ParquetException); - // GetOrCreateBloomFilter() with boolean expect throw. - ASSERT_THROW(builder->GetOrCreateBloomFilter(1), ParquetException); - builder->GetOrCreateBloomFilter(0); - auto sink = CreateOutputStream(); - BloomFilterLocation location; - builder->WriteTo(sink.get(), &location); - EXPECT_EQ(1, location.bloom_filter_location.size()); - // Multiple WriteTo() expect throw. 
- ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException); + ::testing::Property( + &ParquetException::what, + ::testing::HasSubstr("Cannot write a finished BloomFilterBuilder"))); } } // namespace parquet::test From dfaf0e85f0cd6410f62c2449217f0f0e99b03a43 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 10 Mar 2025 14:27:21 +0800 Subject: [PATCH 51/51] try to fix lint --- cpp/src/parquet/column_writer.cc | 2 +- cpp/src/parquet/file_writer.cc | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index fcdcc40a83d08..b757e806e0dcd 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2543,7 +2543,7 @@ void TypedColumnWriterImpl::UpdateBloomFilterSpaced(const bool*, in int64_t) { // BooleanType does not have a bloom filter currently, // so bloom_filter_ should always be nullptr. - if (UNLIKELY(bloom_filter_ == nullptr)) { + if (ARROW_PREDICT_FALSE(bloom_filter_ != nullptr)) { throw ParquetException("BooleanType does not support bloom filters"); } } diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 36e336fce5e74..a6d8115434ce0 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -393,9 +393,8 @@ class FileSerializer : public ParquetFileWriter::Contents { bloom_filter_builder_->AppendRowGroup(); } std::unique_ptr contents(new RowGroupSerializer( - sink_, rg_metadata, row_group_ordinal, properties_.get(), - buffered_row_group, file_encryptor_.get(), page_index_builder_.get(), - bloom_filter_builder_.get())); + sink_, rg_metadata, row_group_ordinal, properties_.get(), buffered_row_group, + file_encryptor_.get(), page_index_builder_.get(), bloom_filter_builder_.get())); row_group_writer_ = std::make_unique(std::move(contents)); return row_group_writer_.get(); }