From 7274e4b7532bacf94da255669f69dc851331e6ce Mon Sep 17 00:00:00 2001 From: arnavb Date: Sun, 23 Nov 2025 14:05:27 +0000 Subject: [PATCH 01/16] Add FSST support --- cpp/src/parquet/CMakeLists.txt | 2 + cpp/src/parquet/column_reader.cc | 3 +- cpp/src/parquet/decoder.cc | 262 +++++++ cpp/src/parquet/encoder.cc | 180 ++++- cpp/src/parquet/encoding_test.cc | 310 +++++++++ cpp/src/parquet/parquet.thrift | 7 + cpp/src/parquet/thirdparty/fsst/fsst.cpp | 200 ++++++ cpp/src/parquet/thirdparty/fsst/fsst.h | 227 ++++++ .../parquet/thirdparty/fsst/fsst_avx512.cpp | 149 ++++ .../parquet/thirdparty/fsst/fsst_avx512.inc | 57 ++ .../thirdparty/fsst/fsst_avx512_unroll1.inc | 57 ++ .../thirdparty/fsst/fsst_avx512_unroll2.inc | 114 +++ .../thirdparty/fsst/fsst_avx512_unroll3.inc | 171 +++++ .../thirdparty/fsst/fsst_avx512_unroll4.inc | 228 ++++++ cpp/src/parquet/thirdparty/fsst/libfsst.cpp | 651 ++++++++++++++++++ cpp/src/parquet/thirdparty/fsst/libfsst.hpp | 471 +++++++++++++ cpp/src/parquet/types.cc | 2 + cpp/src/parquet/types.h | 3 +- python/pyarrow/_parquet.pyx | 2 + python/pyarrow/includes/libparquet.pxd | 1 + python/pyarrow/parquet/core.py | 2 +- 21 files changed, 3095 insertions(+), 4 deletions(-) create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst.cpp create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst.h create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/libfsst.cpp create mode 100644 cpp/src/parquet/thirdparty/fsst/libfsst.hpp diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index dc7d40d2a38..e06d13d1884 100644 
--- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -171,6 +171,8 @@ set(PARQUET_SRCS encryption/internal_file_encryptor.cc exception.cc file_reader.cc + thirdparty/fsst/libfsst.cpp + thirdparty/fsst/fsst_avx512.cpp file_writer.cc geospatial/statistics.cc geospatial/util_internal.cc diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 8ecb774022f..59c3b9934a2 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -862,7 +862,8 @@ class ColumnReaderImplBase { case Encoding::RLE: case Encoding::DELTA_BINARY_PACKED: case Encoding::DELTA_BYTE_ARRAY: - case Encoding::DELTA_LENGTH_BYTE_ARRAY: { + case Encoding::DELTA_LENGTH_BYTE_ARRAY: + case Encoding::FSST: { auto decoder = MakeTypedDecoder(encoding, descr_, pool_); current_decoder_ = decoder.get(); decoders_[static_cast(encoding)] = std::move(decoder); diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index d0a857dd22a..77bc3c760ff 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include "arrow/array.h" #include "arrow/array/builder_binary.h" @@ -51,6 +53,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" +#include "parquet/thirdparty/fsst/fsst.h" #include "parquet/types.h" #ifdef _MSC_VER @@ -2307,6 +2310,258 @@ class ByteStreamSplitDecoder : public ByteStreamSplitDecoderBase { + public: + using Base = DecoderImpl; + using Base::num_values_; + + explicit FsstDecoder(const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + : DecoderImpl(descr, Encoding::FSST), pool_(pool) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + const auto header_size = static_cast(sizeof(fsst_decoder_t)); + if (len < header_size) { + throw ParquetException("FSST page 
too small to contain decoder header"); + } + num_values_ = num_values; + memcpy(&decoder_, data, sizeof(fsst_decoder_t)); + next_ = data + header_size; + remaining_bytes_ = len - header_size; + decode_buffer_size_ = 0; + } + + int Decode(ByteArray* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + if (max_values == 0) { + return 0; + } + + const int64_t estimated_output_size = + decode_buffer_size_ + + static_cast(remaining_bytes_) * kMaxDecompressionExpansion; + EnsureDecodeBuffer(estimated_output_size); + + int decoded = 0; + + while (decoded < max_values) { + if (ARROW_PREDICT_FALSE(remaining_bytes_ < static_cast(kLengthPrefixSize))) { + throw ParquetException("FSST data truncated before length prefix"); + } + + const uint32_t compressed_len = SafeLoadAs(next_); + next_ += kLengthPrefixSize; + remaining_bytes_ -= static_cast(kLengthPrefixSize); + + if (ARROW_PREDICT_FALSE(compressed_len > static_cast(remaining_bytes_))) { + throw ParquetException("FSST compressed length exceeds available data"); + } + + const uint8_t* compressed_ptr = next_; + next_ += compressed_len; + remaining_bytes_ -= static_cast(compressed_len); + + uint8_t* value_ptr = nullptr; + const size_t decompressed_len = + DecompressValue(compressed_ptr, compressed_len, &value_ptr); + + buffer[decoded].ptr = value_ptr; + buffer[decoded].len = static_cast(decompressed_len); + decode_buffer_size_ += static_cast(decompressed_len); + ++decoded; + } + + num_values_ -= decoded; + return decoded; + } + + int DecodeSpaced(ByteArray* buffer, int num_values, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + if (null_count == 0) { + return Decode(buffer, num_values); + } + + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode 
", values_to_decode, + " values but decoded ", decoded, " values."); + } + + int value_index = 0; + for (int i = 0; i < num_values; ++i) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + buffer[i] = temp_values_[value_index++]; + } else { + buffer[i].ptr = nullptr; + buffer[i].len = 0; + } + } + + return value_index; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) override { + int values_decoded = 0; + PARQUET_THROW_NOT_OK( + DecodeArrowDense(num_values, null_count, valid_bits, valid_bits_offset, builder, + &values_decoded)); + return values_decoded; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override { + int values_decoded = 0; + PARQUET_THROW_NOT_OK( + DecodeArrowDict(num_values, null_count, valid_bits, valid_bits_offset, builder, + &values_decoded)); + return values_decoded; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_values_decoded) { + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", decoded, " values."); + } + + auto visit_binary_helper = [&](auto* helper) { + auto* values_ptr = temp_values_.data(); + int value_index = 0; + + RETURN_NOT_OK( + VisitBitRuns(valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = values_ptr[value_index++]; + RETURN_NOT_OK(helper->AppendValue( + 
value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(helper->AppendNulls(run_length)); + } + return Status::OK(); + })); + + *out_values_decoded = decoded; + return Status::OK(); + }; + + return DispatchArrowBinaryHelper( + out, num_values, /*estimated_data_length=*/{}, visit_binary_helper); + } + + Status DecodeArrowDict(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder, + int* out_values_decoded) { + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", decoded, " values."); + } + + RETURN_NOT_OK(builder->Reserve(num_values)); + + int value_index = 0; + RETURN_NOT_OK(VisitBitRuns( + valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = temp_values_[value_index++]; + RETURN_NOT_OK(builder->Append(value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(builder->AppendNulls(run_length)); + } + return Status::OK(); + })); + + *out_values_decoded = decoded; + return Status::OK(); + } + + uint8_t* EnsureDecodeBuffer(int64_t capacity) { + const int64_t min_capacity = + std::max(capacity, kInitialDecodeBufferSize); + const int64_t target = ::arrow::bit_util::NextPower2(min_capacity); + + if (!decode_buffer_) { + PARQUET_ASSIGN_OR_THROW( + decode_buffer_, ::arrow::AllocateResizableBuffer(target, pool_)); + } else if (decode_buffer_->size() < target) { + PARQUET_THROW_NOT_OK(decode_buffer_->Resize(target, false)); + } + return decode_buffer_->mutable_data(); + } + + size_t DecompressValue(const uint8_t* compressed_ptr, uint32_t compressed_len, + uint8_t** value_ptr) { 
+ EnsureDecodeBuffer(decode_buffer_size_ + + OutputUpperBound(compressed_len)); + + while (true) { + uint8_t* destination = decode_buffer_->mutable_data() + decode_buffer_size_; + const size_t available = + static_cast(decode_buffer_->size() - decode_buffer_size_); + + const size_t decompressed = + fsst_decompress(&decoder_, compressed_len, compressed_ptr, available, + destination); + + if (decompressed > 0 || compressed_len == 0) { + *value_ptr = destination; + return decompressed; + } + + int64_t new_capacity = std::max( + decode_buffer_->size() * 2, + decode_buffer_size_ + OutputUpperBound(compressed_len)); + if (new_capacity <= decode_buffer_->size()) { + throw ParquetException("FSST decompression failed"); + } + EnsureDecodeBuffer(new_capacity); + } + } + + static int64_t OutputUpperBound(uint32_t compressed_len) { + const int64_t expanded = + static_cast(compressed_len) * kMaxDecompressionExpansion; + return std::max(expanded, kInitialDecodeBufferSize); + } + + static constexpr size_t kLengthPrefixSize = sizeof(uint32_t); + static constexpr int64_t kInitialDecodeBufferSize = 1024; + static constexpr int64_t kMaxDecompressionExpansion = 8; + + fsst_decoder_t decoder_{}; + const uint8_t* next_ = nullptr; + int remaining_bytes_ = 0; + ::arrow::MemoryPool* pool_; + std::shared_ptr<::arrow::ResizableBuffer> decode_buffer_; + int64_t decode_buffer_size_ = 0; + std::vector temp_values_; +}; + } // namespace // ---------------------------------------------------------------------- @@ -2373,6 +2628,13 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin throw ParquetException( "DELTA_BYTE_ARRAY only supports BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"); } + } else if (encoding == Encoding::FSST) { + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique(descr, pool); + default: + throw ParquetException("FSST encoding only supports BYTE_ARRAY"); + } } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) { if (type_num == 
Type::BYTE_ARRAY) { return std::make_unique(descr, pool); diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index f9367555d97..61b1bb9606c 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -18,8 +18,11 @@ #include "parquet/encoding.h" #include +#include #include #include +#include +#include #include #include #include @@ -47,6 +50,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" +#include "parquet/thirdparty/fsst/fsst.h" #include "parquet/types.h" #ifdef _MSC_VER @@ -101,7 +105,7 @@ class EncoderImpl : virtual public Encoder { MemoryPool* memory_pool() const override { return pool_; } int64_t ReportUnencodedDataBytes() override { - if (descr_->physical_type() != Type::BYTE_ARRAY) { + if (descr_ != nullptr && descr_->physical_type() != Type::BYTE_ARRAY) { throw ParquetException("ReportUnencodedDataBytes is only supported for BYTE_ARRAY"); } int64_t bytes = unencoded_byte_array_data_bytes_; @@ -1737,6 +1741,173 @@ std::shared_ptr RleBooleanEncoder::FlushValues() { return buffer; } +// ---------------------------------------------------------------------- + +class FsstEncoder : public EncoderImpl, virtual public TypedEncoder { + public: + explicit FsstEncoder(const ColumnDescriptor* descr, MemoryPool* pool) + : EncoderImpl(descr, Encoding::FSST, pool), + encoder_(nullptr), + unencoded_values_() {} + + ~FsstEncoder() override { + if (encoder_) { + fsst_destroy(encoder_); + } + } + + int64_t EstimatedDataEncodedSize() override { + const int64_t total_size = pending_unencoded_bytes_; + const double scaled = + static_cast(total_size) * compression_ratio_hint_; + const int64_t estimated_payload = + static_cast(std::ceil(scaled)); + return static_cast(sizeof(fsst_decoder_t)) + + std::max(0, estimated_payload); + } + + std::shared_ptr FlushValues() override { + if (unencoded_values_.empty()) { + PARQUET_ASSIGN_OR_THROW(auto buffer, AllocateBuffer(0, pool_)); + return buffer; + } + + 
BuildSymbolTable(); + + const int64_t total_input_size = pending_unencoded_bytes_; + + const int64_t decoder_bytes = static_cast(sizeof(fsst_decoder_t)); + const int64_t length_prefix_bytes = + static_cast(unencoded_values_.size()) * static_cast(sizeof(uint32_t)); + const int64_t estimated_buffer_size = + decoder_bytes + total_input_size * kFsstCompressionExpansion + length_prefix_bytes; + + PARQUET_ASSIGN_OR_THROW(auto output_buffer, + AllocateResizableBuffer(estimated_buffer_size, pool_)); + uint8_t* out_ptr = output_buffer->mutable_data(); + + fsst_decoder_t decoder = fsst_decoder(encoder_); + memcpy(out_ptr, &decoder, sizeof(fsst_decoder_t)); + out_ptr += sizeof(fsst_decoder_t); + + int64_t total_output_size = sizeof(fsst_decoder_t); + std::vector out_lengths(1); + std::vector out_ptrs(1); + + for (size_t i = 0; i < unencoded_values_.size(); ++i) { + const int64_t remaining_capacity = + estimated_buffer_size - total_output_size - sizeof(uint32_t); + if (ARROW_PREDICT_FALSE(remaining_capacity <= 0)) { + throw ParquetException("FSST compression buffer exhausted"); + } + size_t available = static_cast(remaining_capacity); + size_t input_length = static_cast(unencoded_values_[i].len); + const unsigned char* input_ptr = unencoded_values_[i].ptr; + size_t num_compressed = + fsst_compress(encoder_, 1, &input_length, &input_ptr, available, + out_ptr + sizeof(uint32_t), out_lengths.data(), out_ptrs.data()); + + if (num_compressed == 0) { + throw ParquetException("FSST compression buffer too small"); + } + + uint32_t len = static_cast(out_lengths[0]); + memcpy(out_ptr, &len, sizeof(uint32_t)); + out_ptr += sizeof(uint32_t) + out_lengths[0]; + total_output_size += sizeof(uint32_t) + out_lengths[0]; + } + + PARQUET_THROW_NOT_OK(output_buffer->Resize(total_output_size)); + UpdateCompressionStats(total_input_size, total_output_size); + unencoded_values_.clear(); + pending_unencoded_bytes_ = 0; + unencoded_byte_array_data_bytes_ = 0; + return output_buffer; + } + + using 
TypedEncoder::Put; + + void Put(const ByteArray* src, int num_values) override { + for (int i = 0; i < num_values; ++i) { + unencoded_values_.push_back(src[i]); + const int64_t length = static_cast(src[i].len); + unencoded_byte_array_data_bytes_ += length; + pending_unencoded_bytes_ += length; + } + } + + void Put(const ::arrow::Array& values) override { + AssertVarLengthBinary(values); + + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<::arrow::BinaryType>( + *values.data(), + [&](::std::string_view view) { + if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { + return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + } + ByteArray val; + val.len = static_cast(view.size()); + val.ptr = reinterpret_cast(view.data()); + unencoded_values_.push_back(val); + const int64_t length = static_cast(val.len); + unencoded_byte_array_data_bytes_ += length; + pending_unencoded_bytes_ += length; + return Status::OK(); + }, + []() { return Status::OK(); })); + } + + void PutSpaced(const ByteArray* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + if (valid_bits != NULLPTR) { + PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(ByteArray), pool_)); + auto buffer_ptr = reinterpret_cast(buffer->mutable_data()); + int num_valid_values = ::arrow::util::internal::SpacedCompress( + src, num_values, valid_bits, valid_bits_offset, buffer_ptr); + Put(buffer_ptr, num_valid_values); + } else { + Put(src, num_values); + } + } + + private: + void BuildSymbolTable() { + if (unencoded_values_.empty()) { + return; + } + + std::vector input_lengths; + std::vector input_ptrs; + + for (const auto& val : unencoded_values_) { + input_lengths.push_back(val.len); + input_ptrs.push_back(val.ptr); + } + + encoder_ = fsst_create(unencoded_values_.size(), input_lengths.data(), + input_ptrs.data(), 0); + + if (!encoder_) { + throw ParquetException("Failed to create FSST encoder"); + } + } + + void 
UpdateCompressionStats(int64_t input_bytes, int64_t payload_bytes) { + const double ratio = static_cast(std::max(0, payload_bytes)) / + static_cast(std::max(int64_t{1}, input_bytes)); + compression_ratio_hint_ = std::max(kMinimumCompressionRatio, ratio); + } + + static constexpr double kDefaultCompressionRatio = 1.0; + static constexpr double kMinimumCompressionRatio = 0.01; + // Allocate worst-case 4x expansion. + static constexpr int64_t kFsstCompressionExpansion = 4; + fsst_encoder_t* encoder_; + std::vector unencoded_values_; + double compression_ratio_hint_ = kDefaultCompressionRatio; + int64_t pending_unencoded_bytes_ = 0; +}; + } // namespace // ---------------------------------------------------------------------- @@ -1821,6 +1992,13 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin default: throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY"); } + } else if (encoding == Encoding::FSST) { + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique(descr, pool); + default: + throw ParquetException("FSST encoding only supports BYTE_ARRAY"); + } } else if (encoding == Encoding::RLE) { switch (type_num) { case Type::BOOLEAN: diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 66a3f7647fa..ca51d8e1b24 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,9 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_dict.h" #include "arrow/array/concatenate.h" +#include "arrow/buffer.h" #include "arrow/compute/cast.h" +#include "arrow/io/memory.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" @@ -41,10 +44,16 @@ #include "arrow/util/endian.h" #include "arrow/util/span.h" #include "arrow/util/string.h" +#include "parquet/column_page.h" +#include "parquet/column_reader.h" #include 
"parquet/encoding.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/test_util.h" +#include "parquet/thirdparty/fsst/fsst.h" #include "parquet/types.h" using arrow::default_memory_pool; @@ -2593,4 +2602,305 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { } } +// ---------------------------------------------------------------------- + +TEST(TestFsstEncoding, BasicRoundTrip) { + ::arrow::random::RandomArrayGenerator rag(0); + constexpr int64_t kNumValues = 2048; + constexpr int64_t kNumUnique = 128; + constexpr int32_t kMinLength = 4; + constexpr int32_t kMaxLength = 48; + + auto values = rag.BinaryWithRepeats(kNumValues, kNumUnique, kMinLength, kMaxLength, + 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + ASSERT_GT(encoded->size(), 0); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + std::vector decoded(values->length()); + ASSERT_EQ(static_cast(values->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + const auto& binary = checked_cast(*values); + for (int64_t i = 0; i < values->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } +} + +TEST(TestFsstEncoding, FlushEmptyReturnsEmptyBuffer) { + auto encoder = MakeTypedEncoder(Encoding::FSST); + auto buffer = encoder->FlushValues(); + ASSERT_NE(nullptr, buffer); + ASSERT_EQ(0, buffer->size()); +} + +TEST(TestFsstEncoding, ReportUnencodedBytesResetsCounter) { + ::arrow::random::RandomArrayGenerator rag(42); + auto values = rag.String(256, 1, 24, 0.0); + + auto encoder = 
MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + + const auto& binary = checked_cast(*values); + const int64_t expected = binary.total_values_length(); + EXPECT_EQ(expected, encoder->ReportUnencodedDataBytes()); + EXPECT_EQ(0, encoder->ReportUnencodedDataBytes()); + + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + std::vector decoded(values->length()); + ASSERT_EQ(static_cast(values->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + for (int64_t i = 0; i < values->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } + + EXPECT_EQ(0, encoder->ReportUnencodedDataBytes()); +} + +TEST(TestFsstEncoding, MultiPageRoundTrip) { + constexpr int64_t kStringsPerPage = 10000; + constexpr int64_t kNumPages = 2; + constexpr int64_t kTotalValues = kStringsPerPage * kNumPages; + constexpr int64_t kPageSizeBytes = 512 * 1024; + + std::vector page1; + std::vector page2; + page1.reserve(kStringsPerPage); + page2.reserve(kStringsPerPage); + + for (int64_t i = 0; i < kStringsPerPage; ++i) { + page1.push_back("https://www.example.com/user/profile/page" + std::to_string(i) + + "/settings/preferences/advanced/options"); + page2.push_back("ftp://ftp.downloads.org/pub/software/release" + std::to_string(i) + + "/package/installer/binaries/archive"); + } + + std::vector expected; + expected.reserve(kTotalValues); + expected.insert(expected.end(), page1.begin(), page1.end()); + expected.insert(expected.end(), page2.begin(), page2.end()); + + ASSERT_OK_AND_ASSIGN(auto output_stream, ::arrow::io::BufferOutputStream::Create()); + + schema::NodeVector fields; + fields.push_back(schema::PrimitiveNode::Make("url", Repetition::REQUIRED, + 
Type::BYTE_ARRAY, ConvertedType::UTF8)); + auto parquet_schema = std::static_pointer_cast( + schema::GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + auto writer_props = WriterProperties::Builder() + .encoding(Encoding::FSST) + ->data_pagesize(kPageSizeBytes) + ->build(); + + std::unique_ptr writer; + ASSERT_NO_THROW(writer = + ParquetFileWriter::Open(output_stream, parquet_schema, writer_props)); + ASSERT_NE(nullptr, writer); + auto* row_group_writer = writer->AppendRowGroup(); + ASSERT_NE(nullptr, row_group_writer); + auto* column_writer = static_cast*>( + row_group_writer->NextColumn()); + ASSERT_NE(nullptr, column_writer); + + auto write_page = [&](const std::vector& source) { + std::vector batch; + batch.reserve(source.size()); + for (const auto& value : source) { + batch.emplace_back(value); + } + column_writer->WriteBatch(static_cast(batch.size()), nullptr, nullptr, + batch.data()); + }; + + write_page(page1); + write_page(page2); + + column_writer->Close(); + row_group_writer->Close(); + ASSERT_NO_THROW(writer->Close()); + + ASSERT_OK_AND_ASSIGN(auto buffer, output_stream->Finish()); + + auto make_reader = [&buffer]() -> std::unique_ptr { + auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(buffer); + return ParquetFileReader::Open(buffer_reader); + }; + + std::unique_ptr page_reader_file; + ASSERT_NO_THROW(page_reader_file = make_reader()); + ASSERT_NE(nullptr, page_reader_file); + auto page_row_group = page_reader_file->RowGroup(0); + auto page_reader = page_row_group->GetColumnPageReader(0); + + int data_page_count = 0; + while (page_reader->NextPage() != nullptr) { + ++data_page_count; + } + EXPECT_GT(data_page_count, 1); + + std::unique_ptr reader; + ASSERT_NO_THROW(reader = make_reader()); + ASSERT_NE(nullptr, reader); + auto row_group_reader = reader->RowGroup(0); + auto column_reader = + std::static_pointer_cast>(row_group_reader->Column(0)); + + std::vector decoded(kTotalValues); + int64_t values_read = 0; + while 
(values_read < kTotalValues) { + int64_t batch_length = std::min(1024, kTotalValues - values_read); + int64_t batch_read = 0; + column_reader->ReadBatch(batch_length, nullptr, nullptr, + decoded.data() + values_read, &batch_read); + ASSERT_GT(batch_read, 0); + values_read += batch_read; + } + ASSERT_EQ(values_read, kTotalValues); + + for (int64_t i = 0; i < kTotalValues; ++i) { + ASSERT_EQ(expected[i].size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(expected[i].data(), decoded[i].ptr, expected[i].size())) << i; + } +} + +TEST(TestFsstEncoding, MultipleFlushesProduceIndependentPages) { + ::arrow::random::RandomArrayGenerator rag(99); + auto encoder = MakeTypedEncoder(Encoding::FSST); + + const std::vector> batches = { + {64, 3, 12}, {512, 2, 24}, {17, 1, 8}}; + + for (const auto& [size, min_len, max_len] : batches) { + auto batch = rag.String(size, min_len, max_len, 0.0); + + ASSERT_NO_THROW(encoder->Put(*batch)); + auto buffer = encoder->FlushValues(); + ASSERT_NE(nullptr, buffer); + ASSERT_GT(buffer->size(), 0); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(batch->length()), buffer->data(), + static_cast(buffer->size())); + + std::vector decoded(batch->length()); + ASSERT_EQ(static_cast(batch->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + const auto& binary = checked_cast(*batch); + for (int64_t i = 0; i < batch->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } + + encoder->ReportUnencodedDataBytes(); + } +} + +TEST(TestFsstEncoding, DecodeRejectsOverstatedLength) { + ::arrow::random::RandomArrayGenerator rag(123); + auto values = rag.String(64, 3, 32, 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + 
ASSERT_GT(encoded->size(), + static_cast(sizeof(fsst_decoder_t) + sizeof(uint32_t))); + + ASSERT_OK_AND_ASSIGN(auto corrupted, + ::arrow::AllocateBuffer(encoded->size(), default_memory_pool())); + std::memcpy(corrupted->mutable_data(), encoded->data(), encoded->size()); + + auto* length_ptr = + reinterpret_cast(corrupted->mutable_data() + sizeof(fsst_decoder_t)); + *length_ptr = static_cast(encoded->size()); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), corrupted->data(), + static_cast(corrupted->size())); + + std::vector decoded(values->length()); + EXPECT_THROW(decoder->Decode(decoded.data(), static_cast(decoded.size())), + ParquetException); +} + +TEST(TestFsstEncoding, SetDataRejectsShortHeader) { + auto decoder = MakeTypedDecoder(Encoding::FSST); + + const int num_values = 1; + const int header_bytes = static_cast(sizeof(fsst_decoder_t)); + std::vector truncated(static_cast(header_bytes - 1), 0); + + EXPECT_THROW( + decoder->SetData(num_values, truncated.data(), static_cast(truncated.size())), + ParquetException); +} + +TEST(TestFsstEncoding, HeavyNullsDecodeSpaced) { + ::arrow::random::RandomArrayGenerator rag(321); + constexpr int64_t kNumValues = 4096; + constexpr double kNullProb = 0.85; + + auto values = rag.String(kNumValues, 1, 24, kNullProb); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + const auto& binary = checked_cast(*values); + std::vector decoded(values->length()); + std::vector valid_bits(::arrow::bit_util::BytesForBits(values->length()), 0); + + ::arrow::internal::BitmapWriter writer(valid_bits.data(), 0, values->length()); + for (int64_t i = 0; i < values->length(); ++i) { + if (!binary.IsNull(i)) { + writer.Set(); 
+ } else { + writer.Clear(); + } + writer.Next(); + } + writer.Finish(); + + const int null_count = binary.null_count(); + ASSERT_EQ(static_cast(values->length() - null_count), + decoder->DecodeSpaced(decoded.data(), static_cast(values->length()), + null_count, valid_bits.data(), 0)); + + for (int64_t i = 0; i < values->length(); ++i) { + if (binary.IsNull(i)) { + EXPECT_EQ(nullptr, decoded[i].ptr); + EXPECT_EQ(0u, decoded[i].len); + continue; + } + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } +} + } // namespace parquet::test diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index e3cc5adb964..b94868ec378 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -629,6 +629,13 @@ enum Encoding { Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. */ BYTE_STREAM_SPLIT = 9; + + /** Fast Static Symbol Table (FSST) encoding for BYTE_ARRAY data. + FSST compresses strings using a symbol table that maps 1-8 byte sequences + to single-byte codes. It allows random access to compressed data and works + well with dictionary encoding by compressing dictionary values. 
+ */ + FSST = 10; } /** diff --git a/cpp/src/parquet/thirdparty/fsst/fsst.cpp b/cpp/src/parquet/thirdparty/fsst/fsst.cpp new file mode 100644 index 00000000000..c1a9cc34980 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst.cpp @@ -0,0 +1,200 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2019, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#ifdef FSST12 +#include "fsst12.h" // the official FSST API -- also usable by C mortals +#else +#include "fsst.h" // the official FSST API -- also usable by C mortals +#endif +#include +#include +#include +#include +#include +#include +using namespace std; + +// Utility to compress and decompress (-d) data with FSST (using stdin and stdout). 
+// +// The utility has a poor-man's async I/O in that it uses double buffering for input and output, +// and two background pthreads for reading and writing. The idea is to make the CPU overlap with I/O. +// +// The data format is quite simple. A FSST compressed file is a sequence of blocks, each with format: +// (1) 3-byte block length field (max blocksize is hence 16MB). This byte-length includes (1), (2) and (3). +// (2) FSST dictionary as produced by fst_export(). +// (3) the FSST compressed data. +// +// The natural strength of FSST is in fact not block-based compression, but rather the compression and +// *individual* decompression of many small strings separately. Think of compressed databases and (column-store) +// data formats. But, this utility is to serve as an apples-to-apples comparison point with utilities like lz4. + +namespace { + +class BinarySemaphore { + private: + mutex m; + condition_variable cv; + bool value; + + public: + explicit BinarySemaphore(bool initialValue = false) : value(initialValue) {} + void wait() { + unique_lock lock(m); + while (!value) cv.wait(lock); + value = false; + } + void post() { + { unique_lock lock(m); value = true; } + cv.notify_one(); + } +}; + +bool stopThreads = false; +BinarySemaphore srcDoneIO[2], dstDoneIO[2], srcDoneCPU[2], dstDoneCPU[2]; +unsigned char *srcBuf[2] = { NULL, NULL }; +unsigned char *dstBuf[2] = { NULL, NULL }; +unsigned char *dstMem[2] = { NULL, NULL }; +size_t srcLen[2] = { 0, 0 }; +size_t dstLen[2] = { 0, 0 }; + +#define FSST_MEMBUF (1ULL<<22) +int decompress = 0; +size_t blksz = FSST_MEMBUF-(1+FSST_MAXHEADER/2); // block size of compression (max compressed size must fit 3 bytes) + +#define DESERIALIZE(p) (((unsigned long long) (p)[0]) << 16) | (((unsigned long long) (p)[1]) << 8) | ((unsigned long long) (p)[2]) +#define SERIALIZE(l,p) { (p)[0] = ((l)>>16)&255; (p)[1] = ((l)>>8)&255; (p)[2] = (l)&255; } + +void reader(ifstream& src) { + for(int swap=0; true; swap = 1-swap) { + 
srcDoneCPU[swap].wait(); + if (stopThreads) break; + src.read((char*) srcBuf[swap], blksz); + srcLen[swap] = (unsigned long) src.gcount(); + if (decompress) { + if (blksz && srcLen[swap] == blksz) { + blksz = DESERIALIZE(srcBuf[swap]+blksz-3); // read size of next block + srcLen[swap] -= 3; // cut off size bytes + } else { + blksz = 0; + } + } + srcDoneIO[swap].post(); + } +} + +void writer(ofstream& dst) { + for(int swap=0; true; swap = 1-swap) { + dstDoneCPU[swap].wait(); + if (!dstLen[swap]) break; + dst.write((char*) dstBuf[swap], dstLen[swap]); + dstDoneIO[swap].post(); + } + for(int swap=0; swap<2; swap++) + dstDoneIO[swap].post(); +} + +} + +#ifdef FSST_STANDALONE +int main(int argc, char* argv[]) { + size_t srcTot = 0, dstTot = 0; + if (argc < 2 || argc > 4 || (argc == 4 && (argv[1][0] != '-' || argv[1][1] != 'd' || argv[1][2]))) { + cerr << "usage: " << argv[0] << " -d infile outfile" << endl; + cerr << " " << argv[0] << " infile outfile" << endl; + cerr << " " << argv[0] << " infile" << endl; + return -1; + } + decompress = (argc == 4); + string srcfile(argv[1+decompress]), dstfile; + if (argc == 2) { + dstfile = srcfile + ".fsst"; + } else { + dstfile = argv[2+decompress]; + } + ifstream src; + ofstream dst; + src.open(srcfile, ios::binary); + dst.open(dstfile, ios::binary); + dst.exceptions(ios_base::failbit); + dst.exceptions(ios_base::badbit); + src.exceptions(ios_base::badbit); + if (decompress) { + unsigned char tmp[3]; + src.read((char*) tmp, 3); + if (src.gcount() != 3) { + cerr << "failed to open input." 
<< endl;
+ return -1;
+ }
+ blksz = DESERIALIZE(tmp); // read first block size
+ }
+ vector<unsigned char> buffer(FSST_MEMBUF*6);
+ srcBuf[0] = buffer.data();
+ srcBuf[1] = srcBuf[0] + (FSST_MEMBUF*(1ULL+decompress));
+ dstMem[0] = srcBuf[1] + (FSST_MEMBUF*(1ULL+decompress));
+ dstMem[1] = dstMem[0] + (FSST_MEMBUF*(2ULL-decompress));
+
+ for(int swap=0; swap<2; swap++) {
+ srcDoneCPU[swap].post(); // input buffer is not being processed initially
+ dstDoneIO[swap].post(); // output buffer is not being written initially
+ }
+ thread readerThread([&src]{ reader(src); });
+ thread writerThread([&dst]{ writer(dst); });
+
+ for(int swap=0; true; swap = 1-swap) {
+ srcDoneIO[swap].wait(); // wait until input buffer is available (i.e. done reading)
+ dstDoneIO[swap].wait(); // wait until output buffer is ready writing hence free for use
+ if (srcLen[swap] == 0) {
+ dstLen[swap] = 0;
+ break;
+ }
+ if (decompress) {
+ fsst_decoder_t decoder;
+ size_t hdr = fsst_import(&decoder, srcBuf[swap]);
+ dstLen[swap] = fsst_decompress(&decoder, srcLen[swap] - hdr, srcBuf[swap] + hdr, FSST_MEMBUF, dstBuf[swap] = dstMem[swap]);
+ } else {
+ unsigned char tmp[FSST_MAXHEADER];
+ fsst_encoder_t* encoder = fsst_create(1, &srcLen[swap], const_cast<const unsigned char**>(&srcBuf[swap]), 0);
+ size_t hdr = fsst_export(encoder, tmp);
+ if (fsst_compress(encoder, 1, &srcLen[swap], const_cast<const unsigned char**>(&srcBuf[swap]),
+ FSST_MEMBUF * 2, dstMem[swap] + FSST_MAXHEADER + 3,
+ &dstLen[swap], &dstBuf[swap]) < 1)
+ return -1;
+ dstLen[swap] += 3 + hdr;
+ dstBuf[swap] -= 3 + hdr;
+ SERIALIZE(dstLen[swap],dstBuf[swap]); // block starts with size
+ copy(tmp, tmp+hdr, dstBuf[swap]+3); // then the header (followed by the compressed bytes which are already there)
+ fsst_destroy(encoder);
+ }
+ srcTot += srcLen[swap];
+ dstTot += dstLen[swap];
+ srcDoneCPU[swap].post(); // input buffer may be re-used by the reader for the next block
+ dstDoneCPU[swap].post(); // output buffer is ready for writing out
+ }
+ cerr << (decompress?"Dec":"C") << "ompressed " << 
srcTot << " bytes into " << dstTot << " bytes ==> " << (int) ((100*dstTot)/srcTot) << "%" << endl; + + // force wait until all background writes finished + stopThreads = true; + for(int swap=0; swap<2; swap++) { + srcDoneCPU[swap].post(); + dstDoneCPU[swap].post(); + } + dstDoneIO[0].wait(); + dstDoneIO[1].wait(); + readerThread.join(); + writerThread.join(); +} +#endif // FSST_STANDALONE diff --git a/cpp/src/parquet/thirdparty/fsst/fsst.h b/cpp/src/parquet/thirdparty/fsst/fsst.h new file mode 100644 index 00000000000..71085d57201 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst.h @@ -0,0 +1,227 @@ +/* + * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 + * + * =================================================================================================================================== + * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): + * + * Copyright 2018-2020, CWI, TU Munich, FSU Jena + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst + * =================================================================================================================================== + * + * FSST: Fast Static Symbol Table compression + * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf + * + * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. + * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual + * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is + * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. + * + * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) + * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression + * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could + * be seen as strings again and fit in whatever your program is that manipulates strings. + * + * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. + * + * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of + * unsigned char* pointers to their starts. 
A separate length array (of unsigned int) denotes how many bytes each string consists of.
+ *
+ * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program
+ * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings).
+ *
+ * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are
+ * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length.
+ */
+#ifndef FSST_INCLUDED_H
+#define FSST_INCLUDED_H
+
+#ifdef _MSC_VER
+#define __restrict__
+#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#define __ORDER_LITTLE_ENDIAN__ 2
+#include <intrin.h>
+static inline int __builtin_ctzl(unsigned long long x) {
+ unsigned long ret;
+ _BitScanForward64(&ret, x);
+ return (int)ret;
+}
+#endif
+
+#ifdef __cplusplus
+#define FSST_FALLTHROUGH [[fallthrough]]
+#include <cstring>
+extern "C" {
+#else
+#define FSST_FALLTHROUGH
+#endif
+
+#include <stddef.h>
+
+/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */
+#define FSST_ESC 255
+
+/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */
+typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */
+
+/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
+typedef struct {
+ unsigned long long version; /* version id */
+ unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */
+ unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). 
*/ + unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */ +} fsst_decoder_t; + +/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */ +fsst_encoder_t* +fsst_create( + size_t n, /* IN: number of strings in batch to sample from. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: string start pointers. */ + int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */ +); + +/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */ +fsst_encoder_t* +fsst_duplicate( + fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */ +); + +#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */ + +/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */ +unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */ +fsst_export( + fsst_encoder_t *encoder, /* IN: the symbol table to dump. */ + unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */ +); + +/* Deallocate encoder. */ +void +fsst_destroy(fsst_encoder_t*); + +/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */ +unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */ +fsst_import( + fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */ + unsigned char const *buf /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */ +); + +/* Return a decoder structure from an encoder. 
*/ +fsst_decoder_t +fsst_decoder( + fsst_encoder_t *encoder +); + +/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */ +/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */ +size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */ +fsst_compress( + fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */ + size_t nstrings, /* IN: number of strings in batch to compress. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: input string start pointers. */ + size_t outsize, /* IN: byte-length of output buffer. */ + unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */ + size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */ + unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */ +); + +/* Decompress a single string, inlined for speed. */ +inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */ +fsst_decompress( + const fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */ + size_t lenIn, /* IN: byte-length of compressed string. */ + const unsigned char *strIn, /* IN: compressed string. */ + size_t size, /* IN: byte-length of output buffer. */ + unsigned char *output /* OUT: memory buffer to put the decompressed string in. 
*/ +) { + unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len; + unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output; + unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol; + size_t code, posOut = 0, posIn = 0; +#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */ +#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long)) +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + while (posOut+32 <= size && posIn+4 <= lenIn) { + unsigned int nextBlock, escapeMask; + memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int)); + escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); + if (escapeMask == 0) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; + switch(firstEscapePos) { /* Duff's device */ + case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ + } + } + } + if (posOut+32 <= size) { // handle the possibly 3 last bytes without a loop + if (posIn+2 <= lenIn) 
{ + strOut[posOut] = strIn[posIn+1]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + posIn += 2; strOut[posOut++] = strIn[posIn-1]; + } + } else { + posIn += 2; posOut++; + } + } + if (posIn < lenIn) { // last code cannot be an escape + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } + } +#else + while (posOut+8 <= size && posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */ + FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */ + posOut += len[code]; + } else { + strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */ + posIn++; posOut++; + } +#endif +#endif + while (posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { + size_t posWrite = posOut, endWrite = posOut + len[code]; + unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite; + if ((posOut = endWrite) > size) endWrite = size; + for(; posWrite < endWrite; posWrite++) /* only write if there is room */ + strOut[posWrite] = symbolPointer[posWrite]; + } else { + if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */ + posIn++; posOut++; + } + if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0; + return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */ +} + +#ifdef __cplusplus +} +#endif +#endif /* FSST_INCLUDED_H */ diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp new file mode 100644 index 00000000000..e43d3e03652 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp @@ -0,0 +1,149 @@ +// this software is distributed under the MIT License 
(http://www.opensource.org/licenses/MIT):
+//
+// Copyright 2018-2020, CWI, TU Munich, FSU Jena
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
+// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
+// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
+#include "libfsst.hpp"
+
+#if defined(__x86_64__) || defined(_M_X64)
+#include <immintrin.h>
+
+#ifdef _WIN32
+namespace libfsst {
+bool fsst_hasAVX512() {
+ int info[4];
+ __cpuidex(info, 0x00000007, 0);
+ return (info[1]>>16)&1;
+}
+} // namespace libfsst
+#else
+#include <cpuid.h>
+namespace libfsst {
+bool fsst_hasAVX512() {
+ int info[4];
+ __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]);
+ return (info[1]>>16)&1;
+}
+} // namespace libfsst
+#endif
+#else
+namespace libfsst {
+bool fsst_hasAVX512() { return false; }
+} // namespace libfsst
+#endif
+
+namespace libfsst {
+
+// BULK COMPRESSION OF STRINGS
+//
+// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. 
+// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up. +// +// In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4 +// unroll3 performs best on my hardware +// +// In the worst case, each final encoded string occupies 512KB bytes (512*1024; with 1024=512xexception, exception = 2 bytes). +// - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs). +// +// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits) +// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order. +// +// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings. +// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process. +// - so 'processed' is the amount of strings we started processing and it is between [480,512]. +// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished. +// - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method. +// +// Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings. +// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings. +// +// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to. +// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch). 
+// +// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS +// in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_* expressions however, +// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them. +// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. +// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. + +size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { + size_t processed = 0; + // define some constants (all_x means that all 8 lanes contain 64-bits value X) +#ifdef __AVX512F__ + //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c + __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); + __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); + __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); +#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) +#define all_ONE _mm512_srli_epi64(all_MASK, 63) +#define all_M19 _mm512_srli_epi64(all_MASK, 45) +#define all_M18 _mm512_srli_epi64(all_MASK, 46) +#define all_M28 _mm512_srli_epi64(all_MASK, 36) +#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) +#define all_FFFF _mm512_srli_epi64(all_MASK, 48) +#define all_FF _mm512_srli_epi64(all_MASK, 56) + + SIMDjob *inputEnd = input+n; + assert(n >= unroll*8 && n <= 512); // should be close to 512 + __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 + __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 
2b loaded new strings bitmask per unroll + u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll + + if (unroll >= 4) { + while (input+delta1+delta2+delta3+delta4 < inputEnd) { + #include "fsst_avx512_unroll4.inc" + } + } else if (unroll == 3) { + while (input+delta1+delta2+delta3 < inputEnd) { + #include "fsst_avx512_unroll3.inc" + } + } else if (unroll == 2) { + while (input+delta1+delta2 < inputEnd) { + #include "fsst_avx512_unroll2.inc" + } + } else { + while (input+delta1 < inputEnd) { + #include "fsst_avx512_unroll1.inc" + } + } + + // flush the job states of the unfinished strings at the end of output[] + processed = n - (inputEnd - input); + u32 unfinished = 0; + if (unroll > 1) { + if (unroll > 2) { + if (unroll > 3) { + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); + unfinished += _mm_popcnt_u32((int) loadmask4); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); + unfinished += _mm_popcnt_u32((int) loadmask3); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); + unfinished += _mm_popcnt_u32((int) loadmask2); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); +#else + (void) symbolTable; + (void) codeBase; + (void) symbolBase; + (void) input; + (void) output; + (void) n; + (void) unroll; +#endif + return processed; +} +} // namespace libfsst diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc new file mode 100644 index 00000000000..0a74541dd88 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmaskX=11111111, deltaX=8). + jobX = _mm512_mask_expandloadu_epi64(jobX, loadmaskX, input); input += deltaX; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i wordX = _mm512_i64gather_epi64(_mm512_srli_epi64(jobX, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // codeX: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i codeX = _mm512_i64gather_epi64(_mm512_and_epi64(wordX, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. 
+ __m512i posX = _mm512_mullo_epi64(_mm512_and_epi64(wordX, all_FFFFFF), all_PRIME); + // hash them into a random number: posX = posX*PRIME; posX ^= posX>>SHIFT + posX = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(posX,_mm512_srli_epi64(posX,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i iclX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the writeX register (in case it turns out to be an escaped byte). + __m512i writeX = _mm512_slli_epi64(_mm512_and_epi64(wordX, all_FF), 8); + // lookup just like the iclX above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symbX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + posX = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(iclX, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 matchX = _mm512_cmpeq_epi64_mask(symbX, _mm512_and_epi64(wordX, posX)) & _mm512_cmplt_epi64_mask(iclX, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + codeX = _mm512_mask_mov_epi64(codeX, matchX, _mm512_srli_epi64(iclX, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + writeX = _mm512_or_epi64(writeX, _mm512_and_epi64(codeX, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + codeX = _mm512_and_epi64(codeX, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(jobX, all_M19), writeX, 1); + // increase the jobX.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + jobX = _mm512_add_epi64(jobX, _mm512_slli_epi64(_mm512_srli_epi64(codeX, FSST_LEN_BITS), 46)); + // increase the jobX.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + jobX = _mm512_add_epi64(jobX, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(codeX, 8), all_ONE))); + // test which lanes are done now (jobX.cur==jobX.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the jobX register) + loadmaskX = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(jobX, 46), _mm512_and_epi64(_mm512_srli_epi64(jobX, 28), all_M18)); + // calculate the amount of lanes in jobX that are done + deltaX = _mm_popcnt_u32((int) loadmaskX); + // write out the job state for the lanes that are done (we need the final 'jobX.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmaskX, jobX); output += deltaX; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc new file mode 100644 index 00000000000..f4b81c7970d --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom 
the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc new file mode 100644 index 00000000000..aa33cd7e69c --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc @@ -0,0 +1,114 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of 
this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) 
loadmask2); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc new file mode 100644 index 00000000000..e2057032abd --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc @@ -0,0 +1,171 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, 
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
+ __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). 
+ __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
+ write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, 
i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job 
state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc new file mode 100644 index 00000000000..15cca7c938b --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc @@ -0,0 +1,228 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation 
the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E4PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask4=11111111, delta4=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + job4 = _mm512_mask_expandloadu_epi64(job4, loadmask4, input); input += delta4; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + __m512i word4 = _mm512_i64gather_epi64(_mm512_srli_epi64(job4, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. 
It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code4: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code4 = _mm512_i64gather_epi64(_mm512_and_epi64(word4, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + __m512i pos4 = _mm512_mullo_epi64(_mm512_and_epi64(word4, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + // hash them into a random number: pos4 = pos4*PRIME; pos4 ^= pos4>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + pos4 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos4,_mm512_srli_epi64(pos4,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). 
+ // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write4 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + __m512i write4 = _mm512_slli_epi64(_mm512_and_epi64(word4, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl4 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + pos4 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl4, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + __mmask8 match4 = _mm512_cmpeq_epi64_mask(symb4, _mm512_and_epi64(word4, pos4)) & _mm512_cmplt_epi64_mask(icl4, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. 
+ code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + code4 = _mm512_mask_mov_epi64(code4, match4, _mm512_srli_epi64(icl4, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + write4 = _mm512_or_epi64(write4, _mm512_and_epi64(code4, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + code4 = _mm512_and_epi64(code4, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job4, all_M19), write4, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job4.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + job4 = _mm512_add_epi64(job4, _mm512_slli_epi64(_mm512_srli_epi64(code4, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job4.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = 
_mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + job4 = _mm512_add_epi64(job4, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code4, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + // test which lanes are done now (job4.cur==job4.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job4 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + loadmask4 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job4, 46), _mm512_and_epi64(_mm512_srli_epi64(job4, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + // calculate the amount of lanes in job4 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + delta4 = _mm_popcnt_u32((int) loadmask4); + // write out the job state for the lanes that are done (we 
need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job4.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; + _mm512_mask_compressstoreu_epi64(output, loadmask4, job4); output += delta4; diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.cpp b/cpp/src/parquet/thirdparty/fsst/libfsst.cpp new file mode 100644 index 00000000000..e3ba787b959 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/libfsst.cpp @@ -0,0 +1,651 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include "libfsst.hpp" + +namespace libfsst { +Symbol concat(Symbol a, Symbol b) { + Symbol s; + u32 length = a.length()+b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.store_num((b.load_num() << (8*a.length())) | a.load_num()); + return s; +} +} // namespace libfsst + +namespace std { +template <> +class hash { + public: + size_t operator()(const libfsst::QSymbol& q) const { + uint64_t k = q.symbol.load_num(); + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + uint64_t h = 0x8445d61a4e774912 ^ (8*m); + k *= m; + k ^= k >> r; + k *= m; + h ^= k; + h *= m; + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; + } +}; +} + +namespace libfsst { +bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } + +std::ostream& operator<<(std::ostream& out, const Symbol& s) { + for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { + SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); + int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) + size_t sampleFrac = 128; + + // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator + st->zeroTerminated = zeroTerminated; + if (zeroTerminated) { + st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency + } else { + u16 byteHisto[256]; + memset(byteHisto, 0, sizeof(byteHisto)); + for(size_t i=0; iterminator = 256; + while(i-- > 0) { + if (byteHisto[i] > minSize) continue; + st->terminator = i; + minSize = byteHisto[i]; + } + } + assert(st->terminator != 256); + + // a random number between 0 and 128 + auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; + + // compress sample, and compute (pair-)frequencies + auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain + int gain = 0; + + for(size_t i=0; i sampleFrac) continue; + } + if (cur < end) { + u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); + cur += st->symbols[code1].length(); + gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); + while (true) { + // count single symbol (i.e. an option is not extending it) + counters.count1Inc(code1); + + // as an alternative, consider just using the next byte.. + if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly + counters.count1Inc(*start); + + if (cur==end) { + break; + } + + // now match a new symbol + start = cur; + if (curhashTabSize-1); + Symbol s = st->hashTab[idx]; + code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) { + code2 = s.code(); + cur += s.length(); + } else if (code2 >= FSST_CODE_BASE) { + cur += 2; + } else { + code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; + cur += 1; + } + } else { + code2 = st->findLongestSymbol(cur, end); + cur += st->symbols[code2].length(); + } + + // compute compressed output size + gain += ((int) (cur-start))-(1+isEscapeCode(code2)); + + if (sampleFrac < 128) { // no need to count pairs in final round + // consider the symbol that is the concatenation of the two last symbols + counters.count2Inc(code1, code2); + + // as an alternative, consider just extending with the next byte.. + if ((cur-start) > 1) // ..but do not count single byte extensions doubly + counters.count2Inc(code1, *start); + } + code1 = code2; + } + } + } + return gain; + }; + + auto makeTable = [&](SymbolTable *st, Counters &counters) { + // hashmap of c (needed because we can generate duplicate candidates) + unordered_set cands; + + // artificially make terminater the most frequent symbol so it gets included + u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; + counters.count1Set(terminator,65535); + + auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { + if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! 
+ QSymbol q; + q.symbol = s; + q.gain = count * s.length(); + auto it = cands.find(q); + if (it != cands.end()) { + q.gain += (*it).gain; + cands.erase(*it); + } + cands.insert(q); + }; + + // add candidate symbols based on counted frequency + for (u32 pos1=0; pos1nSymbols; pos1++) { + u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! + if (!cnt1) continue; + + // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed + Symbol s1 = st->symbols[pos1]; + addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); + + if (sampleFrac >= 128 || // last round we do not create new (combined) symbols + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte + continue; + } + for (u32 pos2=0; pos2nSymbols; pos2++) { + u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! + if (!cnt2) continue; + + // create a new symbol + Symbol s2 = st->symbols[pos2]; + Symbol s3 = concat(s1, s2); + if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + addOrInc(cands, s3, cnt2); + } + } + + // insert candidates into priority queue (by gain) + auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); }; + priority_queue,decltype(cmpGn)> pq(cmpGn); + for (auto& q : cands) + pq.push(q); + + // Create new symbol map using best candidates + st->clear(); + while (st->nSymbols < 255 && !pq.empty()) { + QSymbol q = pq.top(); + pq.pop(); + st->add(q.symbol); + } + }; + + u8 bestCounters[512*sizeof(u16)]; +#ifdef NONOPT_FSST + for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { + sampleFrac = frac; +#else + for(sampleFrac=8; true; sampleFrac += 30) { +#endif + memset(&counters, 0, sizeof(Counters)); + long gain = compressCount(st, counters); + if (gain >= bestGain) 
{ // a new best solution! + counters.backup1(bestCounters); + *bestTable = *st; bestGain = gain; + } + if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) + makeTable(st, counters); + } + delete st; + counters.restore1(bestCounters); + makeTable(bestTable, counters); + bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression + return bestTable; +} + +#ifndef NONOPT_FSST +static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { + size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; + u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings + SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer + SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) + size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) + + while (curLine < nlines && outOff <= (1<<19)) { + size_t prevLine = curLine, chunk, curOff = 0; + + // bail out if the output buffer cannot hold the compressed next string fully + if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 + else budget -= (len[curLine]-curOff)*2; + + strOut[curLine] = (u8*) 0; + lenOut[curLine] = 0; + + do { + do { + chunk = len[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // large strings need to be chopped up into segments of 511 bytes + } + // create a job in this batch + SIMDjob job; + job.cur = inOff; + job.end = job.cur + chunk; + job.pos = batchPos; + job.out = outOff; + + // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) + outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) 
bytes. + if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while(curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines) || (((len[curLine])*2 + 7) > budget)) { // cannot accumulate more? + if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + 
*out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; cur += 2; + } else { + // 1 byte code or miss. + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse + cur++; + } + } + } + }; + + for(curLine=0; curLine 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + } + if ((2*chunk+7) > (size_t) (lim-out)) { + return curLine; // out of memory + } + // copy the string to the 511-byte buffer + memcpy(buf, cur, chunk); + buf[chunk] = (u8) symbolTable.terminator; + cur = buf; + end = cur + chunk; + + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true,false); + } else if (avoidBranch) { + compressVariant(false,true); + } else { + compressVariant(false, false); + } + } while((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; +} + +#define FSST_SAMPLELINE ((size_t) 512) + +// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes +vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { + size_t totSize = 0; + const size_t *lenIn = *lenRef; + vector sample; + + for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t*) encoder; +} + +/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol 
table */ +extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t*) e; +} + +// export a symbol table in compact format. +extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder*) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. + // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. 
+ // + // The version field is now there just for future-proofness, but not used yet + + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction + u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 + (((u64) e->symbolTable->suffixLim) << 24) | + (((u64) e->symbolTable->terminator) << 16) | + (((u64) e->symbolTable->nSymbols) << 8) | + FSST_ENDIAN_MARKER; // least significant byte is nonzero + + version = swap64_if_be(version); // ensure version is little-endian encoded + + /* do not assume unaligned reads here */ + memcpy(buf, &version, 8); + buf[8] = e->symbolTable->zeroTerminated; + for(u32 i=0; i<8; i++) + buf[9+i] = (u8) e->symbolTable->lenHisto[i]; + u32 pos = 17; + + // emit only the used bytes of the symbols + for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) + for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) + buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes + + return pos; // length of what was serialized +} + +#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ + +extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 const *buf) { + u64 version = 0; + u32 code, pos = 17; + u8 lenHisto[8]; + + // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) + memcpy(&version, buf, 8); + version = swap64_if_be(version); // version is always little-endian encoded + + if ((version>>32) != FSST_VERSION) return 0; + decoder->zeroTerminated = buf[8]&1; + memcpy(lenHisto, buf+9, 8); + + // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) + decoder->len[0] = 1; + decoder->symbol[0] = 0; + + // we use lenHisto[0] as 1-byte symbol run length (at the end) + code = decoder->zeroTerminated; + if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end 
+ + // now get all symbols from the buffer + for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ + for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { + decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ + decoder->symbol[code] = 0; + for(u32 j=0; jlen[code]; j++) + ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols + } + } + if (decoder->zeroTerminated) lenHisto[0]++; + + // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). + while(code<255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; +} + +// runtime check for simd +inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { +#ifndef NONOPT_FSST + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +#endif + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); +} +size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} + +// adaptive choosing of scalar compression method based on symbol length histogram +inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if 
((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} +size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +} +} // namespace libfsst + +using namespace libfsst; +// the main compression function (everything automatic) +extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { + // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) + size_t totLen = accumulate(lenIn, lenIn+nlines, 0); + int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); + return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); +} + +/* deallocate encoder */ +extern "C" void fsst_destroy(fsst_encoder_t* encoder) { + Encoder *e = (Encoder*) encoder; + delete e; +} + +/* very lazy implementation relying on export and import */ +extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { + u8 buf[sizeof(fsst_decoder_t)]; + u32 cnt1 = fsst_export(encoder, buf); + fsst_decoder_t decoder; + u32 cnt2 = fsst_import(&decoder, buf); + assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; + return decoder; +} diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp new file mode 100644 index 00000000000..f61bc0175b6 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp @@ -0,0 +1,471 @@ +// 
this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "fsst.h" // the official FSST API -- also usable by C mortals + +/* unsigned integers */ +namespace libfsst { +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +} // namespace libfsst + +#if UINTPTR_MAX == 0xffffffffU +// We're on a 32-bit platform +#define NONOPT_FSST +#endif + +#define FSST_ENDIAN_MARKER ((u64) 1) +#define FSST_VERSION_20190218 20190218 +#define FSST_VERSION ((u64) FSST_VERSION_20190218) + +// "symbols" are character sequences (up to 8 bytes) +// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: +// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. + +// we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length +#define FSST_LEN_BITS 12 +#define FSST_CODE_BITS 9 +#define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ +#define FSST_CODE_MAX (1UL<=8) { + len = 8; + memcpy(val.str, input, 8); + } else { + memcpy(val.str, input, len); + } + set_code_len(FSST_CODE_MAX, len); + } + void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + + u64 load_num() const { return swap64_if_be(val.num); } + void store_num(u64 v) { val.num = swap64_if_be(v); } + + u32 length() const { return (u32) (icl >> 28); } + u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } + u32 ignoredBits() const { return (u32) icl; } + + u8 first() const { assert( length() >= 1); return 0xFF & load_num(); } + u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } + +#define FSST_HASH_LOG2SIZE 10 +#define FSST_HASH_PRIME 2971215073LL +#define FSST_SHIFT 15 +#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) + size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes +}; + +// Symbol that can be put in a queue, ordered on gain +struct QSymbol{ + Symbol symbol; + mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols + bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } +}; + +// we construct FSST symbol tables using a random sample of about 16KB (1<<14) +#define FSST_SAMPLETARGET (1<<14) +#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) + +// two phases of compression, before and after optimize(): +// +// (1) to encode values we probe (and maintain) three datastructures: +// - u16 byteCodes[256] array at the position of the next byte (s.length==1) +// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) +// - Symbol 
hashtable[1024] (keyed by the next three bytes, ie for s.length>2), +// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are +// pseudo codes representing a single byte these will become escapes) +// +// (2) when we finished looking for the best symbol table we call optimize() to reshape it: +// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) +// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes +// (allows shortcut during compression) +// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding +// to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one +// byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] +// and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction +// is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was +// hence added to make symbolTable construction faster. 
+// +// this final layout allows for the fastest compression code, only currently present in compressBulk + +// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 +#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket + +// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key +// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster +// +// the gain field is only used in the symbol queue that sorts symbols on gain + +struct SymbolTable { + static const u32 hashTabSize = 1<> (u8) s.icl)); + return true; + } + bool add(Symbol s) { + assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); + u32 len = s.length(); + s.set_code_len(FSST_CODE_BASE + nSymbols, len); + if (len == 1) { + byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + } + if (s.length() >= 2) { + u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; + if (code >= FSST_CODE_BASE) return code; + } + return byteCodes[s.first()] & FSST_CODE_MASK; + } + u16 findLongestSymbol(const u8* cur, const u8* end) const { + return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol + } + + // rationale for finalize: + // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() + // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() + // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) + // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher 
compression perf. + // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). + // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] + // Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte + // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. + // + // In all, we change the layout and coding, as follows.. + // + // before finalize(): + // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 + // - The first 256 codes are pseudo symbols (all escaped bytes) + // + // after finalize(): + // - table layout is symbols[0..nSymbols>, with nSymbols < 256. + // - Real codes are [0,nSymbols>. 8-th bit not set. + // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 + // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) + // the two-byte codes are split in two sections: + // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression + // + // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
+ // + void finalize(u8 zeroTerminated) { + assert(nSymbols <= 255); + u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); + + // compute running sum of code lengths (starting offsets for each length) + rsum[0] = byteLim; // 1-byte codes are highest + rsum[1] = zeroTerminated; + for(u32 i=1; i<7; i++) + rsum[i+1] = rsum[i] + lenHisto[i]; + + // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim) + suffixLim = rsum[1]; + symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) + + for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s + opt = 0; + } + newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim + } else + newCode[i] = rsum[len-1]++; + s1.set_code_len(newCode[i],len); + symbols[newCode[i]] = s1; + } + // renumber the codes in byteCodes[] + for(u32 i=0; i<256; i++) + if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); + else + byteCodes[i] = 511 + (1 << FSST_LEN_BITS); + + // renumber the codes in shortCodes[] + for(u32 i=0; i<65536; i++) + if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); + else + shortCodes[i] = byteCodes[i&0xFF]; + + // replace the symbols in the hash table + for(u32 i=0; i>8; + } + void count1Inc(u32 pos1) { + if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) + count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... + } + void count2Inc(u32 pos1, u32 pos2) { + if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). 
This means (high > 0) <=> (cnt > 0) + // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively + count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample) + } + u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range + // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros + u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7] + + u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes + high = (high >> (zero << 3)) & 255; // advance to nonzero counter + if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 + return 0; // all zero + + u32 low = count1Low[pos1]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range + // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros + u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15] + high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters + + u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters + high = (high >> (zero << 2)) & 15; // advance to nonzero counter + if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! 
advance pos2 + return 0UL; // all zero + + u32 low = count2Low[pos1][pos2]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + void backup1(u8 *buf) { + memcpy(buf, count1High, FSST_CODE_MAX); + memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX); + } + void restore1(u8 *buf) { + memcpy(count1High, buf, FSST_CODE_MAX); + memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); + } +}; +#endif + + +#define FSST_BUFSZ (3<<19) // 768KB + +// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression +struct Encoder { + shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) + union { + Counters counters; // for counting symbol occurences during map construction + u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) + }; +}; + +// job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) +struct SIMDjob { + u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) +}; + +extern bool +fsst_hasAVX512(); // runtime check for avx512 capability + +extern size_t +fsst_compressAVX512( + SymbolTable &symbolTable, + u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) + u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) + SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. + SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer. 
+ size_t n, // IN: size of arrays input and output (should be max 512) + size_t unroll); // IN: degree of SIMD unrolling + +// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) +size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); +size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); +} // namespace libfsst diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 7109c3d1c83..7e5324814b0 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -259,6 +259,8 @@ std::string EncodingToString(Encoding::type t) { return "RLE_DICTIONARY"; case Encoding::BYTE_STREAM_SPLIT: return "BYTE_STREAM_SPLIT"; + case Encoding::FSST: + return "FSST"; default: return "UNKNOWN"; } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index c2040e555fd..0113ec86aa1 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -531,8 +531,9 @@ struct Encoding { DELTA_BYTE_ARRAY = 7, RLE_DICTIONARY = 8, BYTE_STREAM_SPLIT = 9, + FSST = 10, // Should always be last element (except UNKNOWN) - UNDEFINED = 10, + UNDEFINED = 11, UNKNOWN = 999 }; }; diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 14cd3e363a4..86f9f27305a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1487,6 +1487,7 @@ cdef encoding_name_from_enum(ParquetEncoding encoding_): ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY', ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY', ParquetEncoding_BYTE_STREAM_SPLIT: 'BYTE_STREAM_SPLIT', + ParquetEncoding_FSST: 'FSST', }.get(encoding_, 'UNKNOWN') @@ -1499,6 +1500,7 @@ cdef encoding_enum_from_name(str encoding_name): 'DELTA_BINARY_PACKED': ParquetEncoding_DELTA_BINARY_PACKED, 
'DELTA_LENGTH_BYTE_ARRAY': ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY, 'DELTA_BYTE_ARRAY': ParquetEncoding_DELTA_BYTE_ARRAY, + 'FSST': ParquetEncoding_FSST, 'RLE_DICTIONARY': 'dict', 'PLAIN_DICTIONARY': 'dict', }.get(encoding_name, None) diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd index 42d48ba050f..3663b8bcf62 100644 --- a/python/pyarrow/includes/libparquet.pxd +++ b/python/pyarrow/includes/libparquet.pxd @@ -130,6 +130,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY" ParquetEncoding_BYTE_STREAM_SPLIT \ " parquet::Encoding::BYTE_STREAM_SPLIT" + ParquetEncoding_FSST" parquet::Encoding::FSST" enum ParquetCompression" parquet::Compression::type": ParquetCompression_UNCOMPRESSED" parquet::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 24cb586c82b..5fb3cc5b699 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -821,7 +821,7 @@ def _sanitize_table(table, new_schema, flavor): Can only be used when ``use_dictionary`` is set to False, and cannot be used in combination with ``use_byte_stream_split``. Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY', 'FSST'}. Certain encodings are only compatible with certain data types. Please refer to the encodings section of `Reading and writing Parquet files `_. 
From 9ba9672710837cee922a2c5aa7dea1b78230b9f8 Mon Sep 17 00:00:00 2001 From: arnavb Date: Sun, 23 Nov 2025 14:20:27 +0000 Subject: [PATCH 02/16] update --- cpp/src/parquet/thirdparty/fsst/libfsst.hpp | 127 ++++++++++---------- 1 file changed, 65 insertions(+), 62 deletions(-) diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp index f61bc0175b6..e4d7c0aa9b9 100644 --- a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp +++ b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp @@ -1,20 +1,20 @@ // this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// +// // Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: -// +// // - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst #include #include #include @@ -59,7 +59,7 @@ typedef uint64_t u64; // we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length #define FSST_LEN_BITS 12 -#define FSST_CODE_BITS 9 +#define FSST_CODE_BITS 9 #define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ #define FSST_CODE_MAX (1UL<= 1); return 0xFF & load_num(); } u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } -#define FSST_HASH_LOG2SIZE 10 +#define FSST_HASH_LOG2SIZE 10 #define FSST_HASH_PRIME 2971215073LL #define FSST_SHIFT 15 #define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) @@ -129,7 +129,7 @@ struct QSymbol{ bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } }; -// we construct FSST symbol tables using a random sample of about 16KB (1<<14) +// we construct FSST symbol tables using a random sample of about 16KB (1<<14) #define FSST_SAMPLETARGET (1<<14) #define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) @@ -138,15 +138,15 @@ struct QSymbol{ // (1) to encode values we probe (and maintain) three datastructures: // - u16 byteCodes[256] array at the position of the next byte (s.length==1) // - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) -// - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2), -// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are +// - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2), +// this search will yield a u16 code, it points into Symbol symbols[]. 
You always find a hit, because the first 256 codes are // pseudo codes representing a single byte these will become escapes) // // (2) when we finished looking for the best symbol table we call optimize() to reshape it: // - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) -// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes +// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes // (allows shortcut during compression) -// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding +// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding // to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one // byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] // and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction @@ -173,9 +173,9 @@ struct SymbolTable { u16 byteCodes[256]; // contains code for every 1-byte symbol, otherwise code for pseudo byte (escaped byte) // 'symbols' is the current symbol table symbol[code].symbol is the max 8-byte 'symbol' for single-byte 'code' - Symbol symbols[FSST_CODE_MAX]; // x in [0,255]: pseudo symbols representing escaped byte x; x in [FSST_CODE_BASE=256,256+nSymbols]: real symbols + Symbol symbols[FSST_CODE_MAX]; // x in [0,255]: pseudo symbols representing escaped byte x; x in [FSST_CODE_BASE=256,256+nSymbols]: real symbols - // replicate long symbols in hashTab (avoid indirection). + // replicate long symbols in hashTab (avoid indirection). 
Symbol hashTab[hashTabSize]; // used for all symbols of 3 and more bytes u16 nSymbols; // amount of symbols in the map (max 255) @@ -225,8 +225,8 @@ struct SymbolTable { u32 idx = symbols[i].hash() & (hashTabSize-1); hashTab[idx].val.num = 0; hashTab[idx].icl = FSST_ICL_FREE; //marks empty in hashtab - } - } + } + } nSymbols = 0; // no need to clean symbols[] as no symbols are used } bool hashInsert(Symbol s) { @@ -256,11 +256,11 @@ struct SymbolTable { u16 findLongestSymbol(Symbol s) const { size_t idx = s.hash() & (hashTabSize-1); if (hashTab[idx].icl <= s.icl && hashTab[idx].load_num() == (s.load_num() & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) { - return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol } if (s.length() >= 2) { u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; - if (code >= FSST_CODE_BASE) return code; + if (code >= FSST_CODE_BASE) return code; } return byteCodes[s.first()] & FSST_CODE_MASK; } @@ -273,23 +273,23 @@ struct SymbolTable { // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher compression perf. - // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). + // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] // Using shortCodes[] only makes compression faster. 
When creating the symbolTable, however, using shortCodes[] for the single-byte // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. // // In all, we change the layout and coding, as follows.. // - // before finalize(): + // before finalize(): // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 // - The first 256 codes are pseudo symbols (all escaped bytes) // - // after finalize(): - // - table layout is symbols[0..nSymbols>, with nSymbols < 256. - // - Real codes are [0,nSymbols>. 8-th bit not set. + // after finalize(): + // - table layout is symbols[0..nSymbols>, with nSymbols < 256. + // - Real codes are [0,nSymbols>. 8-th bit not set. // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) - // the two-byte codes are split in two sections: + // the two-byte codes are split in two sections: // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression // // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
@@ -298,7 +298,7 @@ struct SymbolTable { assert(nSymbols <= 255); u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); - // compute running sum of code lengths (starting offsets for each length) + // compute running sum of code lengths (starting offsets for each length) rsum[0] = byteLim; // 1-byte codes are highest rsum[1] = zeroTerminated; for(u32 i=1; i<7; i++) @@ -308,34 +308,34 @@ struct SymbolTable { suffixLim = rsum[1]; symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) - for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s opt = 0; } - newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim - } else + newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim + } else newCode[i] = rsum[len-1]++; s1.set_code_len(newCode[i],len); - symbols[newCode[i]] = s1; + symbols[newCode[i]] = s1; } - // renumber the codes in byteCodes[] - for(u32 i=0; i<256; i++) + // renumber the codes in byteCodes[] + for(u32 i=0; i<256; i++) if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); - else + else byteCodes[i] = 511 + (1 << FSST_LEN_BITS); - - // renumber the codes in shortCodes[] + + // renumber the codes in shortCodes[] for(u32 i=0; i<65536; i++) if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); - else + else shortCodes[i] = byteCodes[i&0xFF]; // replace the symbols in the hash table @@ -347,22 +347,22 @@ struct SymbolTable { #ifdef NONOPT_FSST struct Counters { - u16 count1[FSST_CODE_MAX]; // array to count frequency of symbols as they occur in the sample - u16 count2[FSST_CODE_MAX][FSST_CODE_MAX]; // array to count subsequent combinations of two symbols in the sample + u16 count1[FSST_CODE_MAX]; // array to count 
frequency of symbols as they occur in the sample + u16 count2[FSST_CODE_MAX][FSST_CODE_MAX]; // array to count subsequent combinations of two symbols in the sample - void count1Set(u32 pos1, u16 val) { + void count1Set(u32 pos1, u16 val) { count1[pos1] = val; } - void count1Inc(u32 pos1) { + void count1Inc(u32 pos1) { count1[pos1]++; } - void count2Inc(u32 pos1, u32 pos2) { + void count2Inc(u32 pos1, u32 pos2) { count2[pos1][pos2]++; } - u32 count1GetNext(u32 &pos1) { + u32 count1GetNext(u32 &pos1) { return count1[pos1]; } - u32 count2GetNext(u32 pos1, u32 &pos2) { + u32 count2GetNext(u32 pos1, u32 &pos2) { return count2[pos1][pos2]; } void backup1(u8 *buf) { @@ -383,16 +383,16 @@ struct Counters { u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high) u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX]; // its value is (count2High*256+count2Low) -- but high is 4-bits (we put two numbers in one, hence /2) // 385KB -- but hot area likely just 10 + 30*4 = 130 cache lines (=8KB) - - void count1Set(u32 pos1, u16 val) { + + void count1Set(u32 pos1, u16 val) { count1Low[pos1] = val&255; count1High[pos1] = val>>8; } - void count1Inc(u32 pos1) { + void count1Inc(u32 pos1) { if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... } - void count2Inc(u32 pos1, u32 pos2) { + void count2Inc(u32 pos1, u32 pos2) { if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. 
(4K maxval, on a 8K sample) @@ -432,32 +432,32 @@ struct Counters { memcpy(count1High, buf, FSST_CODE_MAX); memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); } -}; +}; #endif #define FSST_BUFSZ (3<<19) // 768KB -// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression +// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression struct Encoder { shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) union { Counters counters; // for counting symbol occurences during map construction - u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) + u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) }; }; // job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) struct SIMDjob { - u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) + u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) }; -extern bool +extern bool fsst_hasAVX512(); // runtime check for avx512 capability -extern size_t +extern size_t fsst_compressAVX512( - SymbolTable &symbolTable, + SymbolTable &symbolTable, u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. 
@@ -465,6 +465,9 @@ fsst_compressAVX512( size_t n, // IN: size of arrays input and output (should be max 512) size_t unroll); // IN: degree of SIMD unrolling +// Symbol manipulation +Symbol concat(Symbol a, Symbol b); + // C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); From 12eef2e8d9884ed0c4efff97c91c2b604494a360 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 08:10:50 +0000 Subject: [PATCH 03/16] cmake build --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 35 + cpp/src/parquet/CMakeLists.txt | 29 +- cpp/src/parquet/decoder.cc | 2 +- cpp/src/parquet/encoder.cc | 2 +- cpp/src/parquet/encoding_test.cc | 2 +- cpp/src/parquet/thirdparty/fsst/fsst.cpp | 200 ------ cpp/src/parquet/thirdparty/fsst/fsst.h | 227 ------ .../parquet/thirdparty/fsst/fsst_avx512.cpp | 149 ---- .../parquet/thirdparty/fsst/fsst_avx512.inc | 57 -- .../thirdparty/fsst/fsst_avx512_unroll1.inc | 57 -- .../thirdparty/fsst/fsst_avx512_unroll2.inc | 114 --- .../thirdparty/fsst/fsst_avx512_unroll3.inc | 171 ----- .../thirdparty/fsst/fsst_avx512_unroll4.inc | 228 ------ cpp/src/parquet/thirdparty/fsst/libfsst.cpp | 651 ------------------ cpp/src/parquet/thirdparty/fsst/libfsst.hpp | 474 ------------- cpp/thirdparty/versions.txt | 3 + 16 files changed, 66 insertions(+), 2335 deletions(-) delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst.cpp delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst.h delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc delete mode 100644 
cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc delete mode 100644 cpp/src/parquet/thirdparty/fsst/libfsst.cpp delete mode 100644 cpp/src/parquet/thirdparty/fsst/libfsst.hpp diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4ced2a66bf5..8d67849ea47 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -49,6 +49,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES Boost Brotli BZip2 + fsst c-ares gflags glog @@ -183,6 +184,8 @@ macro(build_dependency DEPENDENCY_NAME) build_brotli() elseif("${DEPENDENCY_NAME}" STREQUAL "BZip2") build_bzip2() + elseif("${DEPENDENCY_NAME}" STREQUAL "fsst") + build_fsst() elseif("${DEPENDENCY_NAME}" STREQUAL "c-ares") build_cares() elseif("${DEPENDENCY_NAME}" STREQUAL "gflags") @@ -382,6 +385,7 @@ endif() if(ARROW_PARQUET) set(ARROW_WITH_RAPIDJSON ON) set(ARROW_WITH_THRIFT ON) + set(ARROW_WITH_FSST ON) endif() if(ARROW_WITH_THRIFT) @@ -637,6 +641,14 @@ else() ) endif() +if(DEFINED ENV{ARROW_FSST_URL}) + set(FSST_SOURCE_URL "$ENV{ARROW_FSST_URL}") +else() + set_urls(FSST_SOURCE_URL + "https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz" + "${THIRDPARTY_MIRROR_URL}/fsst-${ARROW_FSST_BUILD_VERSION}.tar.gz") +endif() + if(DEFINED ENV{ARROW_GBENCHMARK_URL}) set(GBENCHMARK_SOURCE_URL "$ENV{ARROW_GBENCHMARK_URL}") else() @@ -2604,6 +2616,29 @@ if(ARROW_USE_XSIMD) endif() endif() +function(build_fsst) + message(STATUS "Building FSST from source using FetchContent") + + fetchcontent_declare(fsst + URL ${FSST_SOURCE_URL} + URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") + + prepare_fetchcontent() + fetchcontent_makeavailable(fsst) + + set(ARROW_FSST_INCLUDE_DIR + "${fsst_SOURCE_DIR}" + CACHE 
INTERNAL "FSST include directory") + set(ARROW_FSST_SOURCES + "${fsst_SOURCE_DIR}/libfsst.cpp;${fsst_SOURCE_DIR}/fsst_avx512.cpp" + CACHE INTERNAL "FSST source files") + set(FSST_VENDORED TRUE CACHE INTERNAL "Whether FSST is built from source") +endfunction() + +if(ARROW_WITH_FSST) + resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) +endif() + macro(build_zlib) message(STATUS "Building ZLIB from source") diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e06d13d1884..a2d300d684d 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -42,6 +42,10 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) ${ARGN}) set(TEST_ARGUMENTS PREFIX "parquet" LABELS "parquet-tests") + set(_PARQUET_TEST_EXTRA_ARGS ${ARG_UNPARSED_ARGUMENTS}) + if(PARQUET_TEST_EXTRA_INCLUDES) + list(APPEND _PARQUET_TEST_EXTRA_ARGS EXTRA_INCLUDES ${PARQUET_TEST_EXTRA_INCLUDES}) + endif() if(ARROW_TEST_LINKAGE STREQUAL "static") add_test_case(${REL_TEST_NAME} @@ -49,14 +53,14 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) parquet_static ${PARQUET_TEST_LINK_LIBS} ${TEST_ARGUMENTS} - ${ARG_UNPARSED_ARGUMENTS}) + ${_PARQUET_TEST_EXTRA_ARGS}) else() add_test_case(${REL_TEST_NAME} STATIC_LINK_LIBS parquet_shared ${PARQUET_TEST_LINK_LIBS} ${TEST_ARGUMENTS} - ${ARG_UNPARSED_ARGUMENTS}) + ${_PARQUET_TEST_EXTRA_ARGS}) endif() endfunction() @@ -134,6 +138,9 @@ elseif(NOT MSVC) list(APPEND PARQUET_TEST_LINK_LIBS ${CMAKE_DL_LIBS}) endif() +set(PARQUET_TEST_EXTRA_INCLUDES) +set(PARQUET_PRIVATE_INCLUDE_DIRS) + # # Generated Thrift sources set(PARQUET_THRIFT_SOURCE_DIR "${ARROW_SOURCE_DIR}/src/generated/") @@ -171,8 +178,6 @@ set(PARQUET_SRCS encryption/internal_file_encryptor.cc exception.cc file_reader.cc - thirdparty/fsst/libfsst.cpp - thirdparty/fsst/fsst_avx512.cpp file_writer.cc geospatial/statistics.cc geospatial/util_internal.cc @@ -193,6 +198,14 @@ set(PARQUET_SRCS stream_writer.cc types.cc) +if(DEFINED ARROW_FSST_SOURCES) + list(APPEND PARQUET_SRCS 
${ARROW_FSST_SOURCES}) +endif() +if(DEFINED ARROW_FSST_INCLUDE_DIR) + list(APPEND PARQUET_PRIVATE_INCLUDE_DIRS ${ARROW_FSST_INCLUDE_DIR}) + list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) +endif() + if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. list(APPEND PARQUET_SRCS level_comparison_avx2.cc level_conversion_bmi2.cc) @@ -308,6 +321,14 @@ add_arrow_lib(parquet STATIC_INSTALL_INTERFACE_LIBS ${PARQUET_STATIC_INSTALL_INTERFACE_LIBS}) +if(PARQUET_PRIVATE_INCLUDE_DIRS) + foreach(_parquet_target parquet_objlib parquet_shared parquet_static) + if(TARGET ${_parquet_target}) + target_include_directories(${_parquet_target} PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) + endif() + endforeach() +endif() + if(WIN32 AND NOT (ARROW_TEST_LINKAGE STREQUAL "static")) add_library(parquet_test_support STATIC "${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp") diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index 77bc3c760ff..42838fd059f 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -53,7 +53,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "parquet/thirdparty/fsst/fsst.h" +#include "fsst.h" #include "parquet/types.h" #ifdef _MSC_VER diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 61b1bb9606c..00daa366f6a 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -50,7 +50,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "parquet/thirdparty/fsst/fsst.h" +#include "fsst.h" #include "parquet/types.h" #ifdef _MSC_VER diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index ca51d8e1b24..a43bafd0960 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -53,7 +53,7 @@ #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/test_util.h" -#include "parquet/thirdparty/fsst/fsst.h" 
+#include "fsst.h" #include "parquet/types.h" using arrow::default_memory_pool; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst.cpp b/cpp/src/parquet/thirdparty/fsst/fsst.cpp deleted file mode 100644 index c1a9cc34980..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2019, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -#ifdef FSST12 -#include "fsst12.h" // the official FSST API -- also usable by C mortals -#else -#include "fsst.h" // the official FSST API -- also usable by C mortals -#endif -#include -#include -#include -#include -#include -#include -using namespace std; - -// Utility to compress and decompress (-d) data with FSST (using stdin and stdout). 
-// -// The utility has a poor-man's async I/O in that it uses double buffering for input and output, -// and two background pthreads for reading and writing. The idea is to make the CPU overlap with I/O. -// -// The data format is quite simple. A FSST compressed file is a sequence of blocks, each with format: -// (1) 3-byte block length field (max blocksize is hence 16MB). This byte-length includes (1), (2) and (3). -// (2) FSST dictionary as produced by fst_export(). -// (3) the FSST compressed data. -// -// The natural strength of FSST is in fact not block-based compression, but rather the compression and -// *individual* decompression of many small strings separately. Think of compressed databases and (column-store) -// data formats. But, this utility is to serve as an apples-to-apples comparison point with utilities like lz4. - -namespace { - -class BinarySemaphore { - private: - mutex m; - condition_variable cv; - bool value; - - public: - explicit BinarySemaphore(bool initialValue = false) : value(initialValue) {} - void wait() { - unique_lock lock(m); - while (!value) cv.wait(lock); - value = false; - } - void post() { - { unique_lock lock(m); value = true; } - cv.notify_one(); - } -}; - -bool stopThreads = false; -BinarySemaphore srcDoneIO[2], dstDoneIO[2], srcDoneCPU[2], dstDoneCPU[2]; -unsigned char *srcBuf[2] = { NULL, NULL }; -unsigned char *dstBuf[2] = { NULL, NULL }; -unsigned char *dstMem[2] = { NULL, NULL }; -size_t srcLen[2] = { 0, 0 }; -size_t dstLen[2] = { 0, 0 }; - -#define FSST_MEMBUF (1ULL<<22) -int decompress = 0; -size_t blksz = FSST_MEMBUF-(1+FSST_MAXHEADER/2); // block size of compression (max compressed size must fit 3 bytes) - -#define DESERIALIZE(p) (((unsigned long long) (p)[0]) << 16) | (((unsigned long long) (p)[1]) << 8) | ((unsigned long long) (p)[2]) -#define SERIALIZE(l,p) { (p)[0] = ((l)>>16)&255; (p)[1] = ((l)>>8)&255; (p)[2] = (l)&255; } - -void reader(ifstream& src) { - for(int swap=0; true; swap = 1-swap) { - 
srcDoneCPU[swap].wait(); - if (stopThreads) break; - src.read((char*) srcBuf[swap], blksz); - srcLen[swap] = (unsigned long) src.gcount(); - if (decompress) { - if (blksz && srcLen[swap] == blksz) { - blksz = DESERIALIZE(srcBuf[swap]+blksz-3); // read size of next block - srcLen[swap] -= 3; // cut off size bytes - } else { - blksz = 0; - } - } - srcDoneIO[swap].post(); - } -} - -void writer(ofstream& dst) { - for(int swap=0; true; swap = 1-swap) { - dstDoneCPU[swap].wait(); - if (!dstLen[swap]) break; - dst.write((char*) dstBuf[swap], dstLen[swap]); - dstDoneIO[swap].post(); - } - for(int swap=0; swap<2; swap++) - dstDoneIO[swap].post(); -} - -} - -#ifdef FSST_STANDALONE -int main(int argc, char* argv[]) { - size_t srcTot = 0, dstTot = 0; - if (argc < 2 || argc > 4 || (argc == 4 && (argv[1][0] != '-' || argv[1][1] != 'd' || argv[1][2]))) { - cerr << "usage: " << argv[0] << " -d infile outfile" << endl; - cerr << " " << argv[0] << " infile outfile" << endl; - cerr << " " << argv[0] << " infile" << endl; - return -1; - } - decompress = (argc == 4); - string srcfile(argv[1+decompress]), dstfile; - if (argc == 2) { - dstfile = srcfile + ".fsst"; - } else { - dstfile = argv[2+decompress]; - } - ifstream src; - ofstream dst; - src.open(srcfile, ios::binary); - dst.open(dstfile, ios::binary); - dst.exceptions(ios_base::failbit); - dst.exceptions(ios_base::badbit); - src.exceptions(ios_base::badbit); - if (decompress) { - unsigned char tmp[3]; - src.read((char*) tmp, 3); - if (src.gcount() != 3) { - cerr << "failed to open input." 
<< endl; - return -1; - } - blksz = DESERIALIZE(tmp); // read first block size - } - vector buffer(FSST_MEMBUF*6); - srcBuf[0] = buffer.data(); - srcBuf[1] = srcBuf[0] + (FSST_MEMBUF*(1ULL+decompress)); - dstMem[0] = srcBuf[1] + (FSST_MEMBUF*(1ULL+decompress)); - dstMem[1] = dstMem[0] + (FSST_MEMBUF*(2ULL-decompress)); - - for(int swap=0; swap<2; swap++) { - srcDoneCPU[swap].post(); // input buffer is not being processed initially - dstDoneIO[swap].post(); // output buffer is not being written initially - } - thread readerThread([&src]{ reader(src); }); - thread writerThread([&dst]{ writer(dst); }); - - for(int swap=0; true; swap = 1-swap) { - srcDoneIO[swap].wait(); // wait until input buffer is available (i.e. done reading) - dstDoneIO[swap].wait(); // wait until output buffer is ready writing hence free for use - if (srcLen[swap] == 0) { - dstLen[swap] = 0; - break; - } - if (decompress) { - fsst_decoder_t decoder; - size_t hdr = fsst_import(&decoder, srcBuf[swap]); - dstLen[swap] = fsst_decompress(&decoder, srcLen[swap] - hdr, srcBuf[swap] + hdr, FSST_MEMBUF, dstBuf[swap] = dstMem[swap]); - } else { - unsigned char tmp[FSST_MAXHEADER]; - fsst_encoder_t* encoder = fsst_create(1, &srcLen[swap], const_cast(&srcBuf[swap]), 0); - size_t hdr = fsst_export(encoder, tmp); - if (fsst_compress(encoder, 1, &srcLen[swap], const_cast(&srcBuf[swap]), - FSST_MEMBUF * 2, dstMem[swap] + FSST_MAXHEADER + 3, - &dstLen[swap], &dstBuf[swap]) < 1) - return -1; - dstLen[swap] += 3 + hdr; - dstBuf[swap] -= 3 + hdr; - SERIALIZE(dstLen[swap],dstBuf[swap]); // block starts with size - copy(tmp, tmp+hdr, dstBuf[swap]+3); // then the header (followed by the compressed bytes which are already there) - fsst_destroy(encoder); - } - srcTot += srcLen[swap]; - dstTot += dstLen[swap]; - srcDoneCPU[swap].post(); // input buffer may be re-used by the reader for the next block - dstDoneCPU[swap].post(); // output buffer is ready for writing out - } - cerr << (decompress?"Dec":"C") << "ompressed " << 
srcTot << " bytes into " << dstTot << " bytes ==> " << (int) ((100*dstTot)/srcTot) << "%" << endl; - - // force wait until all background writes finished - stopThreads = true; - for(int swap=0; swap<2; swap++) { - srcDoneCPU[swap].post(); - dstDoneCPU[swap].post(); - } - dstDoneIO[0].wait(); - dstDoneIO[1].wait(); - readerThread.join(); - writerThread.join(); -} -#endif // FSST_STANDALONE diff --git a/cpp/src/parquet/thirdparty/fsst/fsst.h b/cpp/src/parquet/thirdparty/fsst/fsst.h deleted file mode 100644 index 71085d57201..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 - * - * =================================================================================================================================== - * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): - * - * Copyright 2018-2020, CWI, TU Munich, FSU Jena - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files - * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR - * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst - * =================================================================================================================================== - * - * FSST: Fast Static Symbol Table compression - * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf - * - * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. - * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual - * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is - * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. - * - * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) - * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression - * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could - * be seen as strings again and fit in whatever your program is that manipulates strings. - * - * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. - * - * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of - * unsigned char* pointers to their starts. 
A seperate length array (of unsigned int) denotes how many bytes each string consists of. - * - * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program - * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings). - * - * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are - * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length. - */ -#ifndef FSST_INCLUDED_H -#define FSST_INCLUDED_H - -#ifdef _MSC_VER -#define __restrict__ -#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ -#define __ORDER_LITTLE_ENDIAN__ 2 -#include -static inline int __builtin_ctzl(unsigned long long x) { - unsigned long ret; - _BitScanForward64(&ret, x); - return (int)ret; -} -#endif - -#ifdef __cplusplus -#define FSST_FALLTHROUGH [[fallthrough]] -#include -extern "C" { -#else -#define FSST_FALLTHROUGH -#endif - -#include - -/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */ -#define FSST_ESC 255 - -/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */ -typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */ - -/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */ -typedef struct { - unsigned long long version; /* version id */ - unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */ - unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). 
*/ - unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */ -} fsst_decoder_t; - -/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */ -fsst_encoder_t* -fsst_create( - size_t n, /* IN: number of strings in batch to sample from. */ - const size_t lenIn[], /* IN: byte-lengths of the inputs */ - const unsigned char *strIn[], /* IN: string start pointers. */ - int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */ -); - -/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */ -fsst_encoder_t* -fsst_duplicate( - fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */ -); - -#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */ - -/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */ -unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */ -fsst_export( - fsst_encoder_t *encoder, /* IN: the symbol table to dump. */ - unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */ -); - -/* Deallocate encoder. */ -void -fsst_destroy(fsst_encoder_t*); - -/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */ -unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */ -fsst_import( - fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */ - unsigned char const *buf /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */ -); - -/* Return a decoder structure from an encoder. 
*/ -fsst_decoder_t -fsst_decoder( - fsst_encoder_t *encoder -); - -/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */ -/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */ -size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */ -fsst_compress( - fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */ - size_t nstrings, /* IN: number of strings in batch to compress. */ - const size_t lenIn[], /* IN: byte-lengths of the inputs */ - const unsigned char *strIn[], /* IN: input string start pointers. */ - size_t outsize, /* IN: byte-length of output buffer. */ - unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */ - size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */ - unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */ -); - -/* Decompress a single string, inlined for speed. */ -inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */ -fsst_decompress( - const fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */ - size_t lenIn, /* IN: byte-length of compressed string. */ - const unsigned char *strIn, /* IN: compressed string. */ - size_t size, /* IN: byte-length of output buffer. */ - unsigned char *output /* OUT: memory buffer to put the decompressed string in. 
*/ -) { - unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len; - unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output; - unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol; - size_t code, posOut = 0, posIn = 0; -#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */ -#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long)) -#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - while (posOut+32 <= size && posIn+4 <= lenIn) { - unsigned int nextBlock, escapeMask; - memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int)); - escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); - if (escapeMask == 0) { - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - } else { - unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; - switch(firstEscapePos) { /* Duff's device */ - case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - // fall through - case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - // fall through - case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - // fall through - case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ - } - } - } - if (posOut+32 <= size) { // handle the possibly 3 last bytes without a loop - if (posIn+2 <= lenIn) 
{ - strOut[posOut] = strIn[posIn+1]; - if (strIn[posIn] != FSST_ESC) { - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - if (strIn[posIn] != FSST_ESC) { - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - } else { - posIn += 2; strOut[posOut++] = strIn[posIn-1]; - } - } else { - posIn += 2; posOut++; - } - } - if (posIn < lenIn) { // last code cannot be an escape - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - } - } -#else - while (posOut+8 <= size && posIn < lenIn) - if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */ - FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */ - posOut += len[code]; - } else { - strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */ - posIn++; posOut++; - } -#endif -#endif - while (posIn < lenIn) - if ((code = strIn[posIn++]) < FSST_ESC) { - size_t posWrite = posOut, endWrite = posOut + len[code]; - unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite; - if ((posOut = endWrite) > size) endWrite = size; - for(; posWrite < endWrite; posWrite++) /* only write if there is room */ - strOut[posWrite] = symbolPointer[posWrite]; - } else { - if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */ - posIn++; posOut++; - } - if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0; - return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */ -} - -#ifdef __cplusplus -} -#endif -#endif /* FSST_INCLUDED_H */ diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp deleted file mode 100644 index e43d3e03652..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// this software is distributed under the MIT License 
(http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -#include "libfsst.hpp" - -#if defined(__x86_64__) || defined(_M_X64) -#include - -#ifdef _WIN32 -namespace libfsst { -bool fsst_hasAVX512() { - int info[4]; - __cpuidex(info, 0x00000007, 0); - return (info[1]>>16)&1; -} -} // namespace libfsst -#else -#include -namespace libfsst { -bool fsst_hasAVX512() { - int info[4]; - __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]); - return (info[1]>>16)&1; -} -} // namespace libfsst -#endif -#else -namespace libfsst { -bool fsst_hasAVX512() { return false; } -} // namespace libfsst -#endif - -namespace libfsst { - -// BULK COMPRESSION OF STRINGS -// -// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. 
-// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up. -// -// In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4 -// unroll3 performs best on my hardware -// -// In the worst case, each final encoded string occupies 512KB bytes (512*1024; with 1024=512xexception, exception = 2 bytes). -// - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs). -// -// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits) -// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order. -// -// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings. -// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process. -// - so 'processed' is the amount of strings we started processing and it is between [480,512]. -// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished. -// - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method. -// -// Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings. -// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings. -// -// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to. -// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch). 
-// -// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS -// in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_* expressions however, -// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them. -// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. -// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. - -size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { - size_t processed = 0; - // define some constants (all_x means that all 8 lanes contain 64-bits value X) -#ifdef __AVX512F__ - //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c - __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); - __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); - __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); -#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) -#define all_ONE _mm512_srli_epi64(all_MASK, 63) -#define all_M19 _mm512_srli_epi64(all_MASK, 45) -#define all_M18 _mm512_srli_epi64(all_MASK, 46) -#define all_M28 _mm512_srli_epi64(all_MASK, 36) -#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) -#define all_FFFF _mm512_srli_epi64(all_MASK, 48) -#define all_FF _mm512_srli_epi64(all_MASK, 56) - - SIMDjob *inputEnd = input+n; - assert(n >= unroll*8 && n <= 512); // should be close to 512 - __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 - __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 
2b loaded new strings bitmask per unroll - u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll - - if (unroll >= 4) { - while (input+delta1+delta2+delta3+delta4 < inputEnd) { - #include "fsst_avx512_unroll4.inc" - } - } else if (unroll == 3) { - while (input+delta1+delta2+delta3 < inputEnd) { - #include "fsst_avx512_unroll3.inc" - } - } else if (unroll == 2) { - while (input+delta1+delta2 < inputEnd) { - #include "fsst_avx512_unroll2.inc" - } - } else { - while (input+delta1 < inputEnd) { - #include "fsst_avx512_unroll1.inc" - } - } - - // flush the job states of the unfinished strings at the end of output[] - processed = n - (inputEnd - input); - u32 unfinished = 0; - if (unroll > 1) { - if (unroll > 2) { - if (unroll > 3) { - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); - unfinished += _mm_popcnt_u32((int) loadmask4); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); - unfinished += _mm_popcnt_u32((int) loadmask3); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); - unfinished += _mm_popcnt_u32((int) loadmask2); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); -#else - (void) symbolTable; - (void) codeBase; - (void) symbolBase; - (void) input; - (void) output; - (void) n; - (void) unroll; -#endif - return processed; -} -} // namespace libfsst diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc deleted file mode 100644 index 0a74541dd88..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc +++ /dev/null @@ -1,57 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmaskX=11111111, deltaX=8). - jobX = _mm512_mask_expandloadu_epi64(jobX, loadmaskX, input); input += deltaX; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - __m512i wordX = _mm512_i64gather_epi64(_mm512_srli_epi64(jobX, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // codeX: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i codeX = _mm512_i64gather_epi64(_mm512_and_epi64(wordX, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. 
- __m512i posX = _mm512_mullo_epi64(_mm512_and_epi64(wordX, all_FFFFFF), all_PRIME); - // hash them into a random number: posX = posX*PRIME; posX ^= posX>>SHIFT - posX = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(posX,_mm512_srli_epi64(posX,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - __m512i iclX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the writeX register (in case it turns out to be an escaped byte). - __m512i writeX = _mm512_slli_epi64(_mm512_and_epi64(wordX, all_FF), 8); - // lookup just like the iclX above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symbX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - posX = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(iclX, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - __mmask8 matchX = _mm512_cmpeq_epi64_mask(symbX, _mm512_and_epi64(wordX, posX)) & _mm512_cmplt_epi64_mask(iclX, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - codeX = _mm512_mask_mov_epi64(codeX, matchX, _mm512_srli_epi64(iclX, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - writeX = _mm512_or_epi64(writeX, _mm512_and_epi64(codeX, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - codeX = _mm512_and_epi64(codeX, all_FFFF); - // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(jobX, all_M19), writeX, 1); - // increase the jobX.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - jobX = _mm512_add_epi64(jobX, _mm512_slli_epi64(_mm512_srli_epi64(codeX, FSST_LEN_BITS), 46)); - // increase the jobX.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - jobX = _mm512_add_epi64(jobX, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(codeX, 8), all_ONE))); - // test which lanes are done now (jobX.cur==jobX.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the jobX register) - loadmaskX = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(jobX, 46), _mm512_and_epi64(_mm512_srli_epi64(jobX, 28), all_M18)); - // calculate the amount of lanes in jobX that are done - deltaX = _mm_popcnt_u32((int) loadmaskX); - // write out the job state for the lanes that are done (we need the final 'jobX.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmaskX, jobX); output += deltaX; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc deleted file mode 100644 index f4b81c7970d..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc +++ /dev/null @@ -1,57 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to 
whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). - job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. 
- __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); - // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT - pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). - __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); - // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - code1 = _mm512_and_epi64(code1, all_FFFF); - // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); - // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); - // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); - // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) - loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); - // calculate the amount of lanes in job1 that are done - delta1 = _mm_popcnt_u32((int) loadmask1); - // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc deleted file mode 100644 index aa33cd7e69c..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc +++ /dev/null @@ -1,114 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a 
copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-// -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). - job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; - job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); - __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. - // get the first three bytes of the string. 
- __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); - __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); - // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT - // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT - pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); - pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). - __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); - __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); - // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
- pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); - pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); - __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); - code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); - write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - code1 = _mm512_and_epi64(code1, all_FFFF); - code2 = _mm512_and_epi64(code2, all_FFFF); - // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); - // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); - job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); - // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); - job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); - // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) - // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) - loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); - loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); - // calculate the amount of lanes in job1 that are done - // calculate the amount of lanes in job2 that are done - delta1 = _mm_popcnt_u32((int) loadmask1); - delta2 = _mm_popcnt_u32((int) 
loadmask2); - // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; - _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc deleted file mode 100644 index e2057032abd..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc +++ /dev/null @@ -1,171 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, 
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// -// -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// -// -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). - job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; - job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; - job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
- __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); - __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); - __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. - // get the first three bytes of the string. - // get the first three bytes of the string. 
- __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); - __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); - __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); - // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT - // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT - // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT - pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); - pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); - pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). - __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); - __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); - __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); - // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. - // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); - pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); - pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). 
- __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); - __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); - __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); - code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); - code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
- write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); - write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); - write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - code1 = _mm512_and_epi64(code1, all_FFFF); - code2 = _mm512_and_epi64(code2, all_FFFF); - code3 = _mm512_and_epi64(code3, all_FFFF); - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); - // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); - job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); - job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); - // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, 
i.e the 8th) - // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); - job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); - job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); - // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) - // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) - // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) - loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); - loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); - loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); - // calculate the amount of lanes in job1 that are done - // calculate the amount of lanes in job2 that are done - // calculate the amount of lanes in job3 that are done - delta1 = _mm_popcnt_u32((int) loadmask1); - delta2 = _mm_popcnt_u32((int) loadmask2); - delta3 = _mm_popcnt_u32((int) loadmask3); - // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) - // write out the job 
state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; - _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; - _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc deleted file mode 100644 index 15cca7c938b..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc +++ /dev/null @@ -1,228 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// -// -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// -// -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without 
limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// -// -// -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-// -// -// -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E4PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-// -// -// -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// -// -// -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask4=11111111, delta4=8). - job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; - job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; - job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; - job4 = _mm512_mask_expandloadu_epi64(job4, loadmask4, input); input += delta4; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); - __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); - __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); - __m512i word4 = _mm512_i64gather_epi64(_mm512_srli_epi64(job4, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. 
It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code4: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code4 = _mm512_i64gather_epi64(_mm512_and_epi64(word4, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. - // get the first three bytes of the string. - // get the first three bytes of the string. - // get the first three bytes of the string. 
- __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); - __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); - __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); - __m512i pos4 = _mm512_mullo_epi64(_mm512_and_epi64(word4, all_FFFFFF), all_PRIME); - // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT - // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT - // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT - // hash them into a random number: pos4 = pos4*PRIME; pos4 ^= pos4>>SHIFT - pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); - pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); - pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); - pos4 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos4,_mm512_srli_epi64(pos4,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). 
- // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write4 register (in case it turns out to be an escaped byte). - __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); - __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); - __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); - __m512i write4 = _mm512_slli_epi64(_mm512_and_epi64(word4, all_FF), 8); - // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl4 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
- pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); - pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); - pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); - pos4 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl4, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); - __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); - __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); - __mmask8 match4 = _mm512_cmpeq_epi64_mask(symb4, _mm512_and_epi64(word4, pos4)) & _mm512_cmplt_epi64_mask(icl4, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. 
- code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); - code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); - code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); - code4 = _mm512_mask_mov_epi64(code4, match4, _mm512_srli_epi64(icl4, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); - write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); - write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); - write4 = _mm512_or_epi64(write4, _mm512_and_epi64(code4, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - code1 = _mm512_and_epi64(code1, all_FFFF); - code2 = _mm512_and_epi64(code2, all_FFFF); - code3 = _mm512_and_epi64(code3, all_FFFF); - code4 = _mm512_and_epi64(code4, all_FFFF); - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job4, all_M19), write4, 1); - // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job4.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); - job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); - job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); - job4 = _mm512_add_epi64(job4, _mm512_slli_epi64(_mm512_srli_epi64(code4, FSST_LEN_BITS), 46)); - // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job4.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - job1 = 
_mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); - job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); - job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); - job4 = _mm512_add_epi64(job4, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code4, 8), all_ONE))); - // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) - // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) - // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) - // test which lanes are done now (job4.cur==job4.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job4 register) - loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); - loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); - loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); - loadmask4 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job4, 46), _mm512_and_epi64(_mm512_srli_epi64(job4, 28), all_M18)); - // calculate the amount of lanes in job1 that are done - // calculate the amount of lanes in job2 that are done - // calculate the amount of lanes in job3 that are done - // calculate the amount of lanes in job4 that are done - delta1 = _mm_popcnt_u32((int) loadmask1); - delta2 = _mm_popcnt_u32((int) loadmask2); - delta3 = _mm_popcnt_u32((int) loadmask3); - delta4 = _mm_popcnt_u32((int) loadmask4); - // write out the job state for the lanes that are done (we 
need the final 'job1.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job4.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; - _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; - _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; - _mm512_mask_compressstoreu_epi64(output, loadmask4, job4); output += delta4; diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.cpp b/cpp/src/parquet/thirdparty/fsst/libfsst.cpp deleted file mode 100644 index e3ba787b959..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/libfsst.cpp +++ /dev/null @@ -1,651 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -#include "libfsst.hpp" - -namespace libfsst { -Symbol concat(Symbol a, Symbol b) { - Symbol s; - u32 length = a.length()+b.length(); - if (length > Symbol::maxLength) length = Symbol::maxLength; - s.set_code_len(FSST_CODE_MASK, length); - s.store_num((b.load_num() << (8*a.length())) | a.load_num()); - return s; -} -} // namespace libfsst - -namespace std { -template <> -class hash { - public: - size_t operator()(const libfsst::QSymbol& q) const { - uint64_t k = q.symbol.load_num(); - const uint64_t m = 0xc6a4a7935bd1e995; - const int r = 47; - uint64_t h = 0x8445d61a4e774912 ^ (8*m); - k *= m; - k ^= k >> r; - k *= m; - h ^= k; - h *= m; - h ^= h >> r; - h *= m; - h ^= h >> r; - return h; - } -}; -} - -namespace libfsst { -bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } - -std::ostream& operator<<(std::ostream& out, const Symbol& s) { - for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { - SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); - int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) - size_t sampleFrac = 128; - - // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator - st->zeroTerminated = zeroTerminated; - if (zeroTerminated) { - st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency - } else { - u16 byteHisto[256]; - memset(byteHisto, 0, sizeof(byteHisto)); - for(size_t i=0; iterminator = 256; - while(i-- > 0) { - if (byteHisto[i] > minSize) continue; - st->terminator = i; - minSize = byteHisto[i]; - } - } - assert(st->terminator != 256); - - // a random number between 0 and 128 - auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; - - // compress sample, and compute (pair-)frequencies - auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain - int gain = 0; - - for(size_t i=0; i sampleFrac) continue; - } - if (cur < end) { - u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); - cur += st->symbols[code1].length(); - gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); - while (true) { - // count single symbol (i.e. an option is not extending it) - counters.count1Inc(code1); - - // as an alternative, consider just using the next byte.. - if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly - counters.count1Inc(*start); - - if (cur==end) { - break; - } - - // now match a new symbol - start = cur; - if (curhashTabSize-1); - Symbol s = st->hashTab[idx]; - code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; - word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) { - code2 = s.code(); - cur += s.length(); - } else if (code2 >= FSST_CODE_BASE) { - cur += 2; - } else { - code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; - cur += 1; - } - } else { - code2 = st->findLongestSymbol(cur, end); - cur += st->symbols[code2].length(); - } - - // compute compressed output size - gain += ((int) (cur-start))-(1+isEscapeCode(code2)); - - if (sampleFrac < 128) { // no need to count pairs in final round - // consider the symbol that is the concatenation of the two last symbols - counters.count2Inc(code1, code2); - - // as an alternative, consider just extending with the next byte.. - if ((cur-start) > 1) // ..but do not count single byte extensions doubly - counters.count2Inc(code1, *start); - } - code1 = code2; - } - } - } - return gain; - }; - - auto makeTable = [&](SymbolTable *st, Counters &counters) { - // hashmap of c (needed because we can generate duplicate candidates) - unordered_set cands; - - // artificially make terminater the most frequent symbol so it gets included - u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; - counters.count1Set(terminator,65535); - - auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { - if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! 
- QSymbol q; - q.symbol = s; - q.gain = count * s.length(); - auto it = cands.find(q); - if (it != cands.end()) { - q.gain += (*it).gain; - cands.erase(*it); - } - cands.insert(q); - }; - - // add candidate symbols based on counted frequency - for (u32 pos1=0; pos1nSymbols; pos1++) { - u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! - if (!cnt1) continue; - - // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed - Symbol s1 = st->symbols[pos1]; - addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); - - if (sampleFrac >= 128 || // last round we do not create new (combined) symbols - s1.length() == Symbol::maxLength || // symbol cannot be extended - s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte - continue; - } - for (u32 pos2=0; pos2nSymbols; pos2++) { - u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! - if (!cnt2) continue; - - // create a new symbol - Symbol s2 = st->symbols[pos2]; - Symbol s3 = concat(s1, s2); - if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte - addOrInc(cands, s3, cnt2); - } - } - - // insert candidates into priority queue (by gain) - auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); }; - priority_queue,decltype(cmpGn)> pq(cmpGn); - for (auto& q : cands) - pq.push(q); - - // Create new symbol map using best candidates - st->clear(); - while (st->nSymbols < 255 && !pq.empty()) { - QSymbol q = pq.top(); - pq.pop(); - st->add(q.symbol); - } - }; - - u8 bestCounters[512*sizeof(u16)]; -#ifdef NONOPT_FSST - for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { - sampleFrac = frac; -#else - for(sampleFrac=8; true; sampleFrac += 30) { -#endif - memset(&counters, 0, sizeof(Counters)); - long gain = compressCount(st, counters); - if (gain >= bestGain) 
{ // a new best solution! - counters.backup1(bestCounters); - *bestTable = *st; bestGain = gain; - } - if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) - makeTable(st, counters); - } - delete st; - counters.restore1(bestCounters); - makeTable(bestTable, counters); - bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression - return bestTable; -} - -#ifndef NONOPT_FSST -static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { - size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; - u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings - SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer - SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) - size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) - - while (curLine < nlines && outOff <= (1<<19)) { - size_t prevLine = curLine, chunk, curOff = 0; - - // bail out if the output buffer cannot hold the compressed next string fully - if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 - else budget -= (len[curLine]-curOff)*2; - - strOut[curLine] = (u8*) 0; - lenOut[curLine] = 0; - - do { - do { - chunk = len[curLine] - curOff; - if (chunk > 511) { - chunk = 511; // large strings need to be chopped up into segments of 511 bytes - } - // create a job in this batch - SIMDjob job; - job.cur = inOff; - job.end = job.cur + chunk; - job.pos = batchPos; - job.out = outOff; - - // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) - outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) 
bytes. - if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk - - // register job in this batch - input[batchPos] = job; - jobLine[batchPos] = curLine; - - if (chunk == 0) { - empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out - } else { - // copy string chunk into temp buffer - memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); - inOff += chunk; - curOff += chunk; - symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded - } - if (++batchPos == 512) break; - } while(curOff < len[curLine]); - - if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines) || (((len[curLine])*2 + 7) > budget)) { // cannot accumulate more? - if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) - // radix-sort jobs on length (longest string first) - // -- this provides best load balancing and allows to skip empty jobs at the end - u16 sortpos[513]; - memset(sortpos, 0, sizeof(sortpos)); - - // calculate length histo - for(size_t i=0; i> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { - *out++ = (u8) s.code(); cur += s.length(); - } else { - // could be a 2-byte or 1-byte code, or miss - // handle everything with predication - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); - cur += (code>>FSST_LEN_BITS); - } - } - job.out = out - codeBase; - } - // postprocess job info - job.cur = 0; - job.end = job.out - input[job.pos].out; // misuse .end field as compressed size - job.out = input[job.pos].out; // reset offset to start of encoded string - input[job.pos] = job; - } - - // copy out the result data - for(size_t i=0; i> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { - *out++ = (u8) s.code(); cur += s.length(); - } else if (avoidBranch) { - // could be a 2-byte or 1-byte code, or miss - // handle everything with predication - 
*out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); - cur += (code>>FSST_LEN_BITS); - } else if ((u8) code < byteLim) { - // 2 byte code after checking there is no longer pattern - *out++ = (u8) code; cur += 2; - } else { - // 1 byte code or miss. - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse - cur++; - } - } - } - }; - - for(curLine=0; curLine 511) { - chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST - } - if ((2*chunk+7) > (size_t) (lim-out)) { - return curLine; // out of memory - } - // copy the string to the 511-byte buffer - memcpy(buf, cur, chunk); - buf[chunk] = (u8) symbolTable.terminator; - cur = buf; - end = cur + chunk; - - // based on symboltable stats, choose a variant that is nice to the branch predictor - if (noSuffixOpt) { - compressVariant(true,false); - } else if (avoidBranch) { - compressVariant(false,true); - } else { - compressVariant(false, false); - } - } while((curOff += chunk) < lenIn[curLine]); - lenOut[curLine] = (size_t) (out - strOut[curLine]); - } - return curLine; -} - -#define FSST_SAMPLELINE ((size_t) 512) - -// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes -vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { - size_t totSize = 0; - const size_t *lenIn = *lenRef; - vector sample; - - for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample - Encoder *encoder = new Encoder(); - encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); - if (sampleLen != lenIn) delete[] sampleLen; - delete[] sampleBuf; - return (fsst_encoder_t*) encoder; -} - -/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol 
table */ -extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { - Encoder *e = new Encoder(); - e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr - return (fsst_encoder_t*) e; -} - -// export a symbol table in compact format. -extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { - Encoder *e = (Encoder*) encoder; - // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. - // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t - // (such functionality could be useful to append compressed data to an existing block). - // - // However, the hash function in the encoder hash table is endian-sensitive, and given its - // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. - // Doing a endian-conversion during hashing will be slow and self-defeating. - // - // Overall, we could support reconstructing an encoder for incremental compression, but - // should enforce equal-endianness. Bit of a bummer. Not going there now. 
- // - // The version field is now there just for future-proofness, but not used yet - - // version allows keeping track of fsst versions, track endianness, and encoder reconstruction - u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 - (((u64) e->symbolTable->suffixLim) << 24) | - (((u64) e->symbolTable->terminator) << 16) | - (((u64) e->symbolTable->nSymbols) << 8) | - FSST_ENDIAN_MARKER; // least significant byte is nonzero - - version = swap64_if_be(version); // ensure version is little-endian encoded - - /* do not assume unaligned reads here */ - memcpy(buf, &version, 8); - buf[8] = e->symbolTable->zeroTerminated; - for(u32 i=0; i<8; i++) - buf[9+i] = (u8) e->symbolTable->lenHisto[i]; - u32 pos = 17; - - // emit only the used bytes of the symbols - for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) - for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) - buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes - - return pos; // length of what was serialized -} - -#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ - -extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 const *buf) { - u64 version = 0; - u32 code, pos = 17; - u8 lenHisto[8]; - - // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) - memcpy(&version, buf, 8); - version = swap64_if_be(version); // version is always little-endian encoded - - if ((version>>32) != FSST_VERSION) return 0; - decoder->zeroTerminated = buf[8]&1; - memcpy(lenHisto, buf+9, 8); - - // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) - decoder->len[0] = 1; - decoder->symbol[0] = 0; - - // we use lenHisto[0] as 1-byte symbol run length (at the end) - code = decoder->zeroTerminated; - if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end 
- - // now get all symbols from the buffer - for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ - for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { - decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ - decoder->symbol[code] = 0; - for(u32 j=0; jlen[code]; j++) - ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols - } - } - if (decoder->zeroTerminated) lenHisto[0]++; - - // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). - while(code<255) { - decoder->symbol[code] = FSST_CORRUPT; - decoder->len[code++] = 8; - } - return pos; -} - -// runtime check for simd -inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { -#ifndef NONOPT_FSST - if (simd && fsst_hasAVX512()) - return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); -#endif - (void) simd; - return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); -} -size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { - return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); -} - -// adaptive choosing of scalar compression method based on symbol length histogram -inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { - bool avoidBranch = false, noSuffixOpt = false; - if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { - noSuffixOpt = true; - } else if 
((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && - (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && - (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { - avoidBranch = true; - } - return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); -} -size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { - return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); -} -} // namespace libfsst - -using namespace libfsst; -// the main compression function (everything automatic) -extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { - // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) - size_t totLen = accumulate(lenIn, lenIn+nlines, 0); - int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); - return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); -} - -/* deallocate encoder */ -extern "C" void fsst_destroy(fsst_encoder_t* encoder) { - Encoder *e = (Encoder*) encoder; - delete e; -} - -/* very lazy implementation relying on export and import */ -extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { - u8 buf[sizeof(fsst_decoder_t)]; - u32 cnt1 = fsst_export(encoder, buf); - fsst_decoder_t decoder; - u32 cnt2 = fsst_import(&decoder, buf); - assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; - return decoder; -} diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp deleted file mode 100644 index e4d7c0aa9b9..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp +++ /dev/null @@ -1,474 +0,0 @@ 
-// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -#include "fsst.h" // the official FSST API -- also usable by C mortals - -/* unsigned integers */ -namespace libfsst { -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; -} // namespace libfsst - -#if UINTPTR_MAX == 0xffffffffU -// We're on a 32-bit platform -#define NONOPT_FSST -#endif - -#define FSST_ENDIAN_MARKER ((u64) 1) -#define FSST_VERSION_20190218 20190218 -#define FSST_VERSION ((u64) FSST_VERSION_20190218) - -// "symbols" are character sequences (up to 8 bytes) -// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: -// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. - -// we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length -#define FSST_LEN_BITS 12 -#define FSST_CODE_BITS 9 -#define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ -#define FSST_CODE_MAX (1UL<=8) { - len = 8; - memcpy(val.str, input, 8); - } else { - memcpy(val.str, input, len); - } - set_code_len(FSST_CODE_MAX, len); - } - void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } - - u64 load_num() const { return swap64_if_be(val.num); } - void store_num(u64 v) { val.num = swap64_if_be(v); } - - u32 length() const { return (u32) (icl >> 28); } - u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } - u32 ignoredBits() const { return (u32) icl; } - - u8 first() const { assert( length() >= 1); return 0xFF & load_num(); } - u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } - -#define FSST_HASH_LOG2SIZE 10 -#define FSST_HASH_PRIME 2971215073LL -#define FSST_SHIFT 15 -#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) - size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes -}; - -// Symbol that can be put in a queue, ordered on gain -struct QSymbol{ - Symbol symbol; - mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols - bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } -}; - -// we construct FSST symbol tables using a random sample of about 16KB (1<<14) -#define FSST_SAMPLETARGET (1<<14) -#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) - -// two phases of compression, before and after optimize(): -// -// (1) to encode values we probe (and maintain) three datastructures: -// - u16 byteCodes[256] array at the position of the next byte (s.length==1) -// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) -// - Symbol 
hashtable[1024] (keyed by the next three bytes, ie for s.length>2), -// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are -// pseudo codes representing a single byte these will become escapes) -// -// (2) when we finished looking for the best symbol table we call optimize() to reshape it: -// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) -// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes -// (allows shortcut during compression) -// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding -// to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one -// byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] -// and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction -// is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was -// hence added to make symbolTable construction faster. 
-// -// this final layout allows for the fastest compression code, only currently present in compressBulk - -// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 -#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket - -// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key -// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster -// -// the gain field is only used in the symbol queue that sorts symbols on gain - -struct SymbolTable { - static const u32 hashTabSize = 1<> (u8) s.icl)); - return true; - } - bool add(Symbol s) { - assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); - u32 len = s.length(); - s.set_code_len(FSST_CODE_BASE + nSymbols, len); - if (len == 1) { - byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { - return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol - } - if (s.length() >= 2) { - u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; - if (code >= FSST_CODE_BASE) return code; - } - return byteCodes[s.first()] & FSST_CODE_MASK; - } - u16 findLongestSymbol(const u8* cur, const u8* end) const { - return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol - } - - // rationale for finalize: - // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() - // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() - // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) - // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher 
compression perf. - // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). - // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] - // Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte - // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. - // - // In all, we change the layout and coding, as follows.. - // - // before finalize(): - // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 - // - The first 256 codes are pseudo symbols (all escaped bytes) - // - // after finalize(): - // - table layout is symbols[0..nSymbols>, with nSymbols < 256. - // - Real codes are [0,nSymbols>. 8-th bit not set. - // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 - // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) - // the two-byte codes are split in two sections: - // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression - // - // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
- // - void finalize(u8 zeroTerminated) { - assert(nSymbols <= 255); - u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); - - // compute running sum of code lengths (starting offsets for each length) - rsum[0] = byteLim; // 1-byte codes are highest - rsum[1] = zeroTerminated; - for(u32 i=1; i<7; i++) - rsum[i+1] = rsum[i] + lenHisto[i]; - - // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim) - suffixLim = rsum[1]; - symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) - - for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s - opt = 0; - } - newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim - } else - newCode[i] = rsum[len-1]++; - s1.set_code_len(newCode[i],len); - symbols[newCode[i]] = s1; - } - // renumber the codes in byteCodes[] - for(u32 i=0; i<256; i++) - if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) - byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); - else - byteCodes[i] = 511 + (1 << FSST_LEN_BITS); - - // renumber the codes in shortCodes[] - for(u32 i=0; i<65536; i++) - if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) - shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); - else - shortCodes[i] = byteCodes[i&0xFF]; - - // replace the symbols in the hash table - for(u32 i=0; i>8; - } - void count1Inc(u32 pos1) { - if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) - count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... - } - void count2Inc(u32 pos1, u32 pos2) { - if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). 
This means (high > 0) <=> (cnt > 0) - // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively - count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample) - } - u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range - // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros - u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7] - - u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes - high = (high >> (zero << 3)) & 255; // advance to nonzero counter - if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 - return 0; // all zero - - u32 low = count1Low[pos1]; - if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) - return (u32) ((high << 8) + low); - } - u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range - // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros - u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15] - high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters - - u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters - high = (high >> (zero << 2)) & 15; // advance to nonzero counter - if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! 
advance pos2 - return 0UL; // all zero - - u32 low = count2Low[pos1][pos2]; - if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) - return (u32) ((high << 8) + low); - } - void backup1(u8 *buf) { - memcpy(buf, count1High, FSST_CODE_MAX); - memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX); - } - void restore1(u8 *buf) { - memcpy(count1High, buf, FSST_CODE_MAX); - memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); - } -}; -#endif - - -#define FSST_BUFSZ (3<<19) // 768KB - -// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression -struct Encoder { - shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) - union { - Counters counters; // for counting symbol occurences during map construction - u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) - }; -}; - -// job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) -struct SIMDjob { - u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) -}; - -extern bool -fsst_hasAVX512(); // runtime check for avx512 capability - -extern size_t -fsst_compressAVX512( - SymbolTable &symbolTable, - u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) - u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) - SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. - SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer. 
- size_t n, // IN: size of arrays input and output (should be max 512) - size_t unroll); // IN: degree of SIMD unrolling - -// Symbol manipulation -Symbol concat(Symbol a, Symbol b); - -// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) -size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); -size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); -} // namespace libfsst diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 7ba1f4f876b..ad84ffc98ba 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -66,6 +66,8 @@ ARROW_CARES_BUILD_VERSION=1.17.2 ARROW_CARES_BUILD_SHA256_CHECKSUM=4803c844ce20ce510ef0eb83f8ea41fa24ecaae9d280c468c582d2bb25b3913d ARROW_CRC32C_BUILD_VERSION=1.1.2 ARROW_CRC32C_BUILD_SHA256_CHECKSUM=ac07840513072b7fcebda6e821068aa04889018f24e10e46181068fb214d7e56 +ARROW_FSST_BUILD_VERSION=89f49c580c6388acf3b6ed2a49e1bfde6c05e616 +ARROW_FSST_BUILD_SHA256_CHECKSUM=5921d2b837800e1c886903d18971994587328b3e5d08d32eb8e80c00890c304d ARROW_GBENCHMARK_BUILD_VERSION=v1.8.3 ARROW_GBENCHMARK_BUILD_SHA256_CHECKSUM=6bc180a57d23d4d9515519f92b0c83d61b05b5bab188961f36ac7b06b0d9e9ce ARROW_GFLAGS_BUILD_VERSION=v2.2.2 @@ -144,6 +146,7 @@ DEPENDENCIES=( "ARROW_BZIP2_URL bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" "ARROW_CARES_URL cares-${ARROW_CARES_BUILD_VERSION}.tar.gz https://github.com/c-ares/c-ares/releases/download/cares-${ARROW_CARES_BUILD_VERSION//./_}/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" "ARROW_CRC32C_URL crc32c-${ARROW_CRC32C_BUILD_VERSION}.tar.gz https://github.com/google/crc32c/archive/refs/tags/${ARROW_CRC32C_BUILD_VERSION}.tar.gz" + "ARROW_FSST_URL 
fsst-${ARROW_FSST_BUILD_VERSION}.tar.gz https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz" "ARROW_GBENCHMARK_URL gbenchmark-${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz https://github.com/google/benchmark/archive/${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz" "ARROW_GFLAGS_URL gflags-${ARROW_GFLAGS_BUILD_VERSION}.tar.gz https://github.com/gflags/gflags/archive/${ARROW_GFLAGS_BUILD_VERSION}.tar.gz" "ARROW_GLOG_URL glog-${ARROW_GLOG_BUILD_VERSION}.tar.gz https://github.com/google/glog/archive/${ARROW_GLOG_BUILD_VERSION}.tar.gz" From 68cc6f236b0f3a8f1f4996800d556207827fe2aa Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 08:36:45 +0000 Subject: [PATCH 04/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 8d67849ea47..2ce366f6c9a 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2624,15 +2624,16 @@ function(build_fsst) URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") prepare_fetchcontent() - fetchcontent_makeavailable(fsst) + fetchcontent_getproperties(fsst) + if(NOT fsst_POPULATED) + fetchcontent_populate(fsst) + endif() - set(ARROW_FSST_INCLUDE_DIR - "${fsst_SOURCE_DIR}" - CACHE INTERNAL "FSST include directory") + set(ARROW_FSST_INCLUDE_DIR "${fsst_SOURCE_DIR}" PARENT_SCOPE) set(ARROW_FSST_SOURCES "${fsst_SOURCE_DIR}/libfsst.cpp;${fsst_SOURCE_DIR}/fsst_avx512.cpp" - CACHE INTERNAL "FSST source files") - set(FSST_VENDORED TRUE CACHE INTERNAL "Whether FSST is built from source") + PARENT_SCOPE) + set(FSST_VENDORED TRUE PARENT_SCOPE) endfunction() if(ARROW_WITH_FSST) From 05506fab80de3293dc1363366694e9c2792c4ecd Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 08:54:13 +0000 Subject: [PATCH 05/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 3 +++ 1 file changed, 3 
insertions(+) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 2ce366f6c9a..ada1654484a 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2637,6 +2637,9 @@ function(build_fsst) endfunction() if(ARROW_WITH_FSST) + if("${fsst_SOURCE}" STREQUAL "") + set(fsst_SOURCE "BUNDLED") + endif() resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() From 5935b9e47abba492a5f013c7aa3b1cfb53ffbf61 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 09:31:16 +0000 Subject: [PATCH 06/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ada1654484a..ceb2c20ac49 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2637,9 +2637,7 @@ function(build_fsst) endfunction() if(ARROW_WITH_FSST) - if("${fsst_SOURCE}" STREQUAL "") - set(fsst_SOURCE "BUNDLED") - endif() + set(fsst_SOURCE "BUNDLED" CACHE STRING "Source of fsst dependency" FORCE) resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() From ff3b8f73ebfab93a30333d6a8bd8c00fb85d5593 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 10:32:46 +0000 Subject: [PATCH 07/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ceb2c20ac49..245868948b8 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -73,6 +73,8 @@ set(ARROW_THIRDPARTY_DEPENDENCIES ZLIB zstd) +set(fsst_SOURCE "BUNDLED" CACHE STRING "Source of fsst dependency") + # For backward compatibility. 
We use "BOOST_SOURCE" if "Boost_SOURCE" # isn't specified and "BOOST_SOURCE" is specified. # We renamed "BOOST" dependency name to "Boost" in 3.0.0 because @@ -2637,7 +2639,9 @@ function(build_fsst) endfunction() if(ARROW_WITH_FSST) - set(fsst_SOURCE "BUNDLED" CACHE STRING "Source of fsst dependency" FORCE) + if(NOT fsst_SOURCE STREQUAL "BUNDLED") + message(FATAL_ERROR "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED.") + endif() resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() From f5e2bdba8fbd6e5a21bc786d2474108a0873cf8c Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 11:12:49 +0000 Subject: [PATCH 08/16] update --- cpp/src/parquet/CMakeLists.txt | 4 ++++ cpp/src/parquet/encoding_test.cc | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index a2d300d684d..66c9d08cb76 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -205,6 +205,10 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) list(APPEND PARQUET_PRIVATE_INCLUDE_DIRS ${ARROW_FSST_INCLUDE_DIR}) list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) endif() +if(DEFINED ARROW_FSST_SOURCES) + set_source_files_properties(${ARROW_FSST_SOURCES} + PROPERTIES COMPILE_OPTIONS "-Wno-error=shorten-64-to-32") +endif() if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. 
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index a43bafd0960..ebc86f14853 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2886,7 +2886,7 @@ TEST(TestFsstEncoding, HeavyNullsDecodeSpaced) { } writer.Finish(); - const int null_count = binary.null_count(); + const int null_count = static_cast(binary.null_count()); ASSERT_EQ(static_cast(values->length() - null_count), decoder->DecodeSpaced(decoded.data(), static_cast(values->length()), null_count, valid_bits.data(), 0)); From 7c105d4d04b56101a1294971f170d90cc5dd38f1 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 11:47:43 +0000 Subject: [PATCH 09/16] update --- cpp/src/parquet/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 66c9d08cb76..38eb0dc59f1 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -207,7 +207,9 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) endif() if(DEFINED ARROW_FSST_SOURCES) set_source_files_properties(${ARROW_FSST_SOURCES} - PROPERTIES COMPILE_OPTIONS "-Wno-error=shorten-64-to-32") + PROPERTIES + COMPILE_OPTIONS + "$<$:-Wno-error=shorten-64-to-32>") endif() if(ARROW_HAVE_RUNTIME_AVX2) From 511776f0b43697af94d12de760061a107c99368a Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 14:22:07 +0000 Subject: [PATCH 10/16] update --- cpp/src/parquet/CMakeLists.txt | 9 +++++---- cpp/src/parquet/fsst_compat.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 cpp/src/parquet/fsst_compat.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 38eb0dc59f1..e2b1db09633 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -206,10 +206,11 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) endif() if(DEFINED 
ARROW_FSST_SOURCES) - set_source_files_properties(${ARROW_FSST_SOURCES} - PROPERTIES - COMPILE_OPTIONS - "$<$:-Wno-error=shorten-64-to-32>") + set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS + "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" + "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" + "$<$:/wd4244>" + "$<$,$>>:-include${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") endif() if(ARROW_HAVE_RUNTIME_AVX2) diff --git a/cpp/src/parquet/fsst_compat.h b/cpp/src/parquet/fsst_compat.h new file mode 100644 index 00000000000..2da27d27301 --- /dev/null +++ b/cpp/src/parquet/fsst_compat.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +// Provide minimal compatibility shims so third-party FSST sources +// can be compiled with the compilers Arrow supports. + +#if defined(_WIN32) && !defined(_MSC_VER) +#include + +// MinGW does not provide __cpuidex, but FSST only needs the CPUID +// leaf/sub-leaf variant that __cpuid_count implements. 
+static inline void arrow_fsst_cpuidex(int info[4], int function_id, int subfunction_id) { + __cpuid_count(function_id, subfunction_id, info[0], info[1], info[2], info[3]); +} +#define __cpuidex(info, function_id, subfunction_id) \ + arrow_fsst_cpuidex(info, function_id, subfunction_id) +#endif From 0344d45478bf593af40c9c2543065ed6014dbd41 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 14:41:34 +0000 Subject: [PATCH 11/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 245868948b8..39bda3d2668 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -647,8 +647,7 @@ if(DEFINED ENV{ARROW_FSST_URL}) set(FSST_SOURCE_URL "$ENV{ARROW_FSST_URL}") else() set_urls(FSST_SOURCE_URL - "https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz" - "${THIRDPARTY_MIRROR_URL}/fsst-${ARROW_FSST_BUILD_VERSION}.tar.gz") + "https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz") endif() if(DEFINED ENV{ARROW_GBENCHMARK_URL}) From f9b43c48f2a1b596cb080378dc7fe146a7f30e80 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 15:52:53 +0000 Subject: [PATCH 12/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 26 +++++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 39bda3d2668..33db56e1969 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -643,11 +643,14 @@ else() ) endif() +set(FSST_SOURCE_URL "") +set(FSST_GIT_REPOSITORY "") if(DEFINED ENV{ARROW_FSST_URL}) set(FSST_SOURCE_URL "$ENV{ARROW_FSST_URL}") +elseif(DEFINED ENV{ARROW_FSST_GIT_REPOSITORY}) + set(FSST_GIT_REPOSITORY "$ENV{ARROW_FSST_GIT_REPOSITORY}") else() - 
set_urls(FSST_SOURCE_URL - "https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz") + set(FSST_GIT_REPOSITORY "https://github.com/cwida/fsst.git") endif() if(DEFINED ENV{ARROW_GBENCHMARK_URL}) @@ -2620,9 +2623,22 @@ endif() function(build_fsst) message(STATUS "Building FSST from source using FetchContent") - fetchcontent_declare(fsst - URL ${FSST_SOURCE_URL} - URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") + if(FSST_SOURCE_URL) + fetchcontent_declare(fsst + ${FC_DECLARE_COMMON_OPTIONS} + URL ${FSST_SOURCE_URL} + URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") + else() + if(NOT FSST_GIT_REPOSITORY) + message(FATAL_ERROR "FSST_GIT_REPOSITORY is not set and no FSST_SOURCE_URL override was provided.") + endif() + fetchcontent_declare(fsst + ${FC_DECLARE_COMMON_OPTIONS} + GIT_REPOSITORY ${FSST_GIT_REPOSITORY} + GIT_TAG ${ARROW_FSST_BUILD_VERSION} + GIT_SHALLOW TRUE + GIT_PROGRESS TRUE) + endif() prepare_fetchcontent() fetchcontent_getproperties(fsst) From c49f19c771eaee4d46a56eaecde0df625adfd58f Mon Sep 17 00:00:00 2001 From: arnavb Date: Tue, 25 Nov 2025 07:31:32 +0000 Subject: [PATCH 13/16] vendor fsst --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 39 +- cpp/thirdparty/fsst/fsst.h | 227 +++++++ cpp/thirdparty/fsst/fsst_avx512.cpp | 150 +++++ cpp/thirdparty/fsst/fsst_avx512_unroll1.inc | 57 ++ cpp/thirdparty/fsst/fsst_avx512_unroll2.inc | 114 ++++ cpp/thirdparty/fsst/fsst_avx512_unroll3.inc | 171 +++++ cpp/thirdparty/fsst/fsst_avx512_unroll4.inc | 228 +++++++ cpp/thirdparty/fsst/libfsst.cpp | 651 ++++++++++++++++++++ cpp/thirdparty/fsst/libfsst.hpp | 471 ++++++++++++++ 9 files changed, 2072 insertions(+), 36 deletions(-) create mode 100644 cpp/thirdparty/fsst/fsst.h create mode 100644 cpp/thirdparty/fsst/fsst_avx512.cpp create mode 100644 cpp/thirdparty/fsst/fsst_avx512_unroll1.inc create mode 100644 cpp/thirdparty/fsst/fsst_avx512_unroll2.inc create mode 100644 cpp/thirdparty/fsst/fsst_avx512_unroll3.inc create mode 100644 
cpp/thirdparty/fsst/fsst_avx512_unroll4.inc create mode 100644 cpp/thirdparty/fsst/libfsst.cpp create mode 100644 cpp/thirdparty/fsst/libfsst.hpp diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 33db56e1969..c0c84511c1d 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -643,16 +643,6 @@ else() ) endif() -set(FSST_SOURCE_URL "") -set(FSST_GIT_REPOSITORY "") -if(DEFINED ENV{ARROW_FSST_URL}) - set(FSST_SOURCE_URL "$ENV{ARROW_FSST_URL}") -elseif(DEFINED ENV{ARROW_FSST_GIT_REPOSITORY}) - set(FSST_GIT_REPOSITORY "$ENV{ARROW_FSST_GIT_REPOSITORY}") -else() - set(FSST_GIT_REPOSITORY "https://github.com/cwida/fsst.git") -endif() - if(DEFINED ENV{ARROW_GBENCHMARK_URL}) set(GBENCHMARK_SOURCE_URL "$ENV{ARROW_GBENCHMARK_URL}") else() @@ -2621,34 +2611,11 @@ if(ARROW_USE_XSIMD) endif() function(build_fsst) - message(STATUS "Building FSST from source using FetchContent") - - if(FSST_SOURCE_URL) - fetchcontent_declare(fsst - ${FC_DECLARE_COMMON_OPTIONS} - URL ${FSST_SOURCE_URL} - URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") - else() - if(NOT FSST_GIT_REPOSITORY) - message(FATAL_ERROR "FSST_GIT_REPOSITORY is not set and no FSST_SOURCE_URL override was provided.") - endif() - fetchcontent_declare(fsst - ${FC_DECLARE_COMMON_OPTIONS} - GIT_REPOSITORY ${FSST_GIT_REPOSITORY} - GIT_TAG ${ARROW_FSST_BUILD_VERSION} - GIT_SHALLOW TRUE - GIT_PROGRESS TRUE) - endif() - - prepare_fetchcontent() - fetchcontent_getproperties(fsst) - if(NOT fsst_POPULATED) - fetchcontent_populate(fsst) - endif() + message(STATUS "Configuring vendored FSST sources") - set(ARROW_FSST_INCLUDE_DIR "${fsst_SOURCE_DIR}" PARENT_SCOPE) + set(ARROW_FSST_INCLUDE_DIR "${ARROW_SOURCE_DIR}/thirdparty/fsst" PARENT_SCOPE) set(ARROW_FSST_SOURCES - "${fsst_SOURCE_DIR}/libfsst.cpp;${fsst_SOURCE_DIR}/fsst_avx512.cpp" + 
"${ARROW_SOURCE_DIR}/thirdparty/fsst/libfsst.cpp;${ARROW_SOURCE_DIR}/thirdparty/fsst/fsst_avx512.cpp" PARENT_SCOPE) set(FSST_VENDORED TRUE PARENT_SCOPE) endfunction() diff --git a/cpp/thirdparty/fsst/fsst.h b/cpp/thirdparty/fsst/fsst.h new file mode 100644 index 00000000000..71085d57201 --- /dev/null +++ b/cpp/thirdparty/fsst/fsst.h @@ -0,0 +1,227 @@ +/* + * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 + * + * =================================================================================================================================== + * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): + * + * Copyright 2018-2020, CWI, TU Munich, FSU Jena + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst + * =================================================================================================================================== + * + * FSST: Fast Static Symbol Table compression + * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf + * + * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. + * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual + * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is + * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. + * + * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) + * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression + * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could + * be seen as strings again and fit in whatever your program is that manipulates strings. + * + * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. + * + * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of + * unsigned char* pointers to their starts. A seperate length array (of unsigned int) denotes how many bytes each string consists of. 
+ * + * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program + * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings). + * + * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are + * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length. + */ +#ifndef FSST_INCLUDED_H +#define FSST_INCLUDED_H + +#ifdef _MSC_VER +#define __restrict__ +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#define __ORDER_LITTLE_ENDIAN__ 2 +#include +static inline int __builtin_ctzl(unsigned long long x) { + unsigned long ret; + _BitScanForward64(&ret, x); + return (int)ret; +} +#endif + +#ifdef __cplusplus +#define FSST_FALLTHROUGH [[fallthrough]] +#include +extern "C" { +#else +#define FSST_FALLTHROUGH +#endif + +#include + +/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */ +#define FSST_ESC 255 + +/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */ +typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */ + +/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */ +typedef struct { + unsigned long long version; /* version id */ + unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */ + unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). */ + unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). 
*/ +} fsst_decoder_t; + +/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */ +fsst_encoder_t* +fsst_create( + size_t n, /* IN: number of strings in batch to sample from. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: string start pointers. */ + int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */ +); + +/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */ +fsst_encoder_t* +fsst_duplicate( + fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */ +); + +#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */ + +/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */ +unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */ +fsst_export( + fsst_encoder_t *encoder, /* IN: the symbol table to dump. */ + unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */ +); + +/* Deallocate encoder. */ +void +fsst_destroy(fsst_encoder_t*); + +/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */ +unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */ +fsst_import( + fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */ + unsigned char const *buf /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */ +); + +/* Return a decoder structure from an encoder. */ +fsst_decoder_t +fsst_decoder( + fsst_encoder_t *encoder +); + +/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). 
*/ +/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */ +size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */ +fsst_compress( + fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */ + size_t nstrings, /* IN: number of strings in batch to compress. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: input string start pointers. */ + size_t outsize, /* IN: byte-length of output buffer. */ + unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */ + size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */ + unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */ +); + +/* Decompress a single string, inlined for speed. */ +inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */ +fsst_decompress( + const fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */ + size_t lenIn, /* IN: byte-length of compressed string. */ + const unsigned char *strIn, /* IN: compressed string. */ + size_t size, /* IN: byte-length of output buffer. */ + unsigned char *output /* OUT: memory buffer to put the decompressed string in. 
*/ +) { + unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len; + unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output; + unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol; + size_t code, posOut = 0, posIn = 0; +#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */ +#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long)) +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + while (posOut+32 <= size && posIn+4 <= lenIn) { + unsigned int nextBlock, escapeMask; + memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int)); + escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); + if (escapeMask == 0) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; + switch(firstEscapePos) { /* Duff's device */ + case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ + } + } + } + if (posOut+32 <= size) { // handle the possibly 3 last bytes without a loop + if (posIn+2 <= lenIn) 
{ + strOut[posOut] = strIn[posIn+1]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + posIn += 2; strOut[posOut++] = strIn[posIn-1]; + } + } else { + posIn += 2; posOut++; + } + } + if (posIn < lenIn) { // last code cannot be an escape + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } + } +#else + while (posOut+8 <= size && posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */ + FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */ + posOut += len[code]; + } else { + strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */ + posIn++; posOut++; + } +#endif +#endif + while (posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { + size_t posWrite = posOut, endWrite = posOut + len[code]; + unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite; + if ((posOut = endWrite) > size) endWrite = size; + for(; posWrite < endWrite; posWrite++) /* only write if there is room */ + strOut[posWrite] = symbolPointer[posWrite]; + } else { + if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */ + posIn++; posOut++; + } + if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0; + return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */ +} + +#ifdef __cplusplus +} +#endif +#endif /* FSST_INCLUDED_H */ diff --git a/cpp/thirdparty/fsst/fsst_avx512.cpp b/cpp/thirdparty/fsst/fsst_avx512.cpp new file mode 100644 index 00000000000..150683d2f9a --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512.cpp @@ -0,0 +1,150 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 
2018-2020, CWI, TU Munich, FSU Jena
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
+// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
+// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
+#include "libfsst.hpp"
+
+#if defined(__x86_64__) || defined(_M_X64)
+#include <immintrin.h>
+
+#ifdef _WIN32
+namespace libfsst {
+bool fsst_hasAVX512() {
+  int info[4];
+  __cpuidex(info, 0x00000007, 0);
+  return (info[1]>>16)&1;
+}
+} // namespace libfsst
+#else
+#include <cpuid.h>
+namespace libfsst {
+bool fsst_hasAVX512() {
+  int info[4];
+  __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]);
+  return (info[1]>>16)&1;
+}
+} // namespace libfsst
+#endif
+#else
+namespace libfsst {
+bool fsst_hasAVX512() { return false; }
+} // namespace libfsst
+#endif
+
+namespace libfsst {
+
+// BULK COMPRESSION OF STRINGS
+//
+// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. 
+// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up. +// +// In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4 +// unroll3 performs best on my hardware +// +// In the worst case, each final encoded string occupies 512KB bytes (512*1024; with 1024=512xexception, exception = 2 bytes). +// - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs). +// +// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits) +// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order. +// +// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings. +// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process. +// - so 'processed' is the amount of strings we started processing and it is between [480,512]. +// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished. +// - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method. +// +// Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings. +// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings. +// +// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to. +// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch). 
+// +// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS +// in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_* expressions however, +// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them. +// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. +// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. + +size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { + size_t processed = 0; + // define some constants (all_x means that all 8 lanes contain 64-bits value X) +#ifdef __AVX512F__ + //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c + __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); + __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); + __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); +#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) +#define all_ONE _mm512_srli_epi64(all_MASK, 63) +#define all_M19 _mm512_srli_epi64(all_MASK, 45) +#define all_M18 _mm512_srli_epi64(all_MASK, 46) +#define all_M28 _mm512_srli_epi64(all_MASK, 36) +#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) +#define all_FFFF _mm512_srli_epi64(all_MASK, 48) +#define all_FF _mm512_srli_epi64(all_MASK, 56) + + SIMDjob *inputEnd = input+n; + assert(n >= unroll*8 && n <= 512); // should be close to 512 + __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 + __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 
2b loaded new strings bitmask per unroll + u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll + + if (unroll >= 4) { + while (input+delta1+delta2+delta3+delta4 < inputEnd) { + #include "fsst_avx512_unroll4.inc" + } + } else if (unroll == 3) { + while (input+delta1+delta2+delta3 < inputEnd) { + #include "fsst_avx512_unroll3.inc" + } + } else if (unroll == 2) { + while (input+delta1+delta2 < inputEnd) { + #include "fsst_avx512_unroll2.inc" + } + } else { + while (input+delta1 < inputEnd) { + #include "fsst_avx512_unroll1.inc" + } + } + + // flush the job states of the unfinished strings at the end of output[] + processed = n - (inputEnd - input); + u32 unfinished = 0; + if (unroll > 1) { + if (unroll > 2) { + if (unroll > 3) { + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); + unfinished += _mm_popcnt_u32((int) loadmask4); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); + unfinished += _mm_popcnt_u32((int) loadmask3); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); + unfinished += _mm_popcnt_u32((int) loadmask2); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); +#else + (void) symbolTable; + (void) codeBase; + (void) symbolBase; + (void) input; + (void) output; + (void) n; + (void) unroll; +#endif + return processed; +} +} // namespace libfsst + diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc new file mode 100644 index 00000000000..f4b81c7970d --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc new file mode 100644 index 00000000000..aa33cd7e69c --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc @@ -0,0 +1,114 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) 
loadmask2); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc new file mode 100644 index 00000000000..e2057032abd --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc @@ -0,0 +1,171 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the 
Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
+ __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). 
+ __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
+ write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, 
i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job 
state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc new file mode 100644 index 00000000000..15cca7c938b --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc @@ -0,0 +1,228 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// 
(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E4PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask4=11111111, delta4=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + job4 = _mm512_mask_expandloadu_epi64(job4, loadmask4, input); input += delta4; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + __m512i word4 = _mm512_i64gather_epi64(_mm512_srli_epi64(job4, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. 
It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code4: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code4 = _mm512_i64gather_epi64(_mm512_and_epi64(word4, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + __m512i pos4 = _mm512_mullo_epi64(_mm512_and_epi64(word4, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + // hash them into a random number: pos4 = pos4*PRIME; pos4 ^= pos4>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + pos4 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos4,_mm512_srli_epi64(pos4,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). 
+ // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write4 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + __m512i write4 = _mm512_slli_epi64(_mm512_and_epi64(word4, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl4 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + pos4 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl4, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + __mmask8 match4 = _mm512_cmpeq_epi64_mask(symb4, _mm512_and_epi64(word4, pos4)) & _mm512_cmplt_epi64_mask(icl4, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. 
+ code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + code4 = _mm512_mask_mov_epi64(code4, match4, _mm512_srli_epi64(icl4, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + write4 = _mm512_or_epi64(write4, _mm512_and_epi64(code4, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + code4 = _mm512_and_epi64(code4, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job4, all_M19), write4, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job4.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + job4 = _mm512_add_epi64(job4, _mm512_slli_epi64(_mm512_srli_epi64(code4, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job4.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = 
_mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + job4 = _mm512_add_epi64(job4, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code4, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + // test which lanes are done now (job4.cur==job4.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job4 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + loadmask4 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job4, 46), _mm512_and_epi64(_mm512_srli_epi64(job4, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + // calculate the amount of lanes in job4 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + delta4 = _mm_popcnt_u32((int) loadmask4); + // write out the job state for the lanes that are done (we 
need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job4.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; + _mm512_mask_compressstoreu_epi64(output, loadmask4, job4); output += delta4; diff --git a/cpp/thirdparty/fsst/libfsst.cpp b/cpp/thirdparty/fsst/libfsst.cpp new file mode 100644 index 00000000000..e3ba787b959 --- /dev/null +++ b/cpp/thirdparty/fsst/libfsst.cpp @@ -0,0 +1,651 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include "libfsst.hpp" + +namespace libfsst { +Symbol concat(Symbol a, Symbol b) { + Symbol s; + u32 length = a.length()+b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.store_num((b.load_num() << (8*a.length())) | a.load_num()); + return s; +} +} // namespace libfsst + +namespace std { +template <> +class hash { + public: + size_t operator()(const libfsst::QSymbol& q) const { + uint64_t k = q.symbol.load_num(); + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + uint64_t h = 0x8445d61a4e774912 ^ (8*m); + k *= m; + k ^= k >> r; + k *= m; + h ^= k; + h *= m; + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; + } +}; +} + +namespace libfsst { +bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } + +std::ostream& operator<<(std::ostream& out, const Symbol& s) { + for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { + SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); + int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) + size_t sampleFrac = 128; + + // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator + st->zeroTerminated = zeroTerminated; + if (zeroTerminated) { + st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency + } else { + u16 byteHisto[256]; + memset(byteHisto, 0, sizeof(byteHisto)); + for(size_t i=0; iterminator = 256; + while(i-- > 0) { + if (byteHisto[i] > minSize) continue; + st->terminator = i; + minSize = byteHisto[i]; + } + } + assert(st->terminator != 256); + + // a random number between 0 and 128 + auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; + + // compress sample, and compute (pair-)frequencies + auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain + int gain = 0; + + for(size_t i=0; i sampleFrac) continue; + } + if (cur < end) { + u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); + cur += st->symbols[code1].length(); + gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); + while (true) { + // count single symbol (i.e. an option is not extending it) + counters.count1Inc(code1); + + // as an alternative, consider just using the next byte.. + if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly + counters.count1Inc(*start); + + if (cur==end) { + break; + } + + // now match a new symbol + start = cur; + if (curhashTabSize-1); + Symbol s = st->hashTab[idx]; + code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) { + code2 = s.code(); + cur += s.length(); + } else if (code2 >= FSST_CODE_BASE) { + cur += 2; + } else { + code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; + cur += 1; + } + } else { + code2 = st->findLongestSymbol(cur, end); + cur += st->symbols[code2].length(); + } + + // compute compressed output size + gain += ((int) (cur-start))-(1+isEscapeCode(code2)); + + if (sampleFrac < 128) { // no need to count pairs in final round + // consider the symbol that is the concatenation of the two last symbols + counters.count2Inc(code1, code2); + + // as an alternative, consider just extending with the next byte.. + if ((cur-start) > 1) // ..but do not count single byte extensions doubly + counters.count2Inc(code1, *start); + } + code1 = code2; + } + } + } + return gain; + }; + + auto makeTable = [&](SymbolTable *st, Counters &counters) { + // hashmap of c (needed because we can generate duplicate candidates) + unordered_set cands; + + // artificially make terminater the most frequent symbol so it gets included + u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; + counters.count1Set(terminator,65535); + + auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { + if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! 
+ QSymbol q; + q.symbol = s; + q.gain = count * s.length(); + auto it = cands.find(q); + if (it != cands.end()) { + q.gain += (*it).gain; + cands.erase(*it); + } + cands.insert(q); + }; + + // add candidate symbols based on counted frequency + for (u32 pos1=0; pos1nSymbols; pos1++) { + u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! + if (!cnt1) continue; + + // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed + Symbol s1 = st->symbols[pos1]; + addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); + + if (sampleFrac >= 128 || // last round we do not create new (combined) symbols + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte + continue; + } + for (u32 pos2=0; pos2nSymbols; pos2++) { + u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! + if (!cnt2) continue; + + // create a new symbol + Symbol s2 = st->symbols[pos2]; + Symbol s3 = concat(s1, s2); + if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + addOrInc(cands, s3, cnt2); + } + } + + // insert candidates into priority queue (by gain) + auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); }; + priority_queue,decltype(cmpGn)> pq(cmpGn); + for (auto& q : cands) + pq.push(q); + + // Create new symbol map using best candidates + st->clear(); + while (st->nSymbols < 255 && !pq.empty()) { + QSymbol q = pq.top(); + pq.pop(); + st->add(q.symbol); + } + }; + + u8 bestCounters[512*sizeof(u16)]; +#ifdef NONOPT_FSST + for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { + sampleFrac = frac; +#else + for(sampleFrac=8; true; sampleFrac += 30) { +#endif + memset(&counters, 0, sizeof(Counters)); + long gain = compressCount(st, counters); + if (gain >= bestGain) 
{ // a new best solution! + counters.backup1(bestCounters); + *bestTable = *st; bestGain = gain; + } + if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) + makeTable(st, counters); + } + delete st; + counters.restore1(bestCounters); + makeTable(bestTable, counters); + bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression + return bestTable; +} + +#ifndef NONOPT_FSST +static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { + size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; + u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings + SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer + SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) + size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) + + while (curLine < nlines && outOff <= (1<<19)) { + size_t prevLine = curLine, chunk, curOff = 0; + + // bail out if the output buffer cannot hold the compressed next string fully + if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 + else budget -= (len[curLine]-curOff)*2; + + strOut[curLine] = (u8*) 0; + lenOut[curLine] = 0; + + do { + do { + chunk = len[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // large strings need to be chopped up into segments of 511 bytes + } + // create a job in this batch + SIMDjob job; + job.cur = inOff; + job.end = job.cur + chunk; + job.pos = batchPos; + job.out = outOff; + + // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) + outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) 
bytes. + if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while(curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines) || (((len[curLine])*2 + 7) > budget)) { // cannot accumulate more? + if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + 
*out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; cur += 2; + } else { + // 1 byte code or miss. + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse + cur++; + } + } + } + }; + + for(curLine=0; curLine 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + } + if ((2*chunk+7) > (size_t) (lim-out)) { + return curLine; // out of memory + } + // copy the string to the 511-byte buffer + memcpy(buf, cur, chunk); + buf[chunk] = (u8) symbolTable.terminator; + cur = buf; + end = cur + chunk; + + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true,false); + } else if (avoidBranch) { + compressVariant(false,true); + } else { + compressVariant(false, false); + } + } while((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; +} + +#define FSST_SAMPLELINE ((size_t) 512) + +// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes +vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { + size_t totSize = 0; + const size_t *lenIn = *lenRef; + vector sample; + + for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t*) encoder; +} + +/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol 
table */ +extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t*) e; +} + +// export a symbol table in compact format. +extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder*) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. + // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. 
+ // + // The version field is now there just for future-proofness, but not used yet + + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction + u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 + (((u64) e->symbolTable->suffixLim) << 24) | + (((u64) e->symbolTable->terminator) << 16) | + (((u64) e->symbolTable->nSymbols) << 8) | + FSST_ENDIAN_MARKER; // least significant byte is nonzero + + version = swap64_if_be(version); // ensure version is little-endian encoded + + /* do not assume unaligned reads here */ + memcpy(buf, &version, 8); + buf[8] = e->symbolTable->zeroTerminated; + for(u32 i=0; i<8; i++) + buf[9+i] = (u8) e->symbolTable->lenHisto[i]; + u32 pos = 17; + + // emit only the used bytes of the symbols + for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) + for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) + buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes + + return pos; // length of what was serialized +} + +#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ + +extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 const *buf) { + u64 version = 0; + u32 code, pos = 17; + u8 lenHisto[8]; + + // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) + memcpy(&version, buf, 8); + version = swap64_if_be(version); // version is always little-endian encoded + + if ((version>>32) != FSST_VERSION) return 0; + decoder->zeroTerminated = buf[8]&1; + memcpy(lenHisto, buf+9, 8); + + // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) + decoder->len[0] = 1; + decoder->symbol[0] = 0; + + // we use lenHisto[0] as 1-byte symbol run length (at the end) + code = decoder->zeroTerminated; + if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end 
+ + // now get all symbols from the buffer + for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ + for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { + decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ + decoder->symbol[code] = 0; + for(u32 j=0; jlen[code]; j++) + ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols + } + } + if (decoder->zeroTerminated) lenHisto[0]++; + + // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). + while(code<255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; +} + +// runtime check for simd +inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { +#ifndef NONOPT_FSST + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +#endif + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); +} +size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} + +// adaptive choosing of scalar compression method based on symbol length histogram +inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if 
((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} +size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +} +} // namespace libfsst + +using namespace libfsst; +// the main compression function (everything automatic) +extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { + // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) + size_t totLen = accumulate(lenIn, lenIn+nlines, 0); + int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); + return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); +} + +/* deallocate encoder */ +extern "C" void fsst_destroy(fsst_encoder_t* encoder) { + Encoder *e = (Encoder*) encoder; + delete e; +} + +/* very lazy implementation relying on export and import */ +extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { + u8 buf[sizeof(fsst_decoder_t)]; + u32 cnt1 = fsst_export(encoder, buf); + fsst_decoder_t decoder; + u32 cnt2 = fsst_import(&decoder, buf); + assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; + return decoder; +} diff --git a/cpp/thirdparty/fsst/libfsst.hpp b/cpp/thirdparty/fsst/libfsst.hpp new file mode 100644 index 00000000000..f61bc0175b6 --- /dev/null +++ b/cpp/thirdparty/fsst/libfsst.hpp @@ -0,0 +1,471 @@ +// this software is distributed under the 
MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "fsst.h" // the official FSST API -- also usable by C mortals + +/* unsigned integers */ +namespace libfsst { +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +} // namespace libfsst + +#if UINTPTR_MAX == 0xffffffffU +// We're on a 32-bit platform +#define NONOPT_FSST +#endif + +#define FSST_ENDIAN_MARKER ((u64) 1) +#define FSST_VERSION_20190218 20190218 +#define FSST_VERSION ((u64) FSST_VERSION_20190218) + +// "symbols" are character sequences (up to 8 bytes) +// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: +// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. + +// we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length +#define FSST_LEN_BITS 12 +#define FSST_CODE_BITS 9 +#define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ +#define FSST_CODE_MAX (1UL<=8) { + len = 8; + memcpy(val.str, input, 8); + } else { + memcpy(val.str, input, len); + } + set_code_len(FSST_CODE_MAX, len); + } + void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + + u64 load_num() const { return swap64_if_be(val.num); } + void store_num(u64 v) { val.num = swap64_if_be(v); } + + u32 length() const { return (u32) (icl >> 28); } + u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } + u32 ignoredBits() const { return (u32) icl; } + + u8 first() const { assert( length() >= 1); return 0xFF & load_num(); } + u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } + +#define FSST_HASH_LOG2SIZE 10 +#define FSST_HASH_PRIME 2971215073LL +#define FSST_SHIFT 15 +#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) + size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes +}; + +// Symbol that can be put in a queue, ordered on gain +struct QSymbol{ + Symbol symbol; + mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols + bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } +}; + +// we construct FSST symbol tables using a random sample of about 16KB (1<<14) +#define FSST_SAMPLETARGET (1<<14) +#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) + +// two phases of compression, before and after optimize(): +// +// (1) to encode values we probe (and maintain) three datastructures: +// - u16 byteCodes[256] array at the position of the next byte (s.length==1) +// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) +// - Symbol 
hashtable[1024] (keyed by the next three bytes, ie for s.length>2), +// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are +// pseudo codes representing a single byte these will become escapes) +// +// (2) when we finished looking for the best symbol table we call optimize() to reshape it: +// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) +// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes +// (allows shortcut during compression) +// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding +// to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one +// byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] +// and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction +// is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was +// hence added to make symbolTable construction faster. 
+// +// this final layout allows for the fastest compression code, only currently present in compressBulk + +// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 +#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket + +// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key +// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster +// +// the gain field is only used in the symbol queue that sorts symbols on gain + +struct SymbolTable { + static const u32 hashTabSize = 1<> (u8) s.icl)); + return true; + } + bool add(Symbol s) { + assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); + u32 len = s.length(); + s.set_code_len(FSST_CODE_BASE + nSymbols, len); + if (len == 1) { + byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + } + if (s.length() >= 2) { + u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; + if (code >= FSST_CODE_BASE) return code; + } + return byteCodes[s.first()] & FSST_CODE_MASK; + } + u16 findLongestSymbol(const u8* cur, const u8* end) const { + return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol + } + + // rationale for finalize: + // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() + // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() + // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) + // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher 
compression perf. + // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). + // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] + // Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte + // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. + // + // In all, we change the layout and coding, as follows.. + // + // before finalize(): + // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 + // - The first 256 codes are pseudo symbols (all escaped bytes) + // + // after finalize(): + // - table layout is symbols[0..nSymbols>, with nSymbols < 256. + // - Real codes are [0,nSymbols>. 8-th bit not set. + // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 + // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) + // the two-byte codes are split in two sections: + // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression + // + // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
+ // + void finalize(u8 zeroTerminated) { + assert(nSymbols <= 255); + u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); + + // compute running sum of code lengths (starting offsets for each length) + rsum[0] = byteLim; // 1-byte codes are highest + rsum[1] = zeroTerminated; + for(u32 i=1; i<7; i++) + rsum[i+1] = rsum[i] + lenHisto[i]; + + // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim) + suffixLim = rsum[1]; + symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) + + for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s + opt = 0; + } + newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim + } else + newCode[i] = rsum[len-1]++; + s1.set_code_len(newCode[i],len); + symbols[newCode[i]] = s1; + } + // renumber the codes in byteCodes[] + for(u32 i=0; i<256; i++) + if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); + else + byteCodes[i] = 511 + (1 << FSST_LEN_BITS); + + // renumber the codes in shortCodes[] + for(u32 i=0; i<65536; i++) + if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); + else + shortCodes[i] = byteCodes[i&0xFF]; + + // replace the symbols in the hash table + for(u32 i=0; i>8; + } + void count1Inc(u32 pos1) { + if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) + count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... + } + void count2Inc(u32 pos1, u32 pos2) { + if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). 
This means (high > 0) <=> (cnt > 0) + // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively + count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample) + } + u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range + // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros + u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7] + + u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes + high = (high >> (zero << 3)) & 255; // advance to nonzero counter + if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 + return 0; // all zero + + u32 low = count1Low[pos1]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range + // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros + u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15] + high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters + + u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters + high = (high >> (zero << 2)) & 15; // advance to nonzero counter + if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! 
advance pos2 + return 0UL; // all zero + + u32 low = count2Low[pos1][pos2]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + void backup1(u8 *buf) { + memcpy(buf, count1High, FSST_CODE_MAX); + memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX); + } + void restore1(u8 *buf) { + memcpy(count1High, buf, FSST_CODE_MAX); + memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); + } +}; +#endif + + +#define FSST_BUFSZ (3<<19) // 768KB + +// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression +struct Encoder { + shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) + union { + Counters counters; // for counting symbol occurences during map construction + u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) + }; +}; + +// job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) +struct SIMDjob { + u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) +}; + +extern bool +fsst_hasAVX512(); // runtime check for avx512 capability + +extern size_t +fsst_compressAVX512( + SymbolTable &symbolTable, + u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) + u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) + SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. + SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer. 
+ size_t n, // IN: size of arrays input and output (should be max 512) + size_t unroll); // IN: degree of SIMD unrolling + +// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) +size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); +size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); +} // namespace libfsst From 603a076d567a7711a4085f80f626fd9104e917c9 Mon Sep 17 00:00:00 2001 From: arnavb Date: Tue, 25 Nov 2025 08:52:21 +0000 Subject: [PATCH 14/16] update --- NOTICE.txt | 4 ++++ cpp/src/parquet/CMakeLists.txt | 6 ++++-- cpp/src/parquet/decoder.cc | 2 +- cpp/src/parquet/encoder.cc | 21 +++++++++++++++++---- cpp/src/parquet/encoding_test.cc | 4 ++-- cpp/thirdparty/fsst/LICENSE | 21 +++++++++++++++++++++ dev/release/rat_exclude_files.txt | 1 + 7 files changed, 50 insertions(+), 9 deletions(-) create mode 100644 cpp/thirdparty/fsst/LICENSE diff --git a/NOTICE.txt b/NOTICE.txt index 9b98364d2ab..a27486d2a5d 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -21,6 +21,10 @@ This product includes software from the mman-win32 project * Copyright https://code.google.com/p/mman-win32/ * Licensed under the MIT License; +This product includes software from the Fast Static Symbol Table (FSST) project (MIT) + * Copyright (c) 2018-2020 CWI, TU Munich, FSU Jena + * https://github.com/cwida/fsst + This product includes software from the LevelDB project * Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
* Use of this source code is governed by a BSD-style license that can be diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e2b1db09633..843b90c5fd4 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -209,8 +209,10 @@ if(DEFINED ARROW_FSST_SOURCES) set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" - "$<$:/wd4244>" - "$<$,$>>:-include${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") + "$<$:/wd4244>") + set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS + "$<$,$>>:-include>" + "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") endif() if(ARROW_HAVE_RUNTIME_AVX2) diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index 42838fd059f..db4058b6838 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -53,7 +53,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "fsst.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/types.h" #ifdef _MSC_VER diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 00daa366f6a..852fdece61a 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -50,7 +50,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "fsst.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/types.h" #ifdef _MSC_VER @@ -1778,9 +1778,11 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(sizeof(fsst_decoder_t)); const int64_t length_prefix_bytes = - static_cast(unencoded_values_.size()) * static_cast(sizeof(uint32_t)); + static_cast(unencoded_values_.size()) * + static_cast(sizeof(uint32_t)); const int64_t estimated_buffer_size = - decoder_bytes + total_input_size * kFsstCompressionExpansion + 
length_prefix_bytes; + decoder_bytes + total_input_size * kFsstCompressionExpansion + + length_prefix_bytes; PARQUET_ASSIGN_OR_THROW(auto output_buffer, AllocateResizableBuffer(estimated_buffer_size, pool_)); @@ -1819,6 +1821,10 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoderResize(total_output_size)); UpdateCompressionStats(total_input_size, total_output_size); + if (encoder_ != nullptr) { + fsst_destroy(encoder_); + encoder_ = nullptr; + } unencoded_values_.clear(); pending_unencoded_bytes_ = 0; unencoded_byte_array_data_bytes_ = 0; @@ -1860,7 +1866,9 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(buffer->mutable_data()); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, buffer_ptr); @@ -1872,6 +1880,11 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder Date: Tue, 25 Nov 2025 09:27:27 +0000 Subject: [PATCH 15/16] lint --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 16 ++++++++++---- cpp/src/parquet/CMakeLists.txt | 23 ++++++++++++++------- cpp/src/parquet/decoder.cc | 2 +- cpp/src/parquet/fsst_compat.h | 6 +++--- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index c0c84511c1d..978c14abb9f 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -73,7 +73,9 @@ set(ARROW_THIRDPARTY_DEPENDENCIES ZLIB zstd) -set(fsst_SOURCE "BUNDLED" CACHE STRING "Source of fsst dependency") +set(fsst_SOURCE + "BUNDLED" + CACHE STRING "Source of fsst dependency") # For backward compatibility. We use "BOOST_SOURCE" if "Boost_SOURCE" # isn't specified and "BOOST_SOURCE" is specified. 
@@ -2613,16 +2615,22 @@ endif() function(build_fsst) message(STATUS "Configuring vendored FSST sources") - set(ARROW_FSST_INCLUDE_DIR "${ARROW_SOURCE_DIR}/thirdparty/fsst" PARENT_SCOPE) + set(ARROW_FSST_INCLUDE_DIR + "${ARROW_SOURCE_DIR}/thirdparty/fsst" + PARENT_SCOPE) set(ARROW_FSST_SOURCES "${ARROW_SOURCE_DIR}/thirdparty/fsst/libfsst.cpp;${ARROW_SOURCE_DIR}/thirdparty/fsst/fsst_avx512.cpp" PARENT_SCOPE) - set(FSST_VENDORED TRUE PARENT_SCOPE) + set(FSST_VENDORED + TRUE + PARENT_SCOPE) endfunction() if(ARROW_WITH_FSST) if(NOT fsst_SOURCE STREQUAL "BUNDLED") - message(FATAL_ERROR "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED.") + message( + FATAL_ERROR + "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED.") endif() resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 843b90c5fd4..c9ade80a7f5 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -206,13 +206,19 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) endif() if(DEFINED ARROW_FSST_SOURCES) - set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS - "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" - "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" - "$<$:/wd4244>") - set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS - "$<$,$>>:-include>" - "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") + set_property( + SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" + "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" + "$<$:/wd4244>") + set_property( + SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>>:-include>" + "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") endif() if(ARROW_HAVE_RUNTIME_AVX2) @@ -333,7 
+339,8 @@ add_arrow_lib(parquet if(PARQUET_PRIVATE_INCLUDE_DIRS) foreach(_parquet_target parquet_objlib parquet_shared parquet_static) if(TARGET ${_parquet_target}) - target_include_directories(${_parquet_target} PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) + target_include_directories( + ${_parquet_target} PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) endif() endforeach() endif() diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index db4058b6838..ae82f3a78a6 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,6 @@ #include #include #include -#include #include "arrow/array.h" #include "arrow/array/builder_binary.h" diff --git a/cpp/src/parquet/fsst_compat.h b/cpp/src/parquet/fsst_compat.h index 2da27d27301..1c3ef3fae0b 100644 --- a/cpp/src/parquet/fsst_compat.h +++ b/cpp/src/parquet/fsst_compat.h @@ -21,13 +21,13 @@ // can be compiled with the compilers Arrow supports. #if defined(_WIN32) && !defined(_MSC_VER) -#include +# include // MinGW does not provide __cpuidex, but FSST only needs the CPUID // leaf/sub-leaf variant that __cpuid_count implements. 
static inline void arrow_fsst_cpuidex(int info[4], int function_id, int subfunction_id) { __cpuid_count(function_id, subfunction_id, info[0], info[1], info[2], info[3]); } -#define __cpuidex(info, function_id, subfunction_id) \ - arrow_fsst_cpuidex(info, function_id, subfunction_id) +# define __cpuidex(info, function_id, subfunction_id) \ + arrow_fsst_cpuidex(info, function_id, subfunction_id) #endif From d4176ae0885ca22d5ce99a1388b7f6bbf82de590 Mon Sep 17 00:00:00 2001 From: arnavb Date: Tue, 25 Nov 2025 09:53:07 +0000 Subject: [PATCH 16/16] lint --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 5 +- cpp/src/parquet/CMakeLists.txt | 29 +++++---- cpp/src/parquet/decoder.cc | 69 ++++++++++----------- cpp/src/parquet/encoder.cc | 26 ++++---- cpp/src/parquet/encoding_test.cc | 22 +++---- 5 files changed, 70 insertions(+), 81 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 978c14abb9f..dddbf2271bb 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2628,9 +2628,8 @@ endfunction() if(ARROW_WITH_FSST) if(NOT fsst_SOURCE STREQUAL "BUNDLED") - message( - FATAL_ERROR - "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED.") + message(FATAL_ERROR "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED." 
+ ) endif() resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index c9ade80a7f5..a38705cadf8 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -206,19 +206,18 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) endif() if(DEFINED ARROW_FSST_SOURCES) - set_property( - SOURCE ${ARROW_FSST_SOURCES} - APPEND - PROPERTY COMPILE_OPTIONS - "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" - "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" - "$<$:/wd4244>") - set_property( - SOURCE ${ARROW_FSST_SOURCES} - APPEND - PROPERTY COMPILE_OPTIONS - "$<$,$>>:-include>" - "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") + set_property(SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" + "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" + "$<$:/wd4244>") + set_property(SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>>:-include>" + "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>" + ) endif() if(ARROW_HAVE_RUNTIME_AVX2) @@ -339,8 +338,8 @@ add_arrow_lib(parquet if(PARQUET_PRIVATE_INCLUDE_DIRS) foreach(_parquet_target parquet_objlib parquet_shared parquet_static) if(TARGET ${_parquet_target}) - target_include_directories( - ${_parquet_target} PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) + target_include_directories(${_parquet_target} + PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) endif() endforeach() endif() diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index ae82f3a78a6..03105e4fea4 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -50,10 +50,10 @@ #include "arrow/util/ubsan.h" #include "arrow/visit_data_inline.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/exception.h" #include 
"parquet/platform.h" #include "parquet/schema.h" -#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/types.h" #ifdef _MSC_VER @@ -2378,8 +2378,7 @@ class FsstDecoder : public DecoderImpl, virtual public TypedDecoder::Accumulator* builder) override { int values_decoded = 0; - PARQUET_THROW_NOT_OK( - DecodeArrowDense(num_values, null_count, valid_bits, valid_bits_offset, builder, - &values_decoded)); + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, builder, &values_decoded)); return values_decoded; } - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::DictAccumulator* builder) override { + int DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override { int values_decoded = 0; - PARQUET_THROW_NOT_OK( - DecodeArrowDict(num_values, null_count, valid_bits, valid_bits_offset, builder, - &values_decoded)); + PARQUET_THROW_NOT_OK(DecodeArrowDict(num_values, null_count, valid_bits, + valid_bits_offset, builder, &values_decoded)); return values_decoded; } @@ -2482,32 +2480,31 @@ class FsstDecoder : public DecoderImpl, virtual public TypedDecoderReserve(num_values)); int value_index = 0; - RETURN_NOT_OK(VisitBitRuns( - valid_bits, valid_bits_offset, num_values, - [&](int64_t position, int64_t run_length, bool is_valid) { - if (is_valid) { - for (int64_t i = 0; i < run_length; ++i) { - const auto& value = temp_values_[value_index++]; - RETURN_NOT_OK(builder->Append(value.ptr, static_cast(value.len))); - } - } else { - RETURN_NOT_OK(builder->AppendNulls(run_length)); - } - return Status::OK(); - })); + RETURN_NOT_OK(VisitBitRuns(valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = 
temp_values_[value_index++]; + RETURN_NOT_OK(builder->Append( + value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(builder->AppendNulls(run_length)); + } + return Status::OK(); + })); *out_values_decoded = decoded; return Status::OK(); } uint8_t* EnsureDecodeBuffer(int64_t capacity) { - const int64_t min_capacity = - std::max(capacity, kInitialDecodeBufferSize); + const int64_t min_capacity = std::max(capacity, kInitialDecodeBufferSize); const int64_t target = ::arrow::bit_util::NextPower2(min_capacity); if (!decode_buffer_) { - PARQUET_ASSIGN_OR_THROW( - decode_buffer_, ::arrow::AllocateResizableBuffer(target, pool_)); + PARQUET_ASSIGN_OR_THROW(decode_buffer_, + ::arrow::AllocateResizableBuffer(target, pool_)); } else if (decode_buffer_->size() < target) { PARQUET_THROW_NOT_OK(decode_buffer_->Resize(target, false)); } @@ -2516,26 +2513,24 @@ class FsstDecoder : public DecoderImpl, virtual public TypedDecodermutable_data() + decode_buffer_size_; const size_t available = static_cast(decode_buffer_->size() - decode_buffer_size_); - const size_t decompressed = - fsst_decompress(&decoder_, compressed_len, compressed_ptr, available, - destination); + const size_t decompressed = fsst_decompress(&decoder_, compressed_len, + compressed_ptr, available, destination); if (decompressed > 0 || compressed_len == 0) { *value_ptr = destination; return decompressed; } - int64_t new_capacity = std::max( - decode_buffer_->size() * 2, - decode_buffer_size_ + OutputUpperBound(compressed_len)); + int64_t new_capacity = + std::max(decode_buffer_->size() * 2, + decode_buffer_size_ + OutputUpperBound(compressed_len)); if (new_capacity <= decode_buffer_->size()) { throw ParquetException("FSST decompression failed"); } diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 852fdece61a..e0a2f1ec826 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -47,10 +47,10 @@ #include "arrow/util/ubsan.h" #include "arrow/visit_data_inline.h" 
+#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/types.h" #ifdef _MSC_VER @@ -1758,10 +1758,8 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(total_size) * compression_ratio_hint_; - const int64_t estimated_payload = - static_cast(std::ceil(scaled)); + const double scaled = static_cast(total_size) * compression_ratio_hint_; + const int64_t estimated_payload = static_cast(std::ceil(scaled)); return static_cast(sizeof(fsst_decoder_t)) + std::max(0, estimated_payload); } @@ -1777,12 +1775,11 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(sizeof(fsst_decoder_t)); - const int64_t length_prefix_bytes = - static_cast(unencoded_values_.size()) * - static_cast(sizeof(uint32_t)); - const int64_t estimated_buffer_size = - decoder_bytes + total_input_size * kFsstCompressionExpansion + - length_prefix_bytes; + const int64_t length_prefix_bytes = static_cast(unencoded_values_.size()) * + static_cast(sizeof(uint32_t)); + const int64_t estimated_buffer_size = decoder_bytes + + total_input_size * kFsstCompressionExpansion + + length_prefix_bytes; PARQUET_ASSIGN_OR_THROW(auto output_buffer, AllocateResizableBuffer(estimated_buffer_size, pool_)); @@ -1867,8 +1864,7 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(buffer->mutable_data()); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, buffer_ptr); @@ -1897,8 +1893,8 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(Encoding::FSST); ASSERT_NO_THROW(encoder->Put(*values)); @@ -2711,13 +2711,13 @@ TEST(TestFsstEncoding, MultiPageRoundTrip) { ->build(); std::unique_ptr writer; - ASSERT_NO_THROW(writer = - ParquetFileWriter::Open(output_stream, parquet_schema, writer_props)); + ASSERT_NO_THROW( + writer = 
ParquetFileWriter::Open(output_stream, parquet_schema, writer_props)); ASSERT_NE(nullptr, writer); auto* row_group_writer = writer->AppendRowGroup(); ASSERT_NE(nullptr, row_group_writer); - auto* column_writer = static_cast*>( - row_group_writer->NextColumn()); + auto* column_writer = + static_cast*>(row_group_writer->NextColumn()); ASSERT_NE(nullptr, column_writer); auto write_page = [&](const std::vector& source) { @@ -2760,16 +2760,16 @@ TEST(TestFsstEncoding, MultiPageRoundTrip) { ASSERT_NO_THROW(reader = make_reader()); ASSERT_NE(nullptr, reader); auto row_group_reader = reader->RowGroup(0); - auto column_reader = - std::static_pointer_cast>(row_group_reader->Column(0)); + auto column_reader = std::static_pointer_cast>( + row_group_reader->Column(0)); std::vector decoded(kTotalValues); int64_t values_read = 0; while (values_read < kTotalValues) { int64_t batch_length = std::min(1024, kTotalValues - values_read); int64_t batch_read = 0; - column_reader->ReadBatch(batch_length, nullptr, nullptr, - decoded.data() + values_read, &batch_read); + column_reader->ReadBatch(batch_length, nullptr, nullptr, decoded.data() + values_read, + &batch_read); ASSERT_GT(batch_read, 0); values_read += batch_read; }