diff --git a/NOTICE.txt b/NOTICE.txt index 9b98364d2ab..a27486d2a5d 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -21,6 +21,10 @@ This product includes software from the mman-win32 project * Copyright https://code.google.com/p/mman-win32/ * Licensed under the MIT License; +This product includes software from the Fast Static Symbol Table (FSST) project (MIT) + * Copyright (c) 2018-2020 CWI, TU Munich, FSU Jena + * https://github.com/cwida/fsst + This product includes software from the LevelDB project * Copyright (c) 2011 The LevelDB Authors. All rights reserved. * Use of this source code is governed by a BSD-style license that can be diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4ced2a66bf5..dddbf2271bb 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -49,6 +49,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES Boost Brotli BZip2 + fsst c-ares gflags glog @@ -72,6 +73,10 @@ set(ARROW_THIRDPARTY_DEPENDENCIES ZLIB zstd) +set(fsst_SOURCE + "BUNDLED" + CACHE STRING "Source of fsst dependency") + # For backward compatibility. We use "BOOST_SOURCE" if "Boost_SOURCE" # isn't specified and "BOOST_SOURCE" is specified. 
# We renamed "BOOST" dependency name to "Boost" in 3.0.0 because @@ -183,6 +188,8 @@ macro(build_dependency DEPENDENCY_NAME) build_brotli() elseif("${DEPENDENCY_NAME}" STREQUAL "BZip2") build_bzip2() + elseif("${DEPENDENCY_NAME}" STREQUAL "fsst") + build_fsst() elseif("${DEPENDENCY_NAME}" STREQUAL "c-ares") build_cares() elseif("${DEPENDENCY_NAME}" STREQUAL "gflags") @@ -382,6 +389,7 @@ endif() if(ARROW_PARQUET) set(ARROW_WITH_RAPIDJSON ON) set(ARROW_WITH_THRIFT ON) + set(ARROW_WITH_FSST ON) endif() if(ARROW_WITH_THRIFT) @@ -2604,6 +2612,28 @@ if(ARROW_USE_XSIMD) endif() endif() +function(build_fsst) + message(STATUS "Configuring vendored FSST sources") + + set(ARROW_FSST_INCLUDE_DIR + "${ARROW_SOURCE_DIR}/thirdparty/fsst" + PARENT_SCOPE) + set(ARROW_FSST_SOURCES + "${ARROW_SOURCE_DIR}/thirdparty/fsst/libfsst.cpp;${ARROW_SOURCE_DIR}/thirdparty/fsst/fsst_avx512.cpp" + PARENT_SCOPE) + set(FSST_VENDORED + TRUE + PARENT_SCOPE) +endfunction() + +if(ARROW_WITH_FSST) + if(NOT fsst_SOURCE STREQUAL "BUNDLED") + message(FATAL_ERROR "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED." 
+ ) + endif() + resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) +endif() + macro(build_zlib) message(STATUS "Building ZLIB from source") diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index dc7d40d2a38..a38705cadf8 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -42,6 +42,10 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) ${ARGN}) set(TEST_ARGUMENTS PREFIX "parquet" LABELS "parquet-tests") + set(_PARQUET_TEST_EXTRA_ARGS ${ARG_UNPARSED_ARGUMENTS}) + if(PARQUET_TEST_EXTRA_INCLUDES) + list(APPEND _PARQUET_TEST_EXTRA_ARGS EXTRA_INCLUDES ${PARQUET_TEST_EXTRA_INCLUDES}) + endif() if(ARROW_TEST_LINKAGE STREQUAL "static") add_test_case(${REL_TEST_NAME} @@ -49,14 +53,14 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) parquet_static ${PARQUET_TEST_LINK_LIBS} ${TEST_ARGUMENTS} - ${ARG_UNPARSED_ARGUMENTS}) + ${_PARQUET_TEST_EXTRA_ARGS}) else() add_test_case(${REL_TEST_NAME} STATIC_LINK_LIBS parquet_shared ${PARQUET_TEST_LINK_LIBS} ${TEST_ARGUMENTS} - ${ARG_UNPARSED_ARGUMENTS}) + ${_PARQUET_TEST_EXTRA_ARGS}) endif() endfunction() @@ -134,6 +138,9 @@ elseif(NOT MSVC) list(APPEND PARQUET_TEST_LINK_LIBS ${CMAKE_DL_LIBS}) endif() +set(PARQUET_TEST_EXTRA_INCLUDES) +set(PARQUET_PRIVATE_INCLUDE_DIRS) + # # Generated Thrift sources set(PARQUET_THRIFT_SOURCE_DIR "${ARROW_SOURCE_DIR}/src/generated/") @@ -191,6 +198,28 @@ set(PARQUET_SRCS stream_writer.cc types.cc) +if(DEFINED ARROW_FSST_SOURCES) + list(APPEND PARQUET_SRCS ${ARROW_FSST_SOURCES}) +endif() +if(DEFINED ARROW_FSST_INCLUDE_DIR) + list(APPEND PARQUET_PRIVATE_INCLUDE_DIRS ${ARROW_FSST_INCLUDE_DIR}) + list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) +endif() +if(DEFINED ARROW_FSST_SOURCES) + set_property(SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" + "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" + "$<$:/wd4244>") + set_property(SOURCE 
${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>>:-include>" + "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>" + ) +endif() + if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. list(APPEND PARQUET_SRCS level_comparison_avx2.cc level_conversion_bmi2.cc) @@ -306,6 +335,15 @@ add_arrow_lib(parquet STATIC_INSTALL_INTERFACE_LIBS ${PARQUET_STATIC_INSTALL_INTERFACE_LIBS}) +if(PARQUET_PRIVATE_INCLUDE_DIRS) + foreach(_parquet_target parquet_objlib parquet_shared parquet_static) + if(TARGET ${_parquet_target}) + target_include_directories(${_parquet_target} + PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) + endif() + endforeach() +endif() + if(WIN32 AND NOT (ARROW_TEST_LINKAGE STREQUAL "static")) add_library(parquet_test_support STATIC "${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp") diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 8ecb774022f..59c3b9934a2 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -862,7 +862,8 @@ class ColumnReaderImplBase { case Encoding::RLE: case Encoding::DELTA_BINARY_PACKED: case Encoding::DELTA_BYTE_ARRAY: - case Encoding::DELTA_LENGTH_BYTE_ARRAY: { + case Encoding::DELTA_LENGTH_BYTE_ARRAY: + case Encoding::FSST: { auto decoder = MakeTypedDecoder(encoding, descr_, pool_); current_decoder_ = decoder.get(); decoders_[static_cast(encoding)] = std::move(decoder); diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index d0a857dd22a..03105e4fea4 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -48,6 +50,7 @@ #include "arrow/util/ubsan.h" #include "arrow/visit_data_inline.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" @@ -2307,6 +2310,253 @@ class ByteStreamSplitDecoder : public ByteStreamSplitDecoderBase { + 
public: + using Base = DecoderImpl; + using Base::num_values_; + + explicit FsstDecoder(const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + : DecoderImpl(descr, Encoding::FSST), pool_(pool) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + const auto header_size = static_cast(sizeof(fsst_decoder_t)); + if (len < header_size) { + throw ParquetException("FSST page too small to contain decoder header"); + } + num_values_ = num_values; + memcpy(&decoder_, data, sizeof(fsst_decoder_t)); + next_ = data + header_size; + remaining_bytes_ = len - header_size; + decode_buffer_size_ = 0; + } + + int Decode(ByteArray* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + if (max_values == 0) { + return 0; + } + + const int64_t estimated_output_size = + decode_buffer_size_ + + static_cast(remaining_bytes_) * kMaxDecompressionExpansion; + EnsureDecodeBuffer(estimated_output_size); + + int decoded = 0; + + while (decoded < max_values) { + if (ARROW_PREDICT_FALSE(remaining_bytes_ < static_cast(kLengthPrefixSize))) { + throw ParquetException("FSST data truncated before length prefix"); + } + + const uint32_t compressed_len = SafeLoadAs(next_); + next_ += kLengthPrefixSize; + remaining_bytes_ -= static_cast(kLengthPrefixSize); + + if (ARROW_PREDICT_FALSE(compressed_len > static_cast(remaining_bytes_))) { + throw ParquetException("FSST compressed length exceeds available data"); + } + + const uint8_t* compressed_ptr = next_; + next_ += compressed_len; + remaining_bytes_ -= static_cast(compressed_len); + + uint8_t* value_ptr = nullptr; + const size_t decompressed_len = + DecompressValue(compressed_ptr, compressed_len, &value_ptr); + + buffer[decoded].ptr = value_ptr; + buffer[decoded].len = static_cast(decompressed_len); + decode_buffer_size_ += static_cast(decompressed_len); + ++decoded; + } + + num_values_ -= decoded; + return decoded; + } + + int DecodeSpaced(ByteArray* buffer, 
int num_values, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset) override { + if (null_count == 0) { + return Decode(buffer, num_values); + } + + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", decoded, " values."); + } + + int value_index = 0; + for (int i = 0; i < num_values; ++i) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + buffer[i] = temp_values_[value_index++]; + } else { + buffer[i].ptr = nullptr; + buffer[i].len = 0; + } + } + + return value_index; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) override { + int values_decoded = 0; + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, builder, &values_decoded)); + return values_decoded; + } + + int DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override { + int values_decoded = 0; + PARQUET_THROW_NOT_OK(DecodeArrowDict(num_values, null_count, valid_bits, + valid_bits_offset, builder, &values_decoded)); + return values_decoded; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_values_decoded) { + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", 
decoded, " values."); + } + + auto visit_binary_helper = [&](auto* helper) { + auto* values_ptr = temp_values_.data(); + int value_index = 0; + + RETURN_NOT_OK( + VisitBitRuns(valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = values_ptr[value_index++]; + RETURN_NOT_OK(helper->AppendValue( + value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(helper->AppendNulls(run_length)); + } + return Status::OK(); + })); + + *out_values_decoded = decoded; + return Status::OK(); + }; + + return DispatchArrowBinaryHelper( + out, num_values, /*estimated_data_length=*/{}, visit_binary_helper); + } + + Status DecodeArrowDict(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder, + int* out_values_decoded) { + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", decoded, " values."); + } + + RETURN_NOT_OK(builder->Reserve(num_values)); + + int value_index = 0; + RETURN_NOT_OK(VisitBitRuns(valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = temp_values_[value_index++]; + RETURN_NOT_OK(builder->Append( + value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(builder->AppendNulls(run_length)); + } + return Status::OK(); + })); + + *out_values_decoded = decoded; + return Status::OK(); + } + + uint8_t* EnsureDecodeBuffer(int64_t capacity) { + const int64_t min_capacity = std::max(capacity, kInitialDecodeBufferSize); + const int64_t target = 
::arrow::bit_util::NextPower2(min_capacity); + + if (!decode_buffer_) { + PARQUET_ASSIGN_OR_THROW(decode_buffer_, + ::arrow::AllocateResizableBuffer(target, pool_)); + } else if (decode_buffer_->size() < target) { + PARQUET_THROW_NOT_OK(decode_buffer_->Resize(target, false)); + } + return decode_buffer_->mutable_data(); + } + + size_t DecompressValue(const uint8_t* compressed_ptr, uint32_t compressed_len, + uint8_t** value_ptr) { + EnsureDecodeBuffer(decode_buffer_size_ + OutputUpperBound(compressed_len)); + + while (true) { + uint8_t* destination = decode_buffer_->mutable_data() + decode_buffer_size_; + const size_t available = + static_cast(decode_buffer_->size() - decode_buffer_size_); + + const size_t decompressed = fsst_decompress(&decoder_, compressed_len, + compressed_ptr, available, destination); + + if (decompressed > 0 || compressed_len == 0) { + *value_ptr = destination; + return decompressed; + } + + int64_t new_capacity = + std::max(decode_buffer_->size() * 2, + decode_buffer_size_ + OutputUpperBound(compressed_len)); + if (new_capacity <= decode_buffer_->size()) { + throw ParquetException("FSST decompression failed"); + } + EnsureDecodeBuffer(new_capacity); + } + } + + static int64_t OutputUpperBound(uint32_t compressed_len) { + const int64_t expanded = + static_cast(compressed_len) * kMaxDecompressionExpansion; + return std::max(expanded, kInitialDecodeBufferSize); + } + + static constexpr size_t kLengthPrefixSize = sizeof(uint32_t); + static constexpr int64_t kInitialDecodeBufferSize = 1024; + static constexpr int64_t kMaxDecompressionExpansion = 8; + + fsst_decoder_t decoder_{}; + const uint8_t* next_ = nullptr; + int remaining_bytes_ = 0; + ::arrow::MemoryPool* pool_; + std::shared_ptr<::arrow::ResizableBuffer> decode_buffer_; + int64_t decode_buffer_size_ = 0; + std::vector temp_values_; +}; + } // namespace // ---------------------------------------------------------------------- @@ -2373,6 +2623,13 @@ std::unique_ptr MakeDecoder(Type::type 
type_num, Encoding::type encodin throw ParquetException( "DELTA_BYTE_ARRAY only supports BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"); } + } else if (encoding == Encoding::FSST) { + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique(descr, pool); + default: + throw ParquetException("FSST encoding only supports BYTE_ARRAY"); + } } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) { if (type_num == Type::BYTE_ARRAY) { return std::make_unique(descr, pool); diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index f9367555d97..e0a2f1ec826 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -18,8 +18,11 @@ #include "parquet/encoding.h" #include +#include #include #include +#include +#include #include #include #include @@ -44,6 +47,7 @@ #include "arrow/util/ubsan.h" #include "arrow/visit_data_inline.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" @@ -101,7 +105,7 @@ class EncoderImpl : virtual public Encoder { MemoryPool* memory_pool() const override { return pool_; } int64_t ReportUnencodedDataBytes() override { - if (descr_->physical_type() != Type::BYTE_ARRAY) { + if (descr_ != nullptr && descr_->physical_type() != Type::BYTE_ARRAY) { throw ParquetException("ReportUnencodedDataBytes is only supported for BYTE_ARRAY"); } int64_t bytes = unencoded_byte_array_data_bytes_; @@ -1737,6 +1741,182 @@ std::shared_ptr RleBooleanEncoder::FlushValues() { return buffer; } +// ---------------------------------------------------------------------- + +class FsstEncoder : public EncoderImpl, virtual public TypedEncoder { + public: + explicit FsstEncoder(const ColumnDescriptor* descr, MemoryPool* pool) + : EncoderImpl(descr, Encoding::FSST, pool), + encoder_(nullptr), + unencoded_values_() {} + + ~FsstEncoder() override { + if (encoder_) { + fsst_destroy(encoder_); + } + } + + int64_t EstimatedDataEncodedSize() override { + const 
int64_t total_size = pending_unencoded_bytes_; + const double scaled = static_cast<double>(total_size) * compression_ratio_hint_; + const int64_t estimated_payload = static_cast<int64_t>(std::ceil(scaled)); + return static_cast<int64_t>(sizeof(fsst_decoder_t)) + + std::max<int64_t>(0, estimated_payload); + } + + std::shared_ptr<Buffer> FlushValues() override { + if (unencoded_values_.empty()) { + PARQUET_ASSIGN_OR_THROW(auto buffer, AllocateBuffer(0, pool_)); + return buffer; + } + + BuildSymbolTable(); + + const int64_t total_input_size = pending_unencoded_bytes_; + + const int64_t decoder_bytes = static_cast<int64_t>(sizeof(fsst_decoder_t)); + const int64_t length_prefix_bytes = static_cast<int64_t>(unencoded_values_.size()) * + static_cast<int64_t>(sizeof(uint32_t)); + const int64_t estimated_buffer_size = decoder_bytes + + total_input_size * kFsstCompressionExpansion + + length_prefix_bytes; + + PARQUET_ASSIGN_OR_THROW(auto output_buffer, + AllocateResizableBuffer(estimated_buffer_size, pool_)); + uint8_t* out_ptr = output_buffer->mutable_data(); + + fsst_decoder_t decoder = fsst_decoder(encoder_); + memcpy(out_ptr, &decoder, sizeof(fsst_decoder_t)); + out_ptr += sizeof(fsst_decoder_t); + + int64_t total_output_size = sizeof(fsst_decoder_t); + std::vector<size_t> out_lengths(1); + std::vector<unsigned char*> out_ptrs(1); + + for (size_t i = 0; i < unencoded_values_.size(); ++i) { + const int64_t remaining_capacity = + estimated_buffer_size - total_output_size - sizeof(uint32_t); + if (ARROW_PREDICT_FALSE(remaining_capacity <= 0)) { + throw ParquetException("FSST compression buffer exhausted"); + } + size_t available = static_cast<size_t>(remaining_capacity); + size_t input_length = static_cast<size_t>(unencoded_values_[i].len); + const unsigned char* input_ptr = unencoded_values_[i].ptr; + size_t num_compressed = + fsst_compress(encoder_, 1, &input_length, &input_ptr, available, + out_ptr + sizeof(uint32_t), out_lengths.data(), out_ptrs.data()); + + if (num_compressed == 0) { + throw ParquetException("FSST compression buffer too small"); + } + + uint32_t len = 
static_cast(out_lengths[0]); + memcpy(out_ptr, &len, sizeof(uint32_t)); + out_ptr += sizeof(uint32_t) + out_lengths[0]; + total_output_size += sizeof(uint32_t) + out_lengths[0]; + } + + PARQUET_THROW_NOT_OK(output_buffer->Resize(total_output_size)); + UpdateCompressionStats(total_input_size, total_output_size); + if (encoder_ != nullptr) { + fsst_destroy(encoder_); + encoder_ = nullptr; + } + unencoded_values_.clear(); + pending_unencoded_bytes_ = 0; + unencoded_byte_array_data_bytes_ = 0; + return output_buffer; + } + + using TypedEncoder::Put; + + void Put(const ByteArray* src, int num_values) override { + for (int i = 0; i < num_values; ++i) { + unencoded_values_.push_back(src[i]); + const int64_t length = static_cast(src[i].len); + unencoded_byte_array_data_bytes_ += length; + pending_unencoded_bytes_ += length; + } + } + + void Put(const ::arrow::Array& values) override { + AssertVarLengthBinary(values); + + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<::arrow::BinaryType>( + *values.data(), + [&](::std::string_view view) { + if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { + return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + } + ByteArray val; + val.len = static_cast(view.size()); + val.ptr = reinterpret_cast(view.data()); + unencoded_values_.push_back(val); + const int64_t length = static_cast(val.len); + unencoded_byte_array_data_bytes_ += length; + pending_unencoded_bytes_ += length; + return Status::OK(); + }, + []() { return Status::OK(); })); + } + + void PutSpaced(const ByteArray* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + if (valid_bits != NULLPTR) { + PARQUET_ASSIGN_OR_THROW( + auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(ByteArray), pool_)); + auto buffer_ptr = reinterpret_cast(buffer->mutable_data()); + int num_valid_values = ::arrow::util::internal::SpacedCompress( + src, num_values, valid_bits, valid_bits_offset, buffer_ptr); + Put(buffer_ptr, 
num_valid_values); + } else { + Put(src, num_values); + } + } + + private: + void BuildSymbolTable() { + if (encoder_ != nullptr) { + fsst_destroy(encoder_); + encoder_ = nullptr; + } + + if (unencoded_values_.empty()) { + return; + } + + std::vector input_lengths; + std::vector input_ptrs; + + for (const auto& val : unencoded_values_) { + input_lengths.push_back(val.len); + input_ptrs.push_back(val.ptr); + } + + encoder_ = + fsst_create(unencoded_values_.size(), input_lengths.data(), input_ptrs.data(), 0); + + if (!encoder_) { + throw ParquetException("Failed to create FSST encoder"); + } + } + + void UpdateCompressionStats(int64_t input_bytes, int64_t payload_bytes) { + const double ratio = static_cast(std::max(0, payload_bytes)) / + static_cast(std::max(int64_t{1}, input_bytes)); + compression_ratio_hint_ = std::max(kMinimumCompressionRatio, ratio); + } + + static constexpr double kDefaultCompressionRatio = 1.0; + static constexpr double kMinimumCompressionRatio = 0.01; + // Allocate worst-case 4x expansion. 
+ static constexpr int64_t kFsstCompressionExpansion = 4; + fsst_encoder_t* encoder_; + std::vector unencoded_values_; + double compression_ratio_hint_ = kDefaultCompressionRatio; + int64_t pending_unencoded_bytes_ = 0; +}; + } // namespace // ---------------------------------------------------------------------- @@ -1821,6 +2001,13 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin default: throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY"); } + } else if (encoding == Encoding::FSST) { + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique(descr, pool); + default: + throw ParquetException("FSST encoding only supports BYTE_ARRAY"); + } } else if (encoding == Encoding::RLE) { switch (type_num) { case Type::BOOLEAN: diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 66a3f7647fa..38f21d112c5 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,9 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_dict.h" #include "arrow/array/concatenate.h" +#include "arrow/buffer.h" #include "arrow/compute/cast.h" +#include "arrow/io/memory.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" @@ -41,7 +44,13 @@ #include "arrow/util/endian.h" #include "arrow/util/span.h" #include "arrow/util/string.h" +#include "fsst.h" // NOLINT(build/include_subdir) +#include "parquet/column_page.h" +#include "parquet/column_reader.h" #include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/test_util.h" @@ -2593,4 +2602,305 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { } } +// ---------------------------------------------------------------------- + 
+TEST(TestFsstEncoding, BasicRoundTrip) { + ::arrow::random::RandomArrayGenerator rag(0); + constexpr int64_t kNumValues = 2048; + constexpr int64_t kNumUnique = 128; + constexpr int32_t kMinLength = 4; + constexpr int32_t kMaxLength = 48; + + auto values = + rag.BinaryWithRepeats(kNumValues, kNumUnique, kMinLength, kMaxLength, 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + ASSERT_GT(encoded->size(), 0); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + std::vector decoded(values->length()); + ASSERT_EQ(static_cast(values->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + const auto& binary = checked_cast(*values); + for (int64_t i = 0; i < values->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } +} + +TEST(TestFsstEncoding, FlushEmptyReturnsEmptyBuffer) { + auto encoder = MakeTypedEncoder(Encoding::FSST); + auto buffer = encoder->FlushValues(); + ASSERT_NE(nullptr, buffer); + ASSERT_EQ(0, buffer->size()); +} + +TEST(TestFsstEncoding, ReportUnencodedBytesResetsCounter) { + ::arrow::random::RandomArrayGenerator rag(42); + auto values = rag.String(256, 1, 24, 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + + const auto& binary = checked_cast(*values); + const int64_t expected = binary.total_values_length(); + EXPECT_EQ(expected, encoder->ReportUnencodedDataBytes()); + EXPECT_EQ(0, encoder->ReportUnencodedDataBytes()); + + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + 
static_cast(encoded->size())); + + std::vector decoded(values->length()); + ASSERT_EQ(static_cast(values->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + for (int64_t i = 0; i < values->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } + + EXPECT_EQ(0, encoder->ReportUnencodedDataBytes()); +} + +TEST(TestFsstEncoding, MultiPageRoundTrip) { + constexpr int64_t kStringsPerPage = 10000; + constexpr int64_t kNumPages = 2; + constexpr int64_t kTotalValues = kStringsPerPage * kNumPages; + constexpr int64_t kPageSizeBytes = 512 * 1024; + + std::vector page1; + std::vector page2; + page1.reserve(kStringsPerPage); + page2.reserve(kStringsPerPage); + + for (int64_t i = 0; i < kStringsPerPage; ++i) { + page1.push_back("https://www.example.com/user/profile/page" + std::to_string(i) + + "/settings/preferences/advanced/options"); + page2.push_back("ftp://ftp.downloads.org/pub/software/release" + std::to_string(i) + + "/package/installer/binaries/archive"); + } + + std::vector expected; + expected.reserve(kTotalValues); + expected.insert(expected.end(), page1.begin(), page1.end()); + expected.insert(expected.end(), page2.begin(), page2.end()); + + ASSERT_OK_AND_ASSIGN(auto output_stream, ::arrow::io::BufferOutputStream::Create()); + + schema::NodeVector fields; + fields.push_back(schema::PrimitiveNode::Make("url", Repetition::REQUIRED, + Type::BYTE_ARRAY, ConvertedType::UTF8)); + auto parquet_schema = std::static_pointer_cast( + schema::GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + auto writer_props = WriterProperties::Builder() + .encoding(Encoding::FSST) + ->data_pagesize(kPageSizeBytes) + ->build(); + + std::unique_ptr writer; + ASSERT_NO_THROW( + writer = ParquetFileWriter::Open(output_stream, parquet_schema, writer_props)); + ASSERT_NE(nullptr, writer); + auto* row_group_writer = 
writer->AppendRowGroup(); + ASSERT_NE(nullptr, row_group_writer); + auto* column_writer = + static_cast*>(row_group_writer->NextColumn()); + ASSERT_NE(nullptr, column_writer); + + auto write_page = [&](const std::vector& source) { + std::vector batch; + batch.reserve(source.size()); + for (const auto& value : source) { + batch.emplace_back(value); + } + column_writer->WriteBatch(static_cast(batch.size()), nullptr, nullptr, + batch.data()); + }; + + write_page(page1); + write_page(page2); + + column_writer->Close(); + row_group_writer->Close(); + ASSERT_NO_THROW(writer->Close()); + + ASSERT_OK_AND_ASSIGN(auto buffer, output_stream->Finish()); + + auto make_reader = [&buffer]() -> std::unique_ptr { + auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(buffer); + return ParquetFileReader::Open(buffer_reader); + }; + + std::unique_ptr page_reader_file; + ASSERT_NO_THROW(page_reader_file = make_reader()); + ASSERT_NE(nullptr, page_reader_file); + auto page_row_group = page_reader_file->RowGroup(0); + auto page_reader = page_row_group->GetColumnPageReader(0); + + int data_page_count = 0; + while (page_reader->NextPage() != nullptr) { + ++data_page_count; + } + EXPECT_GT(data_page_count, 1); + + std::unique_ptr reader; + ASSERT_NO_THROW(reader = make_reader()); + ASSERT_NE(nullptr, reader); + auto row_group_reader = reader->RowGroup(0); + auto column_reader = std::static_pointer_cast>( + row_group_reader->Column(0)); + + std::vector decoded(kTotalValues); + int64_t values_read = 0; + while (values_read < kTotalValues) { + int64_t batch_length = std::min(1024, kTotalValues - values_read); + int64_t batch_read = 0; + column_reader->ReadBatch(batch_length, nullptr, nullptr, decoded.data() + values_read, + &batch_read); + ASSERT_GT(batch_read, 0); + values_read += batch_read; + } + ASSERT_EQ(values_read, kTotalValues); + + for (int64_t i = 0; i < kTotalValues; ++i) { + ASSERT_EQ(expected[i].size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, 
memcmp(expected[i].data(), decoded[i].ptr, expected[i].size())) << i; + } +} + +TEST(TestFsstEncoding, MultipleFlushesProduceIndependentPages) { + ::arrow::random::RandomArrayGenerator rag(99); + auto encoder = MakeTypedEncoder(Encoding::FSST); + + const std::vector> batches = { + {64, 3, 12}, {512, 2, 24}, {17, 1, 8}}; + + for (const auto& [size, min_len, max_len] : batches) { + auto batch = rag.String(size, min_len, max_len, 0.0); + + ASSERT_NO_THROW(encoder->Put(*batch)); + auto buffer = encoder->FlushValues(); + ASSERT_NE(nullptr, buffer); + ASSERT_GT(buffer->size(), 0); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(batch->length()), buffer->data(), + static_cast(buffer->size())); + + std::vector decoded(batch->length()); + ASSERT_EQ(static_cast(batch->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + const auto& binary = checked_cast(*batch); + for (int64_t i = 0; i < batch->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } + + encoder->ReportUnencodedDataBytes(); + } +} + +TEST(TestFsstEncoding, DecodeRejectsOverstatedLength) { + ::arrow::random::RandomArrayGenerator rag(123); + auto values = rag.String(64, 3, 32, 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + ASSERT_GT(encoded->size(), + static_cast(sizeof(fsst_decoder_t) + sizeof(uint32_t))); + + ASSERT_OK_AND_ASSIGN(auto corrupted, + ::arrow::AllocateBuffer(encoded->size(), default_memory_pool())); + std::memcpy(corrupted->mutable_data(), encoded->data(), encoded->size()); + + auto* length_ptr = + reinterpret_cast(corrupted->mutable_data() + sizeof(fsst_decoder_t)); + *length_ptr = static_cast(encoded->size()); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + 
decoder->SetData(static_cast(values->length()), corrupted->data(), + static_cast(corrupted->size())); + + std::vector decoded(values->length()); + EXPECT_THROW(decoder->Decode(decoded.data(), static_cast(decoded.size())), + ParquetException); +} + +TEST(TestFsstEncoding, SetDataRejectsShortHeader) { + auto decoder = MakeTypedDecoder(Encoding::FSST); + + const int num_values = 1; + const int header_bytes = static_cast(sizeof(fsst_decoder_t)); + std::vector truncated(static_cast(header_bytes - 1), 0); + + EXPECT_THROW( + decoder->SetData(num_values, truncated.data(), static_cast(truncated.size())), + ParquetException); +} + +TEST(TestFsstEncoding, HeavyNullsDecodeSpaced) { + ::arrow::random::RandomArrayGenerator rag(321); + constexpr int64_t kNumValues = 4096; + constexpr double kNullProb = 0.85; + + auto values = rag.String(kNumValues, 1, 24, kNullProb); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + const auto& binary = checked_cast(*values); + std::vector decoded(values->length()); + std::vector valid_bits(::arrow::bit_util::BytesForBits(values->length()), 0); + + ::arrow::internal::BitmapWriter writer(valid_bits.data(), 0, values->length()); + for (int64_t i = 0; i < values->length(); ++i) { + if (!binary.IsNull(i)) { + writer.Set(); + } else { + writer.Clear(); + } + writer.Next(); + } + writer.Finish(); + + const int null_count = static_cast(binary.null_count()); + ASSERT_EQ(static_cast(values->length() - null_count), + decoder->DecodeSpaced(decoded.data(), static_cast(values->length()), + null_count, valid_bits.data(), 0)); + + for (int64_t i = 0; i < values->length(); ++i) { + if (binary.IsNull(i)) { + EXPECT_EQ(nullptr, decoded[i].ptr); + EXPECT_EQ(0u, decoded[i].len); + continue; + } + 
auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } +} + } // namespace parquet::test diff --git a/cpp/src/parquet/fsst_compat.h b/cpp/src/parquet/fsst_compat.h new file mode 100644 index 00000000000..1c3ef3fae0b --- /dev/null +++ b/cpp/src/parquet/fsst_compat.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +// Provide minimal compatibility shims so third-party FSST sources +// can be compiled with the compilers Arrow supports. + +#if defined(_WIN32) && !defined(_MSC_VER) +# include + +// MinGW does not provide __cpuidex, but FSST only needs the CPUID +// leaf/sub-leaf variant that __cpuid_count implements. 
+static inline void arrow_fsst_cpuidex(int info[4], int function_id, int subfunction_id) { + __cpuid_count(function_id, subfunction_id, info[0], info[1], info[2], info[3]); +} +# define __cpuidex(info, function_id, subfunction_id) \ + arrow_fsst_cpuidex(info, function_id, subfunction_id) +#endif diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index e3cc5adb964..b94868ec378 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -629,6 +629,13 @@ enum Encoding { Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. */ BYTE_STREAM_SPLIT = 9; + + /** Fast Static Symbol Table (FSST) encoding for BYTE_ARRAY data. + FSST compresses strings using a symbol table that maps 1-8 byte sequences + to single-byte codes. It allows random access to compressed data and works + well with dictionary encoding by compressing dictionary values. + */ + FSST = 10; } /** diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 7109c3d1c83..7e5324814b0 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -259,6 +259,8 @@ std::string EncodingToString(Encoding::type t) { return "RLE_DICTIONARY"; case Encoding::BYTE_STREAM_SPLIT: return "BYTE_STREAM_SPLIT"; + case Encoding::FSST: + return "FSST"; default: return "UNKNOWN"; } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index c2040e555fd..0113ec86aa1 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -531,8 +531,9 @@ struct Encoding { DELTA_BYTE_ARRAY = 7, RLE_DICTIONARY = 8, BYTE_STREAM_SPLIT = 9, + FSST = 10, // Should always be last element (except UNKNOWN) - UNDEFINED = 10, + UNDEFINED = 11, UNKNOWN = 999 }; }; diff --git a/cpp/thirdparty/fsst/LICENSE b/cpp/thirdparty/fsst/LICENSE new file mode 100644 index 00000000000..0cd2c8c4652 --- /dev/null +++ b/cpp/thirdparty/fsst/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018-2020 CWI, TU Munich, FSU Jena + +Permission is hereby granted, free of 
charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/cpp/thirdparty/fsst/fsst.h b/cpp/thirdparty/fsst/fsst.h new file mode 100644 index 00000000000..71085d57201 --- /dev/null +++ b/cpp/thirdparty/fsst/fsst.h @@ -0,0 +1,227 @@ +/* + * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 + * + * =================================================================================================================================== + * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): + * + * Copyright 2018-2020, CWI, TU Munich, FSU Jena + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished 
to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst + * =================================================================================================================================== + * + * FSST: Fast Static Symbol Table compression + * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf + * + * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. + * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual + * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is + * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. + * + * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) + * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression + * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. 
These shorter byte-sequences could + * be seen as strings again and fit in whatever your program is that manipulates strings. + * + * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. + * + * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of + * unsigned char* pointers to their starts. A seperate length array (of unsigned int) denotes how many bytes each string consists of. + * + * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program + * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings). + * + * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are + * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length. + */ +#ifndef FSST_INCLUDED_H +#define FSST_INCLUDED_H + +#ifdef _MSC_VER +#define __restrict__ +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#define __ORDER_LITTLE_ENDIAN__ 2 +#include +static inline int __builtin_ctzl(unsigned long long x) { + unsigned long ret; + _BitScanForward64(&ret, x); + return (int)ret; +} +#endif + +#ifdef __cplusplus +#define FSST_FALLTHROUGH [[fallthrough]] +#include +extern "C" { +#else +#define FSST_FALLTHROUGH +#endif + +#include + +/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */ +#define FSST_ESC 255 + +/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. 
*/ +typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */ + +/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */ +typedef struct { + unsigned long long version; /* version id */ + unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */ + unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). */ + unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */ +} fsst_decoder_t; + +/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */ +fsst_encoder_t* +fsst_create( + size_t n, /* IN: number of strings in batch to sample from. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: string start pointers. */ + int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */ +); + +/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */ +fsst_encoder_t* +fsst_duplicate( + fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */ +); + +#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */ + +/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */ +unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */ +fsst_export( + fsst_encoder_t *encoder, /* IN: the symbol table to dump. */ + unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */ +); + +/* Deallocate encoder. 
*/ +void +fsst_destroy(fsst_encoder_t*); + +/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */ +unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */ +fsst_import( + fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */ + unsigned char const *buf /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */ +); + +/* Return a decoder structure from an encoder. */ +fsst_decoder_t +fsst_decoder( + fsst_encoder_t *encoder +); + +/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */ +/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */ +size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */ +fsst_compress( + fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */ + size_t nstrings, /* IN: number of strings in batch to compress. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: input string start pointers. */ + size_t outsize, /* IN: byte-length of output buffer. */ + unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */ + size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */ + unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */ +); + +/* Decompress a single string, inlined for speed. */ +inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */ +fsst_decompress( + const fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */ + size_t lenIn, /* IN: byte-length of compressed string. */ + const unsigned char *strIn, /* IN: compressed string. */ + size_t size, /* IN: byte-length of output buffer. 
*/ + unsigned char *output /* OUT: memory buffer to put the decompressed string in. */ +) { + unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len; + unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output; + unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol; + size_t code, posOut = 0, posIn = 0; +#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */ +#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long)) +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + while (posOut+32 <= size && posIn+4 <= lenIn) { + unsigned int nextBlock, escapeMask; + memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int)); + escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); + if (escapeMask == 0) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; + switch(firstEscapePos) { /* Duff's device */ + case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ + } + } + } + if (posOut+32 <= 
size) { // handle the possibly 3 last bytes without a loop + if (posIn+2 <= lenIn) { + strOut[posOut] = strIn[posIn+1]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + posIn += 2; strOut[posOut++] = strIn[posIn-1]; + } + } else { + posIn += 2; posOut++; + } + } + if (posIn < lenIn) { // last code cannot be an escape + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } + } +#else + while (posOut+8 <= size && posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */ + FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */ + posOut += len[code]; + } else { + strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */ + posIn++; posOut++; + } +#endif +#endif + while (posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { + size_t posWrite = posOut, endWrite = posOut + len[code]; + unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite; + if ((posOut = endWrite) > size) endWrite = size; + for(; posWrite < endWrite; posWrite++) /* only write if there is room */ + strOut[posWrite] = symbolPointer[posWrite]; + } else { + if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */ + posIn++; posOut++; + } + if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0; + return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */ +} + +#ifdef __cplusplus +} +#endif +#endif /* FSST_INCLUDED_H */ diff --git a/cpp/thirdparty/fsst/fsst_avx512.cpp b/cpp/thirdparty/fsst/fsst_avx512.cpp new file mode 100644 index 00000000000..150683d2f9a --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512.cpp @@ -0,0 +1,150 @@ +// this software is distributed 
under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include "libfsst.hpp" + +#if defined(__x86_64__) || defined(_M_X64) +#include + +#ifdef _WIN32 +namespace libfsst { +bool fsst_hasAVX512() { + int info[4]; + __cpuidex(info, 0x00000007, 0); + return (info[1]>>16)&1; +} +} // namespace libfsst +#else +#include +namespace libfsst { +bool fsst_hasAVX512() { + int info[4]; + __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]); + return (info[1]>>16)&1; +} +} // namespace libfsst +#endif +#else +namespace libfsst { +bool fsst_hasAVX512() { return false; } +} // namespace libfsst +#endif + +namespace libfsst { + +// BULK COMPRESSION OF STRINGS +// +// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. 
+// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up. +// +// In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4 +// unroll3 performs best on my hardware +// +// In the worst case, each final encoded string occupies 512KB bytes (512*1024; with 1024=512xexception, exception = 2 bytes). +// - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs). +// +// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits) +// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order. +// +// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings. +// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process. +// - so 'processed' is the amount of strings we started processing and it is between [480,512]. +// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished. +// - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method. +// +// Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings. +// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings. +// +// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to. +// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch). 
+// +// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS +// in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_* expressions however, +// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them. +// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. +// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. + +size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { + size_t processed = 0; + // define some constants (all_x means that all 8 lanes contain 64-bits value X) +#ifdef __AVX512F__ + //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c + __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); + __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); + __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); +#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) +#define all_ONE _mm512_srli_epi64(all_MASK, 63) +#define all_M19 _mm512_srli_epi64(all_MASK, 45) +#define all_M18 _mm512_srli_epi64(all_MASK, 46) +#define all_M28 _mm512_srli_epi64(all_MASK, 36) +#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) +#define all_FFFF _mm512_srli_epi64(all_MASK, 48) +#define all_FF _mm512_srli_epi64(all_MASK, 56) + + SIMDjob *inputEnd = input+n; + assert(n >= unroll*8 && n <= 512); // should be close to 512 + __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 + __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 
2b loaded new strings bitmask per unroll + u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll + + if (unroll >= 4) { + while (input+delta1+delta2+delta3+delta4 < inputEnd) { + #include "fsst_avx512_unroll4.inc" + } + } else if (unroll == 3) { + while (input+delta1+delta2+delta3 < inputEnd) { + #include "fsst_avx512_unroll3.inc" + } + } else if (unroll == 2) { + while (input+delta1+delta2 < inputEnd) { + #include "fsst_avx512_unroll2.inc" + } + } else { + while (input+delta1 < inputEnd) { + #include "fsst_avx512_unroll1.inc" + } + } + + // flush the job states of the unfinished strings at the end of output[] + processed = n - (inputEnd - input); + u32 unfinished = 0; + if (unroll > 1) { + if (unroll > 2) { + if (unroll > 3) { + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); + unfinished += _mm_popcnt_u32((int) loadmask4); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); + unfinished += _mm_popcnt_u32((int) loadmask3); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); + unfinished += _mm_popcnt_u32((int) loadmask2); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); +#else + (void) symbolTable; + (void) codeBase; + (void) symbolBase; + (void) input; + (void) output; + (void) n; + (void) unroll; +#endif + return processed; +} +} // namespace libfsst + diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc new file mode 100644 index 00000000000..f4b81c7970d --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc new file mode 100644 index 00000000000..aa33cd7e69c --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc @@ -0,0 +1,114 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) 
loadmask2); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc new file mode 100644 index 00000000000..e2057032abd --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc @@ -0,0 +1,171 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the 
Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
+ __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). 
+ __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
+ write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, 
i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job 
state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc new file mode 100644 index 00000000000..15cca7c938b --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc @@ -0,0 +1,228 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// 
(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E4PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask4=11111111, delta4=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + job4 = _mm512_mask_expandloadu_epi64(job4, loadmask4, input); input += delta4; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + __m512i word4 = _mm512_i64gather_epi64(_mm512_srli_epi64(job4, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. 
It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code4: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code4 = _mm512_i64gather_epi64(_mm512_and_epi64(word4, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + __m512i pos4 = _mm512_mullo_epi64(_mm512_and_epi64(word4, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + // hash them into a random number: pos4 = pos4*PRIME; pos4 ^= pos4>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + pos4 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos4,_mm512_srli_epi64(pos4,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). 
+ // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write4 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + __m512i write4 = _mm512_slli_epi64(_mm512_and_epi64(word4, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl4 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + pos4 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl4, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + __mmask8 match4 = _mm512_cmpeq_epi64_mask(symb4, _mm512_and_epi64(word4, pos4)) & _mm512_cmplt_epi64_mask(icl4, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. 
+ code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + code4 = _mm512_mask_mov_epi64(code4, match4, _mm512_srli_epi64(icl4, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + write4 = _mm512_or_epi64(write4, _mm512_and_epi64(code4, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + code4 = _mm512_and_epi64(code4, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job4, all_M19), write4, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job4.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + job4 = _mm512_add_epi64(job4, _mm512_slli_epi64(_mm512_srli_epi64(code4, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job4.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = 
_mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + job4 = _mm512_add_epi64(job4, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code4, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + // test which lanes are done now (job4.cur==job4.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job4 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + loadmask4 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job4, 46), _mm512_and_epi64(_mm512_srli_epi64(job4, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + // calculate the amount of lanes in job4 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + delta4 = _mm_popcnt_u32((int) loadmask4); + // write out the job state for the lanes that are done (we 
need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job4.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; + _mm512_mask_compressstoreu_epi64(output, loadmask4, job4); output += delta4; diff --git a/cpp/thirdparty/fsst/libfsst.cpp b/cpp/thirdparty/fsst/libfsst.cpp new file mode 100644 index 00000000000..e3ba787b959 --- /dev/null +++ b/cpp/thirdparty/fsst/libfsst.cpp @@ -0,0 +1,651 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include "libfsst.hpp" + +namespace libfsst { +Symbol concat(Symbol a, Symbol b) { + Symbol s; + u32 length = a.length()+b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.store_num((b.load_num() << (8*a.length())) | a.load_num()); + return s; +} +} // namespace libfsst + +namespace std { +template <> +class hash { + public: + size_t operator()(const libfsst::QSymbol& q) const { + uint64_t k = q.symbol.load_num(); + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + uint64_t h = 0x8445d61a4e774912 ^ (8*m); + k *= m; + k ^= k >> r; + k *= m; + h ^= k; + h *= m; + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; + } +}; +} + +namespace libfsst { +bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } + +std::ostream& operator<<(std::ostream& out, const Symbol& s) { + for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { + SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); + int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) + size_t sampleFrac = 128; + + // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator + st->zeroTerminated = zeroTerminated; + if (zeroTerminated) { + st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency + } else { + u16 byteHisto[256]; + memset(byteHisto, 0, sizeof(byteHisto)); + for(size_t i=0; iterminator = 256; + while(i-- > 0) { + if (byteHisto[i] > minSize) continue; + st->terminator = i; + minSize = byteHisto[i]; + } + } + assert(st->terminator != 256); + + // a random number between 0 and 128 + auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; + + // compress sample, and compute (pair-)frequencies + auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain + int gain = 0; + + for(size_t i=0; i sampleFrac) continue; + } + if (cur < end) { + u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); + cur += st->symbols[code1].length(); + gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); + while (true) { + // count single symbol (i.e. an option is not extending it) + counters.count1Inc(code1); + + // as an alternative, consider just using the next byte.. + if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly + counters.count1Inc(*start); + + if (cur==end) { + break; + } + + // now match a new symbol + start = cur; + if (curhashTabSize-1); + Symbol s = st->hashTab[idx]; + code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) { + code2 = s.code(); + cur += s.length(); + } else if (code2 >= FSST_CODE_BASE) { + cur += 2; + } else { + code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; + cur += 1; + } + } else { + code2 = st->findLongestSymbol(cur, end); + cur += st->symbols[code2].length(); + } + + // compute compressed output size + gain += ((int) (cur-start))-(1+isEscapeCode(code2)); + + if (sampleFrac < 128) { // no need to count pairs in final round + // consider the symbol that is the concatenation of the two last symbols + counters.count2Inc(code1, code2); + + // as an alternative, consider just extending with the next byte.. + if ((cur-start) > 1) // ..but do not count single byte extensions doubly + counters.count2Inc(code1, *start); + } + code1 = code2; + } + } + } + return gain; + }; + + auto makeTable = [&](SymbolTable *st, Counters &counters) { + // hashmap of c (needed because we can generate duplicate candidates) + unordered_set cands; + + // artificially make terminater the most frequent symbol so it gets included + u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; + counters.count1Set(terminator,65535); + + auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { + if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! 
+ QSymbol q; + q.symbol = s; + q.gain = count * s.length(); + auto it = cands.find(q); + if (it != cands.end()) { + q.gain += (*it).gain; + cands.erase(*it); + } + cands.insert(q); + }; + + // add candidate symbols based on counted frequency + for (u32 pos1=0; pos1nSymbols; pos1++) { + u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! + if (!cnt1) continue; + + // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed + Symbol s1 = st->symbols[pos1]; + addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); + + if (sampleFrac >= 128 || // last round we do not create new (combined) symbols + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte + continue; + } + for (u32 pos2=0; pos2nSymbols; pos2++) { + u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! + if (!cnt2) continue; + + // create a new symbol + Symbol s2 = st->symbols[pos2]; + Symbol s3 = concat(s1, s2); + if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + addOrInc(cands, s3, cnt2); + } + } + + // insert candidates into priority queue (by gain) + auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); }; + priority_queue,decltype(cmpGn)> pq(cmpGn); + for (auto& q : cands) + pq.push(q); + + // Create new symbol map using best candidates + st->clear(); + while (st->nSymbols < 255 && !pq.empty()) { + QSymbol q = pq.top(); + pq.pop(); + st->add(q.symbol); + } + }; + + u8 bestCounters[512*sizeof(u16)]; +#ifdef NONOPT_FSST + for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { + sampleFrac = frac; +#else + for(sampleFrac=8; true; sampleFrac += 30) { +#endif + memset(&counters, 0, sizeof(Counters)); + long gain = compressCount(st, counters); + if (gain >= bestGain) 
{ // a new best solution! + counters.backup1(bestCounters); + *bestTable = *st; bestGain = gain; + } + if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) + makeTable(st, counters); + } + delete st; + counters.restore1(bestCounters); + makeTable(bestTable, counters); + bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression + return bestTable; +} + +#ifndef NONOPT_FSST +static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { + size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; + u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings + SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer + SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) + size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) + + while (curLine < nlines && outOff <= (1<<19)) { + size_t prevLine = curLine, chunk, curOff = 0; + + // bail out if the output buffer cannot hold the compressed next string fully + if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 + else budget -= (len[curLine]-curOff)*2; + + strOut[curLine] = (u8*) 0; + lenOut[curLine] = 0; + + do { + do { + chunk = len[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // large strings need to be chopped up into segments of 511 bytes + } + // create a job in this batch + SIMDjob job; + job.cur = inOff; + job.end = job.cur + chunk; + job.pos = batchPos; + job.out = outOff; + + // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) + outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) 
bytes. + if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while(curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines) || (((len[curLine])*2 + 7) > budget)) { // cannot accumulate more? + if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + 
*out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; cur += 2; + } else { + // 1 byte code or miss. + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse + cur++; + } + } + } + }; + + for(curLine=0; curLine 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + } + if ((2*chunk+7) > (size_t) (lim-out)) { + return curLine; // out of memory + } + // copy the string to the 511-byte buffer + memcpy(buf, cur, chunk); + buf[chunk] = (u8) symbolTable.terminator; + cur = buf; + end = cur + chunk; + + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true,false); + } else if (avoidBranch) { + compressVariant(false,true); + } else { + compressVariant(false, false); + } + } while((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; +} + +#define FSST_SAMPLELINE ((size_t) 512) + +// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes +vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { + size_t totSize = 0; + const size_t *lenIn = *lenRef; + vector sample; + + for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t*) encoder; +} + +/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol 
table */ +extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t*) e; +} + +// export a symbol table in compact format. +extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder*) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. + // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. 
+ // + // The version field is now there just for future-proofness, but not used yet + + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction + u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 + (((u64) e->symbolTable->suffixLim) << 24) | + (((u64) e->symbolTable->terminator) << 16) | + (((u64) e->symbolTable->nSymbols) << 8) | + FSST_ENDIAN_MARKER; // least significant byte is nonzero + + version = swap64_if_be(version); // ensure version is little-endian encoded + + /* do not assume unaligned reads here */ + memcpy(buf, &version, 8); + buf[8] = e->symbolTable->zeroTerminated; + for(u32 i=0; i<8; i++) + buf[9+i] = (u8) e->symbolTable->lenHisto[i]; + u32 pos = 17; + + // emit only the used bytes of the symbols + for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) + for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) + buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes + + return pos; // length of what was serialized +} + +#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ + +extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 const *buf) { + u64 version = 0; + u32 code, pos = 17; + u8 lenHisto[8]; + + // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) + memcpy(&version, buf, 8); + version = swap64_if_be(version); // version is always little-endian encoded + + if ((version>>32) != FSST_VERSION) return 0; + decoder->zeroTerminated = buf[8]&1; + memcpy(lenHisto, buf+9, 8); + + // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) + decoder->len[0] = 1; + decoder->symbol[0] = 0; + + // we use lenHisto[0] as 1-byte symbol run length (at the end) + code = decoder->zeroTerminated; + if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end 
+ + // now get all symbols from the buffer + for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ + for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { + decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ + decoder->symbol[code] = 0; + for(u32 j=0; jlen[code]; j++) + ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols + } + } + if (decoder->zeroTerminated) lenHisto[0]++; + + // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). + while(code<255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; +} + +// runtime check for simd +inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { +#ifndef NONOPT_FSST + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +#endif + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); +} +size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} + +// adaptive choosing of scalar compression method based on symbol length histogram +inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if 
((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} +size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +} +} // namespace libfsst + +using namespace libfsst; +// the main compression function (everything automatic) +extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { + // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) + size_t totLen = accumulate(lenIn, lenIn+nlines, 0); + int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); + return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); +} + +/* deallocate encoder */ +extern "C" void fsst_destroy(fsst_encoder_t* encoder) { + Encoder *e = (Encoder*) encoder; + delete e; +} + +/* very lazy implementation relying on export and import */ +extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { + u8 buf[sizeof(fsst_decoder_t)]; + u32 cnt1 = fsst_export(encoder, buf); + fsst_decoder_t decoder; + u32 cnt2 = fsst_import(&decoder, buf); + assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; + return decoder; +} diff --git a/cpp/thirdparty/fsst/libfsst.hpp b/cpp/thirdparty/fsst/libfsst.hpp new file mode 100644 index 00000000000..f61bc0175b6 --- /dev/null +++ b/cpp/thirdparty/fsst/libfsst.hpp @@ -0,0 +1,471 @@ +// this software is distributed under the 
MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "fsst.h" // the official FSST API -- also usable by C mortals + +/* unsigned integers */ +namespace libfsst { +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +} // namespace libfsst + +#if UINTPTR_MAX == 0xffffffffU +// We're on a 32-bit platform +#define NONOPT_FSST +#endif + +#define FSST_ENDIAN_MARKER ((u64) 1) +#define FSST_VERSION_20190218 20190218 +#define FSST_VERSION ((u64) FSST_VERSION_20190218) + +// "symbols" are character sequences (up to 8 bytes) +// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: +// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. + +// we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length +#define FSST_LEN_BITS 12 +#define FSST_CODE_BITS 9 +#define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ +#define FSST_CODE_MAX (1UL<=8) { + len = 8; + memcpy(val.str, input, 8); + } else { + memcpy(val.str, input, len); + } + set_code_len(FSST_CODE_MAX, len); + } + void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + + u64 load_num() const { return swap64_if_be(val.num); } + void store_num(u64 v) { val.num = swap64_if_be(v); } + + u32 length() const { return (u32) (icl >> 28); } + u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } + u32 ignoredBits() const { return (u32) icl; } + + u8 first() const { assert( length() >= 1); return 0xFF & load_num(); } + u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } + +#define FSST_HASH_LOG2SIZE 10 +#define FSST_HASH_PRIME 2971215073LL +#define FSST_SHIFT 15 +#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) + size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes +}; + +// Symbol that can be put in a queue, ordered on gain +struct QSymbol{ + Symbol symbol; + mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols + bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } +}; + +// we construct FSST symbol tables using a random sample of about 16KB (1<<14) +#define FSST_SAMPLETARGET (1<<14) +#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) + +// two phases of compression, before and after optimize(): +// +// (1) to encode values we probe (and maintain) three datastructures: +// - u16 byteCodes[256] array at the position of the next byte (s.length==1) +// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) +// - Symbol 
hashtable[1024] (keyed by the next three bytes, ie for s.length>2), +// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are +// pseudo codes representing a single byte these will become escapes) +// +// (2) when we finished looking for the best symbol table we call optimize() to reshape it: +// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) +// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes +// (allows shortcut during compression) +// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding +// to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one +// byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] +// and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction +// is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was +// hence added to make symbolTable construction faster. 
+// +// this final layout allows for the fastest compression code, only currently present in compressBulk + +// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 +#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket + +// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key +// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster +// +// the gain field is only used in the symbol queue that sorts symbols on gain + +struct SymbolTable { + static const u32 hashTabSize = 1<> (u8) s.icl)); + return true; + } + bool add(Symbol s) { + assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); + u32 len = s.length(); + s.set_code_len(FSST_CODE_BASE + nSymbols, len); + if (len == 1) { + byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + } + if (s.length() >= 2) { + u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; + if (code >= FSST_CODE_BASE) return code; + } + return byteCodes[s.first()] & FSST_CODE_MASK; + } + u16 findLongestSymbol(const u8* cur, const u8* end) const { + return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol + } + + // rationale for finalize: + // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() + // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() + // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) + // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher 
compression perf. + // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). + // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] + // Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte + // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. + // + // In all, we change the layout and coding, as follows.. + // + // before finalize(): + // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 + // - The first 256 codes are pseudo symbols (all escaped bytes) + // + // after finalize(): + // - table layout is symbols[0..nSymbols>, with nSymbols < 256. + // - Real codes are [0,nSymbols>. 8-th bit not set. + // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 + // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) + // the two-byte codes are split in two sections: + // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression + // + // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
+ // + void finalize(u8 zeroTerminated) { + assert(nSymbols <= 255); + u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); + + // compute running sum of code lengths (starting offsets for each length) + rsum[0] = byteLim; // 1-byte codes are highest + rsum[1] = zeroTerminated; + for(u32 i=1; i<7; i++) + rsum[i+1] = rsum[i] + lenHisto[i]; + + // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim) + suffixLim = rsum[1]; + symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) + + for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s + opt = 0; + } + newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim + } else + newCode[i] = rsum[len-1]++; + s1.set_code_len(newCode[i],len); + symbols[newCode[i]] = s1; + } + // renumber the codes in byteCodes[] + for(u32 i=0; i<256; i++) + if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); + else + byteCodes[i] = 511 + (1 << FSST_LEN_BITS); + + // renumber the codes in shortCodes[] + for(u32 i=0; i<65536; i++) + if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); + else + shortCodes[i] = byteCodes[i&0xFF]; + + // replace the symbols in the hash table + for(u32 i=0; i>8; + } + void count1Inc(u32 pos1) { + if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) + count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... + } + void count2Inc(u32 pos1, u32 pos2) { + if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). 
This means (high > 0) <=> (cnt > 0) + // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively + count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample) + } + u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range + // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros + u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7] + + u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes + high = (high >> (zero << 3)) & 255; // advance to nonzero counter + if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 + return 0; // all zero + + u32 low = count1Low[pos1]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range + // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros + u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15] + high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters + + u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters + high = (high >> (zero << 2)) & 15; // advance to nonzero counter + if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! 
advance pos2 + return 0UL; // all zero + + u32 low = count2Low[pos1][pos2]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + void backup1(u8 *buf) { + memcpy(buf, count1High, FSST_CODE_MAX); + memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX); + } + void restore1(u8 *buf) { + memcpy(count1High, buf, FSST_CODE_MAX); + memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); + } +}; +#endif + + +#define FSST_BUFSZ (3<<19) // 768KB + +// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression +struct Encoder { + shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) + union { + Counters counters; // for counting symbol occurences during map construction + u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) + }; +}; + +// job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) +struct SIMDjob { + u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) +}; + +extern bool +fsst_hasAVX512(); // runtime check for avx512 capability + +extern size_t +fsst_compressAVX512( + SymbolTable &symbolTable, + u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) + u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) + SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. + SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer. 
+ size_t n, // IN: size of arrays input and output (should be max 512) + size_t unroll); // IN: degree of SIMD unrolling + +// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) +size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); +size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); +} // namespace libfsst diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 7ba1f4f876b..ad84ffc98ba 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -66,6 +66,8 @@ ARROW_CARES_BUILD_VERSION=1.17.2 ARROW_CARES_BUILD_SHA256_CHECKSUM=4803c844ce20ce510ef0eb83f8ea41fa24ecaae9d280c468c582d2bb25b3913d ARROW_CRC32C_BUILD_VERSION=1.1.2 ARROW_CRC32C_BUILD_SHA256_CHECKSUM=ac07840513072b7fcebda6e821068aa04889018f24e10e46181068fb214d7e56 +ARROW_FSST_BUILD_VERSION=89f49c580c6388acf3b6ed2a49e1bfde6c05e616 +ARROW_FSST_BUILD_SHA256_CHECKSUM=5921d2b837800e1c886903d18971994587328b3e5d08d32eb8e80c00890c304d ARROW_GBENCHMARK_BUILD_VERSION=v1.8.3 ARROW_GBENCHMARK_BUILD_SHA256_CHECKSUM=6bc180a57d23d4d9515519f92b0c83d61b05b5bab188961f36ac7b06b0d9e9ce ARROW_GFLAGS_BUILD_VERSION=v2.2.2 @@ -144,6 +146,7 @@ DEPENDENCIES=( "ARROW_BZIP2_URL bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" "ARROW_CARES_URL cares-${ARROW_CARES_BUILD_VERSION}.tar.gz https://github.com/c-ares/c-ares/releases/download/cares-${ARROW_CARES_BUILD_VERSION//./_}/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" "ARROW_CRC32C_URL crc32c-${ARROW_CRC32C_BUILD_VERSION}.tar.gz https://github.com/google/crc32c/archive/refs/tags/${ARROW_CRC32C_BUILD_VERSION}.tar.gz" + "ARROW_FSST_URL fsst-${ARROW_FSST_BUILD_VERSION}.tar.gz 
https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz" "ARROW_GBENCHMARK_URL gbenchmark-${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz https://github.com/google/benchmark/archive/${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz" "ARROW_GFLAGS_URL gflags-${ARROW_GFLAGS_BUILD_VERSION}.tar.gz https://github.com/gflags/gflags/archive/${ARROW_GFLAGS_BUILD_VERSION}.tar.gz" "ARROW_GLOG_URL glog-${ARROW_GLOG_BUILD_VERSION}.tar.gz https://github.com/google/glog/archive/${ARROW_GLOG_BUILD_VERSION}.tar.gz" diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 5212dc47e0b..0544ab3b1f0 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -31,6 +31,7 @@ cpp/src/generated/substrait/* cpp/thirdparty/flatbuffers/include/flatbuffers/base.h cpp/thirdparty/flatbuffers/include/flatbuffers/flatbuffers.h cpp/thirdparty/flatbuffers/include/flatbuffers/stl_emulation.h +cpp/thirdparty/fsst/* dev/requirements*.txt dev/archery/MANIFEST.in dev/archery/requirements*.txt diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 14cd3e363a4..86f9f27305a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1487,6 +1487,7 @@ cdef encoding_name_from_enum(ParquetEncoding encoding_): ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY', ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY', ParquetEncoding_BYTE_STREAM_SPLIT: 'BYTE_STREAM_SPLIT', + ParquetEncoding_FSST: 'FSST', }.get(encoding_, 'UNKNOWN') @@ -1499,6 +1500,7 @@ cdef encoding_enum_from_name(str encoding_name): 'DELTA_BINARY_PACKED': ParquetEncoding_DELTA_BINARY_PACKED, 'DELTA_LENGTH_BYTE_ARRAY': ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY, 'DELTA_BYTE_ARRAY': ParquetEncoding_DELTA_BYTE_ARRAY, + 'FSST': ParquetEncoding_FSST, 'RLE_DICTIONARY': 'dict', 'PLAIN_DICTIONARY': 'dict', }.get(encoding_name, None) diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd index 42d48ba050f..3663b8bcf62 
100644 --- a/python/pyarrow/includes/libparquet.pxd +++ b/python/pyarrow/includes/libparquet.pxd @@ -130,6 +130,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY" ParquetEncoding_BYTE_STREAM_SPLIT \ " parquet::Encoding::BYTE_STREAM_SPLIT" + ParquetEncoding_FSST" parquet::Encoding::FSST" enum ParquetCompression" parquet::Compression::type": ParquetCompression_UNCOMPRESSED" parquet::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 24cb586c82b..5fb3cc5b699 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -821,7 +821,7 @@ def _sanitize_table(table, new_schema, flavor): Can only be used when ``use_dictionary`` is set to False, and cannot be used in combination with ``use_byte_stream_split``. Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY', 'FSST'}. Certain encodings are only compatible with certain data types. Please refer to the encodings section of `Reading and writing Parquet files `_.