From 7274e4b7532bacf94da255669f69dc851331e6ce Mon Sep 17 00:00:00 2001 From: arnavb Date: Sun, 23 Nov 2025 14:05:27 +0000 Subject: [PATCH 01/16] Add FSST support --- cpp/src/parquet/CMakeLists.txt | 2 + cpp/src/parquet/column_reader.cc | 3 +- cpp/src/parquet/decoder.cc | 262 +++++++ cpp/src/parquet/encoder.cc | 180 ++++- cpp/src/parquet/encoding_test.cc | 310 +++++++++ cpp/src/parquet/parquet.thrift | 7 + cpp/src/parquet/thirdparty/fsst/fsst.cpp | 200 ++++++ cpp/src/parquet/thirdparty/fsst/fsst.h | 227 ++++++ .../parquet/thirdparty/fsst/fsst_avx512.cpp | 149 ++++ .../parquet/thirdparty/fsst/fsst_avx512.inc | 57 ++ .../thirdparty/fsst/fsst_avx512_unroll1.inc | 57 ++ .../thirdparty/fsst/fsst_avx512_unroll2.inc | 114 +++ .../thirdparty/fsst/fsst_avx512_unroll3.inc | 171 +++++ .../thirdparty/fsst/fsst_avx512_unroll4.inc | 228 ++++++ cpp/src/parquet/thirdparty/fsst/libfsst.cpp | 651 ++++++++++++++++++ cpp/src/parquet/thirdparty/fsst/libfsst.hpp | 471 +++++++++++++ cpp/src/parquet/types.cc | 2 + cpp/src/parquet/types.h | 3 +- python/pyarrow/_parquet.pyx | 2 + python/pyarrow/includes/libparquet.pxd | 1 + python/pyarrow/parquet/core.py | 2 +- 21 files changed, 3095 insertions(+), 4 deletions(-) create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst.cpp create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst.h create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc create mode 100644 cpp/src/parquet/thirdparty/fsst/libfsst.cpp create mode 100644 cpp/src/parquet/thirdparty/fsst/libfsst.hpp diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index dc7d40d2a38..e06d13d1884 100644 
--- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -171,6 +171,8 @@ set(PARQUET_SRCS encryption/internal_file_encryptor.cc exception.cc file_reader.cc + thirdparty/fsst/libfsst.cpp + thirdparty/fsst/fsst_avx512.cpp file_writer.cc geospatial/statistics.cc geospatial/util_internal.cc diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 8ecb774022f..59c3b9934a2 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -862,7 +862,8 @@ class ColumnReaderImplBase { case Encoding::RLE: case Encoding::DELTA_BINARY_PACKED: case Encoding::DELTA_BYTE_ARRAY: - case Encoding::DELTA_LENGTH_BYTE_ARRAY: { + case Encoding::DELTA_LENGTH_BYTE_ARRAY: + case Encoding::FSST: { auto decoder = MakeTypedDecoder(encoding, descr_, pool_); current_decoder_ = decoder.get(); decoders_[static_cast(encoding)] = std::move(decoder); diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index d0a857dd22a..77bc3c760ff 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include "arrow/array.h" #include "arrow/array/builder_binary.h" @@ -51,6 +53,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" +#include "parquet/thirdparty/fsst/fsst.h" #include "parquet/types.h" #ifdef _MSC_VER @@ -2307,6 +2310,258 @@ class ByteStreamSplitDecoder : public ByteStreamSplitDecoderBase { + public: + using Base = DecoderImpl; + using Base::num_values_; + + explicit FsstDecoder(const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + : DecoderImpl(descr, Encoding::FSST), pool_(pool) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + const auto header_size = static_cast(sizeof(fsst_decoder_t)); + if (len < header_size) { + throw ParquetException("FSST page 
too small to contain decoder header"); + } + num_values_ = num_values; + memcpy(&decoder_, data, sizeof(fsst_decoder_t)); + next_ = data + header_size; + remaining_bytes_ = len - header_size; + decode_buffer_size_ = 0; + } + + int Decode(ByteArray* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + if (max_values == 0) { + return 0; + } + + const int64_t estimated_output_size = + decode_buffer_size_ + + static_cast(remaining_bytes_) * kMaxDecompressionExpansion; + EnsureDecodeBuffer(estimated_output_size); + + int decoded = 0; + + while (decoded < max_values) { + if (ARROW_PREDICT_FALSE(remaining_bytes_ < static_cast(kLengthPrefixSize))) { + throw ParquetException("FSST data truncated before length prefix"); + } + + const uint32_t compressed_len = SafeLoadAs(next_); + next_ += kLengthPrefixSize; + remaining_bytes_ -= static_cast(kLengthPrefixSize); + + if (ARROW_PREDICT_FALSE(compressed_len > static_cast(remaining_bytes_))) { + throw ParquetException("FSST compressed length exceeds available data"); + } + + const uint8_t* compressed_ptr = next_; + next_ += compressed_len; + remaining_bytes_ -= static_cast(compressed_len); + + uint8_t* value_ptr = nullptr; + const size_t decompressed_len = + DecompressValue(compressed_ptr, compressed_len, &value_ptr); + + buffer[decoded].ptr = value_ptr; + buffer[decoded].len = static_cast(decompressed_len); + decode_buffer_size_ += static_cast(decompressed_len); + ++decoded; + } + + num_values_ -= decoded; + return decoded; + } + + int DecodeSpaced(ByteArray* buffer, int num_values, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + if (null_count == 0) { + return Decode(buffer, num_values); + } + + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode 
", values_to_decode, + " values but decoded ", decoded, " values."); + } + + int value_index = 0; + for (int i = 0; i < num_values; ++i) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + buffer[i] = temp_values_[value_index++]; + } else { + buffer[i].ptr = nullptr; + buffer[i].len = 0; + } + } + + return value_index; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) override { + int values_decoded = 0; + PARQUET_THROW_NOT_OK( + DecodeArrowDense(num_values, null_count, valid_bits, valid_bits_offset, builder, + &values_decoded)); + return values_decoded; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override { + int values_decoded = 0; + PARQUET_THROW_NOT_OK( + DecodeArrowDict(num_values, null_count, valid_bits, valid_bits_offset, builder, + &values_decoded)); + return values_decoded; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_values_decoded) { + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", decoded, " values."); + } + + auto visit_binary_helper = [&](auto* helper) { + auto* values_ptr = temp_values_.data(); + int value_index = 0; + + RETURN_NOT_OK( + VisitBitRuns(valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = values_ptr[value_index++]; + RETURN_NOT_OK(helper->AppendValue( + 
value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(helper->AppendNulls(run_length)); + } + return Status::OK(); + })); + + *out_values_decoded = decoded; + return Status::OK(); + }; + + return DispatchArrowBinaryHelper( + out, num_values, /*estimated_data_length=*/{}, visit_binary_helper); + } + + Status DecodeArrowDict(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder, + int* out_values_decoded) { + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", decoded, " values."); + } + + RETURN_NOT_OK(builder->Reserve(num_values)); + + int value_index = 0; + RETURN_NOT_OK(VisitBitRuns( + valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = temp_values_[value_index++]; + RETURN_NOT_OK(builder->Append(value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(builder->AppendNulls(run_length)); + } + return Status::OK(); + })); + + *out_values_decoded = decoded; + return Status::OK(); + } + + uint8_t* EnsureDecodeBuffer(int64_t capacity) { + const int64_t min_capacity = + std::max(capacity, kInitialDecodeBufferSize); + const int64_t target = ::arrow::bit_util::NextPower2(min_capacity); + + if (!decode_buffer_) { + PARQUET_ASSIGN_OR_THROW( + decode_buffer_, ::arrow::AllocateResizableBuffer(target, pool_)); + } else if (decode_buffer_->size() < target) { + PARQUET_THROW_NOT_OK(decode_buffer_->Resize(target, false)); + } + return decode_buffer_->mutable_data(); + } + + size_t DecompressValue(const uint8_t* compressed_ptr, uint32_t compressed_len, + uint8_t** value_ptr) { 
+ EnsureDecodeBuffer(decode_buffer_size_ + + OutputUpperBound(compressed_len)); + + while (true) { + uint8_t* destination = decode_buffer_->mutable_data() + decode_buffer_size_; + const size_t available = + static_cast(decode_buffer_->size() - decode_buffer_size_); + + const size_t decompressed = + fsst_decompress(&decoder_, compressed_len, compressed_ptr, available, + destination); + + if (decompressed > 0 || compressed_len == 0) { + *value_ptr = destination; + return decompressed; + } + + int64_t new_capacity = std::max( + decode_buffer_->size() * 2, + decode_buffer_size_ + OutputUpperBound(compressed_len)); + if (new_capacity <= decode_buffer_->size()) { + throw ParquetException("FSST decompression failed"); + } + EnsureDecodeBuffer(new_capacity); + } + } + + static int64_t OutputUpperBound(uint32_t compressed_len) { + const int64_t expanded = + static_cast(compressed_len) * kMaxDecompressionExpansion; + return std::max(expanded, kInitialDecodeBufferSize); + } + + static constexpr size_t kLengthPrefixSize = sizeof(uint32_t); + static constexpr int64_t kInitialDecodeBufferSize = 1024; + static constexpr int64_t kMaxDecompressionExpansion = 8; + + fsst_decoder_t decoder_{}; + const uint8_t* next_ = nullptr; + int remaining_bytes_ = 0; + ::arrow::MemoryPool* pool_; + std::shared_ptr<::arrow::ResizableBuffer> decode_buffer_; + int64_t decode_buffer_size_ = 0; + std::vector temp_values_; +}; + } // namespace // ---------------------------------------------------------------------- @@ -2373,6 +2628,13 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin throw ParquetException( "DELTA_BYTE_ARRAY only supports BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"); } + } else if (encoding == Encoding::FSST) { + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique(descr, pool); + default: + throw ParquetException("FSST encoding only supports BYTE_ARRAY"); + } } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) { if (type_num == 
Type::BYTE_ARRAY) { return std::make_unique(descr, pool); diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index f9367555d97..61b1bb9606c 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -18,8 +18,11 @@ #include "parquet/encoding.h" #include +#include #include #include +#include +#include #include #include #include @@ -47,6 +50,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" +#include "parquet/thirdparty/fsst/fsst.h" #include "parquet/types.h" #ifdef _MSC_VER @@ -101,7 +105,7 @@ class EncoderImpl : virtual public Encoder { MemoryPool* memory_pool() const override { return pool_; } int64_t ReportUnencodedDataBytes() override { - if (descr_->physical_type() != Type::BYTE_ARRAY) { + if (descr_ != nullptr && descr_->physical_type() != Type::BYTE_ARRAY) { throw ParquetException("ReportUnencodedDataBytes is only supported for BYTE_ARRAY"); } int64_t bytes = unencoded_byte_array_data_bytes_; @@ -1737,6 +1741,173 @@ std::shared_ptr RleBooleanEncoder::FlushValues() { return buffer; } +// ---------------------------------------------------------------------- + +class FsstEncoder : public EncoderImpl, virtual public TypedEncoder { + public: + explicit FsstEncoder(const ColumnDescriptor* descr, MemoryPool* pool) + : EncoderImpl(descr, Encoding::FSST, pool), + encoder_(nullptr), + unencoded_values_() {} + + ~FsstEncoder() override { + if (encoder_) { + fsst_destroy(encoder_); + } + } + + int64_t EstimatedDataEncodedSize() override { + const int64_t total_size = pending_unencoded_bytes_; + const double scaled = + static_cast(total_size) * compression_ratio_hint_; + const int64_t estimated_payload = + static_cast(std::ceil(scaled)); + return static_cast(sizeof(fsst_decoder_t)) + + std::max(0, estimated_payload); + } + + std::shared_ptr FlushValues() override { + if (unencoded_values_.empty()) { + PARQUET_ASSIGN_OR_THROW(auto buffer, AllocateBuffer(0, pool_)); + return buffer; + } + + 
BuildSymbolTable(); + + const int64_t total_input_size = pending_unencoded_bytes_; + + const int64_t decoder_bytes = static_cast(sizeof(fsst_decoder_t)); + const int64_t length_prefix_bytes = + static_cast(unencoded_values_.size()) * static_cast(sizeof(uint32_t)); + const int64_t estimated_buffer_size = + decoder_bytes + total_input_size * kFsstCompressionExpansion + length_prefix_bytes; + + PARQUET_ASSIGN_OR_THROW(auto output_buffer, + AllocateResizableBuffer(estimated_buffer_size, pool_)); + uint8_t* out_ptr = output_buffer->mutable_data(); + + fsst_decoder_t decoder = fsst_decoder(encoder_); + memcpy(out_ptr, &decoder, sizeof(fsst_decoder_t)); + out_ptr += sizeof(fsst_decoder_t); + + int64_t total_output_size = sizeof(fsst_decoder_t); + std::vector out_lengths(1); + std::vector out_ptrs(1); + + for (size_t i = 0; i < unencoded_values_.size(); ++i) { + const int64_t remaining_capacity = + estimated_buffer_size - total_output_size - sizeof(uint32_t); + if (ARROW_PREDICT_FALSE(remaining_capacity <= 0)) { + throw ParquetException("FSST compression buffer exhausted"); + } + size_t available = static_cast(remaining_capacity); + size_t input_length = static_cast(unencoded_values_[i].len); + const unsigned char* input_ptr = unencoded_values_[i].ptr; + size_t num_compressed = + fsst_compress(encoder_, 1, &input_length, &input_ptr, available, + out_ptr + sizeof(uint32_t), out_lengths.data(), out_ptrs.data()); + + if (num_compressed == 0) { + throw ParquetException("FSST compression buffer too small"); + } + + uint32_t len = static_cast(out_lengths[0]); + memcpy(out_ptr, &len, sizeof(uint32_t)); + out_ptr += sizeof(uint32_t) + out_lengths[0]; + total_output_size += sizeof(uint32_t) + out_lengths[0]; + } + + PARQUET_THROW_NOT_OK(output_buffer->Resize(total_output_size)); + UpdateCompressionStats(total_input_size, total_output_size); + unencoded_values_.clear(); + pending_unencoded_bytes_ = 0; + unencoded_byte_array_data_bytes_ = 0; + return output_buffer; + } + + using 
TypedEncoder::Put; + + void Put(const ByteArray* src, int num_values) override { + for (int i = 0; i < num_values; ++i) { + unencoded_values_.push_back(src[i]); + const int64_t length = static_cast(src[i].len); + unencoded_byte_array_data_bytes_ += length; + pending_unencoded_bytes_ += length; + } + } + + void Put(const ::arrow::Array& values) override { + AssertVarLengthBinary(values); + + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<::arrow::BinaryType>( + *values.data(), + [&](::std::string_view view) { + if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { + return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + } + ByteArray val; + val.len = static_cast(view.size()); + val.ptr = reinterpret_cast(view.data()); + unencoded_values_.push_back(val); + const int64_t length = static_cast(val.len); + unencoded_byte_array_data_bytes_ += length; + pending_unencoded_bytes_ += length; + return Status::OK(); + }, + []() { return Status::OK(); })); + } + + void PutSpaced(const ByteArray* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + if (valid_bits != NULLPTR) { + PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(ByteArray), pool_)); + auto buffer_ptr = reinterpret_cast(buffer->mutable_data()); + int num_valid_values = ::arrow::util::internal::SpacedCompress( + src, num_values, valid_bits, valid_bits_offset, buffer_ptr); + Put(buffer_ptr, num_valid_values); + } else { + Put(src, num_values); + } + } + + private: + void BuildSymbolTable() { + if (unencoded_values_.empty()) { + return; + } + + std::vector input_lengths; + std::vector input_ptrs; + + for (const auto& val : unencoded_values_) { + input_lengths.push_back(val.len); + input_ptrs.push_back(val.ptr); + } + + encoder_ = fsst_create(unencoded_values_.size(), input_lengths.data(), + input_ptrs.data(), 0); + + if (!encoder_) { + throw ParquetException("Failed to create FSST encoder"); + } + } + + void 
UpdateCompressionStats(int64_t input_bytes, int64_t payload_bytes) { + const double ratio = static_cast(std::max(0, payload_bytes)) / + static_cast(std::max(int64_t{1}, input_bytes)); + compression_ratio_hint_ = std::max(kMinimumCompressionRatio, ratio); + } + + static constexpr double kDefaultCompressionRatio = 1.0; + static constexpr double kMinimumCompressionRatio = 0.01; + // Allocate worst-case 4x expansion. + static constexpr int64_t kFsstCompressionExpansion = 4; + fsst_encoder_t* encoder_; + std::vector unencoded_values_; + double compression_ratio_hint_ = kDefaultCompressionRatio; + int64_t pending_unencoded_bytes_ = 0; +}; + } // namespace // ---------------------------------------------------------------------- @@ -1821,6 +1992,13 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin default: throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY"); } + } else if (encoding == Encoding::FSST) { + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique(descr, pool); + default: + throw ParquetException("FSST encoding only supports BYTE_ARRAY"); + } } else if (encoding == Encoding::RLE) { switch (type_num) { case Type::BOOLEAN: diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 66a3f7647fa..ca51d8e1b24 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,9 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_dict.h" #include "arrow/array/concatenate.h" +#include "arrow/buffer.h" #include "arrow/compute/cast.h" +#include "arrow/io/memory.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" @@ -41,10 +44,16 @@ #include "arrow/util/endian.h" #include "arrow/util/span.h" #include "arrow/util/string.h" +#include "parquet/column_page.h" +#include "parquet/column_reader.h" #include 
"parquet/encoding.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/test_util.h" +#include "parquet/thirdparty/fsst/fsst.h" #include "parquet/types.h" using arrow::default_memory_pool; @@ -2593,4 +2602,305 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { } } +// ---------------------------------------------------------------------- + +TEST(TestFsstEncoding, BasicRoundTrip) { + ::arrow::random::RandomArrayGenerator rag(0); + constexpr int64_t kNumValues = 2048; + constexpr int64_t kNumUnique = 128; + constexpr int32_t kMinLength = 4; + constexpr int32_t kMaxLength = 48; + + auto values = rag.BinaryWithRepeats(kNumValues, kNumUnique, kMinLength, kMaxLength, + 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + ASSERT_GT(encoded->size(), 0); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + std::vector decoded(values->length()); + ASSERT_EQ(static_cast(values->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + const auto& binary = checked_cast(*values); + for (int64_t i = 0; i < values->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } +} + +TEST(TestFsstEncoding, FlushEmptyReturnsEmptyBuffer) { + auto encoder = MakeTypedEncoder(Encoding::FSST); + auto buffer = encoder->FlushValues(); + ASSERT_NE(nullptr, buffer); + ASSERT_EQ(0, buffer->size()); +} + +TEST(TestFsstEncoding, ReportUnencodedBytesResetsCounter) { + ::arrow::random::RandomArrayGenerator rag(42); + auto values = rag.String(256, 1, 24, 0.0); + + auto encoder = 
MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + + const auto& binary = checked_cast(*values); + const int64_t expected = binary.total_values_length(); + EXPECT_EQ(expected, encoder->ReportUnencodedDataBytes()); + EXPECT_EQ(0, encoder->ReportUnencodedDataBytes()); + + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + std::vector decoded(values->length()); + ASSERT_EQ(static_cast(values->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + for (int64_t i = 0; i < values->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } + + EXPECT_EQ(0, encoder->ReportUnencodedDataBytes()); +} + +TEST(TestFsstEncoding, MultiPageRoundTrip) { + constexpr int64_t kStringsPerPage = 10000; + constexpr int64_t kNumPages = 2; + constexpr int64_t kTotalValues = kStringsPerPage * kNumPages; + constexpr int64_t kPageSizeBytes = 512 * 1024; + + std::vector page1; + std::vector page2; + page1.reserve(kStringsPerPage); + page2.reserve(kStringsPerPage); + + for (int64_t i = 0; i < kStringsPerPage; ++i) { + page1.push_back("https://www.example.com/user/profile/page" + std::to_string(i) + + "/settings/preferences/advanced/options"); + page2.push_back("ftp://ftp.downloads.org/pub/software/release" + std::to_string(i) + + "/package/installer/binaries/archive"); + } + + std::vector expected; + expected.reserve(kTotalValues); + expected.insert(expected.end(), page1.begin(), page1.end()); + expected.insert(expected.end(), page2.begin(), page2.end()); + + ASSERT_OK_AND_ASSIGN(auto output_stream, ::arrow::io::BufferOutputStream::Create()); + + schema::NodeVector fields; + fields.push_back(schema::PrimitiveNode::Make("url", Repetition::REQUIRED, + 
Type::BYTE_ARRAY, ConvertedType::UTF8)); + auto parquet_schema = std::static_pointer_cast( + schema::GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + auto writer_props = WriterProperties::Builder() + .encoding(Encoding::FSST) + ->data_pagesize(kPageSizeBytes) + ->build(); + + std::unique_ptr writer; + ASSERT_NO_THROW(writer = + ParquetFileWriter::Open(output_stream, parquet_schema, writer_props)); + ASSERT_NE(nullptr, writer); + auto* row_group_writer = writer->AppendRowGroup(); + ASSERT_NE(nullptr, row_group_writer); + auto* column_writer = static_cast*>( + row_group_writer->NextColumn()); + ASSERT_NE(nullptr, column_writer); + + auto write_page = [&](const std::vector& source) { + std::vector batch; + batch.reserve(source.size()); + for (const auto& value : source) { + batch.emplace_back(value); + } + column_writer->WriteBatch(static_cast(batch.size()), nullptr, nullptr, + batch.data()); + }; + + write_page(page1); + write_page(page2); + + column_writer->Close(); + row_group_writer->Close(); + ASSERT_NO_THROW(writer->Close()); + + ASSERT_OK_AND_ASSIGN(auto buffer, output_stream->Finish()); + + auto make_reader = [&buffer]() -> std::unique_ptr { + auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(buffer); + return ParquetFileReader::Open(buffer_reader); + }; + + std::unique_ptr page_reader_file; + ASSERT_NO_THROW(page_reader_file = make_reader()); + ASSERT_NE(nullptr, page_reader_file); + auto page_row_group = page_reader_file->RowGroup(0); + auto page_reader = page_row_group->GetColumnPageReader(0); + + int data_page_count = 0; + while (page_reader->NextPage() != nullptr) { + ++data_page_count; + } + EXPECT_GT(data_page_count, 1); + + std::unique_ptr reader; + ASSERT_NO_THROW(reader = make_reader()); + ASSERT_NE(nullptr, reader); + auto row_group_reader = reader->RowGroup(0); + auto column_reader = + std::static_pointer_cast>(row_group_reader->Column(0)); + + std::vector decoded(kTotalValues); + int64_t values_read = 0; + while 
(values_read < kTotalValues) { + int64_t batch_length = std::min(1024, kTotalValues - values_read); + int64_t batch_read = 0; + column_reader->ReadBatch(batch_length, nullptr, nullptr, + decoded.data() + values_read, &batch_read); + ASSERT_GT(batch_read, 0); + values_read += batch_read; + } + ASSERT_EQ(values_read, kTotalValues); + + for (int64_t i = 0; i < kTotalValues; ++i) { + ASSERT_EQ(expected[i].size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(expected[i].data(), decoded[i].ptr, expected[i].size())) << i; + } +} + +TEST(TestFsstEncoding, MultipleFlushesProduceIndependentPages) { + ::arrow::random::RandomArrayGenerator rag(99); + auto encoder = MakeTypedEncoder(Encoding::FSST); + + const std::vector> batches = { + {64, 3, 12}, {512, 2, 24}, {17, 1, 8}}; + + for (const auto& [size, min_len, max_len] : batches) { + auto batch = rag.String(size, min_len, max_len, 0.0); + + ASSERT_NO_THROW(encoder->Put(*batch)); + auto buffer = encoder->FlushValues(); + ASSERT_NE(nullptr, buffer); + ASSERT_GT(buffer->size(), 0); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(batch->length()), buffer->data(), + static_cast(buffer->size())); + + std::vector decoded(batch->length()); + ASSERT_EQ(static_cast(batch->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + const auto& binary = checked_cast(*batch); + for (int64_t i = 0; i < batch->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } + + encoder->ReportUnencodedDataBytes(); + } +} + +TEST(TestFsstEncoding, DecodeRejectsOverstatedLength) { + ::arrow::random::RandomArrayGenerator rag(123); + auto values = rag.String(64, 3, 32, 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + 
ASSERT_GT(encoded->size(), + static_cast(sizeof(fsst_decoder_t) + sizeof(uint32_t))); + + ASSERT_OK_AND_ASSIGN(auto corrupted, + ::arrow::AllocateBuffer(encoded->size(), default_memory_pool())); + std::memcpy(corrupted->mutable_data(), encoded->data(), encoded->size()); + + auto* length_ptr = + reinterpret_cast(corrupted->mutable_data() + sizeof(fsst_decoder_t)); + *length_ptr = static_cast(encoded->size()); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), corrupted->data(), + static_cast(corrupted->size())); + + std::vector decoded(values->length()); + EXPECT_THROW(decoder->Decode(decoded.data(), static_cast(decoded.size())), + ParquetException); +} + +TEST(TestFsstEncoding, SetDataRejectsShortHeader) { + auto decoder = MakeTypedDecoder(Encoding::FSST); + + const int num_values = 1; + const int header_bytes = static_cast(sizeof(fsst_decoder_t)); + std::vector truncated(static_cast(header_bytes - 1), 0); + + EXPECT_THROW( + decoder->SetData(num_values, truncated.data(), static_cast(truncated.size())), + ParquetException); +} + +TEST(TestFsstEncoding, HeavyNullsDecodeSpaced) { + ::arrow::random::RandomArrayGenerator rag(321); + constexpr int64_t kNumValues = 4096; + constexpr double kNullProb = 0.85; + + auto values = rag.String(kNumValues, 1, 24, kNullProb); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + const auto& binary = checked_cast(*values); + std::vector decoded(values->length()); + std::vector valid_bits(::arrow::bit_util::BytesForBits(values->length()), 0); + + ::arrow::internal::BitmapWriter writer(valid_bits.data(), 0, values->length()); + for (int64_t i = 0; i < values->length(); ++i) { + if (!binary.IsNull(i)) { + writer.Set(); 
+ } else { + writer.Clear(); + } + writer.Next(); + } + writer.Finish(); + + const int null_count = binary.null_count(); + ASSERT_EQ(static_cast(values->length() - null_count), + decoder->DecodeSpaced(decoded.data(), static_cast(values->length()), + null_count, valid_bits.data(), 0)); + + for (int64_t i = 0; i < values->length(); ++i) { + if (binary.IsNull(i)) { + EXPECT_EQ(nullptr, decoded[i].ptr); + EXPECT_EQ(0u, decoded[i].len); + continue; + } + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } +} + } // namespace parquet::test diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index e3cc5adb964..b94868ec378 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -629,6 +629,13 @@ enum Encoding { Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. */ BYTE_STREAM_SPLIT = 9; + + /** Fast Static Symbol Table (FSST) encoding for BYTE_ARRAY data. + FSST compresses strings using a symbol table that maps 1-8 byte sequences + to single-byte codes. It allows random access to compressed data and works + well with dictionary encoding by compressing dictionary values. 
+ */ + FSST = 10; } /** diff --git a/cpp/src/parquet/thirdparty/fsst/fsst.cpp b/cpp/src/parquet/thirdparty/fsst/fsst.cpp new file mode 100644 index 00000000000..c1a9cc34980 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst.cpp @@ -0,0 +1,200 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2019, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#ifdef FSST12 +#include "fsst12.h" // the official FSST API -- also usable by C mortals +#else +#include "fsst.h" // the official FSST API -- also usable by C mortals +#endif +#include +#include +#include +#include +#include +#include +using namespace std; + +// Utility to compress and decompress (-d) data with FSST (using stdin and stdout). 
+// +// The utility has a poor-man's async I/O in that it uses double buffering for input and output, +// and two background pthreads for reading and writing. The idea is to make the CPU overlap with I/O. +// +// The data format is quite simple. A FSST compressed file is a sequence of blocks, each with format: +// (1) 3-byte block length field (max blocksize is hence 16MB). This byte-length includes (1), (2) and (3). +// (2) FSST dictionary as produced by fst_export(). +// (3) the FSST compressed data. +// +// The natural strength of FSST is in fact not block-based compression, but rather the compression and +// *individual* decompression of many small strings separately. Think of compressed databases and (column-store) +// data formats. But, this utility is to serve as an apples-to-apples comparison point with utilities like lz4. + +namespace { + +class BinarySemaphore { + private: + mutex m; + condition_variable cv; + bool value; + + public: + explicit BinarySemaphore(bool initialValue = false) : value(initialValue) {} + void wait() { + unique_lock lock(m); + while (!value) cv.wait(lock); + value = false; + } + void post() { + { unique_lock lock(m); value = true; } + cv.notify_one(); + } +}; + +bool stopThreads = false; +BinarySemaphore srcDoneIO[2], dstDoneIO[2], srcDoneCPU[2], dstDoneCPU[2]; +unsigned char *srcBuf[2] = { NULL, NULL }; +unsigned char *dstBuf[2] = { NULL, NULL }; +unsigned char *dstMem[2] = { NULL, NULL }; +size_t srcLen[2] = { 0, 0 }; +size_t dstLen[2] = { 0, 0 }; + +#define FSST_MEMBUF (1ULL<<22) +int decompress = 0; +size_t blksz = FSST_MEMBUF-(1+FSST_MAXHEADER/2); // block size of compression (max compressed size must fit 3 bytes) + +#define DESERIALIZE(p) (((unsigned long long) (p)[0]) << 16) | (((unsigned long long) (p)[1]) << 8) | ((unsigned long long) (p)[2]) +#define SERIALIZE(l,p) { (p)[0] = ((l)>>16)&255; (p)[1] = ((l)>>8)&255; (p)[2] = (l)&255; } + +void reader(ifstream& src) { + for(int swap=0; true; swap = 1-swap) { + 
srcDoneCPU[swap].wait(); + if (stopThreads) break; + src.read((char*) srcBuf[swap], blksz); + srcLen[swap] = (unsigned long) src.gcount(); + if (decompress) { + if (blksz && srcLen[swap] == blksz) { + blksz = DESERIALIZE(srcBuf[swap]+blksz-3); // read size of next block + srcLen[swap] -= 3; // cut off size bytes + } else { + blksz = 0; + } + } + srcDoneIO[swap].post(); + } +} + +void writer(ofstream& dst) { + for(int swap=0; true; swap = 1-swap) { + dstDoneCPU[swap].wait(); + if (!dstLen[swap]) break; + dst.write((char*) dstBuf[swap], dstLen[swap]); + dstDoneIO[swap].post(); + } + for(int swap=0; swap<2; swap++) + dstDoneIO[swap].post(); +} + +} + +#ifdef FSST_STANDALONE +int main(int argc, char* argv[]) { + size_t srcTot = 0, dstTot = 0; + if (argc < 2 || argc > 4 || (argc == 4 && (argv[1][0] != '-' || argv[1][1] != 'd' || argv[1][2]))) { + cerr << "usage: " << argv[0] << " -d infile outfile" << endl; + cerr << " " << argv[0] << " infile outfile" << endl; + cerr << " " << argv[0] << " infile" << endl; + return -1; + } + decompress = (argc == 4); + string srcfile(argv[1+decompress]), dstfile; + if (argc == 2) { + dstfile = srcfile + ".fsst"; + } else { + dstfile = argv[2+decompress]; + } + ifstream src; + ofstream dst; + src.open(srcfile, ios::binary); + dst.open(dstfile, ios::binary); + dst.exceptions(ios_base::failbit); + dst.exceptions(ios_base::badbit); + src.exceptions(ios_base::badbit); + if (decompress) { + unsigned char tmp[3]; + src.read((char*) tmp, 3); + if (src.gcount() != 3) { + cerr << "failed to open input." 
<< endl;
+ return -1;
+ }
+ blksz = DESERIALIZE(tmp); // read first block size
+ }
+ vector<unsigned char> buffer(FSST_MEMBUF*6);
+ srcBuf[0] = buffer.data();
+ srcBuf[1] = srcBuf[0] + (FSST_MEMBUF*(1ULL+decompress));
+ dstMem[0] = srcBuf[1] + (FSST_MEMBUF*(1ULL+decompress));
+ dstMem[1] = dstMem[0] + (FSST_MEMBUF*(2ULL-decompress));
+
+ for(int swap=0; swap<2; swap++) {
+ srcDoneCPU[swap].post(); // input buffer is not being processed initially
+ dstDoneIO[swap].post(); // output buffer is not being written initially
+ }
+ thread readerThread([&src]{ reader(src); });
+ thread writerThread([&dst]{ writer(dst); });
+
+ for(int swap=0; true; swap = 1-swap) {
+ srcDoneIO[swap].wait(); // wait until input buffer is available (i.e. done reading)
+ dstDoneIO[swap].wait(); // wait until output buffer is ready writing hence free for use
+ if (srcLen[swap] == 0) {
+ dstLen[swap] = 0;
+ break;
+ }
+ if (decompress) {
+ fsst_decoder_t decoder;
+ size_t hdr = fsst_import(&decoder, srcBuf[swap]);
+ dstLen[swap] = fsst_decompress(&decoder, srcLen[swap] - hdr, srcBuf[swap] + hdr, FSST_MEMBUF, dstBuf[swap] = dstMem[swap]);
+ } else {
+ unsigned char tmp[FSST_MAXHEADER];
+ fsst_encoder_t* encoder = fsst_create(1, &srcLen[swap], const_cast<const unsigned char**>(&srcBuf[swap]), 0);
+ size_t hdr = fsst_export(encoder, tmp);
+ if (fsst_compress(encoder, 1, &srcLen[swap], const_cast<const unsigned char**>(&srcBuf[swap]),
+ FSST_MEMBUF * 2, dstMem[swap] + FSST_MAXHEADER + 3,
+ &dstLen[swap], &dstBuf[swap]) < 1)
+ return -1;
+ dstLen[swap] += 3 + hdr;
+ dstBuf[swap] -= 3 + hdr;
+ SERIALIZE(dstLen[swap],dstBuf[swap]); // block starts with size
+ copy(tmp, tmp+hdr, dstBuf[swap]+3); // then the header (followed by the compressed bytes which are already there)
+ fsst_destroy(encoder);
+ }
+ srcTot += srcLen[swap];
+ dstTot += dstLen[swap];
+ srcDoneCPU[swap].post(); // input buffer may be re-used by the reader for the next block
+ dstDoneCPU[swap].post(); // output buffer is ready for writing out
+ }
+ cerr << (decompress?"Dec":"C") << "ompressed " << 
srcTot << " bytes into " << dstTot << " bytes ==> " << (int) ((100*dstTot)/srcTot) << "%" << endl; + + // force wait until all background writes finished + stopThreads = true; + for(int swap=0; swap<2; swap++) { + srcDoneCPU[swap].post(); + dstDoneCPU[swap].post(); + } + dstDoneIO[0].wait(); + dstDoneIO[1].wait(); + readerThread.join(); + writerThread.join(); +} +#endif // FSST_STANDALONE diff --git a/cpp/src/parquet/thirdparty/fsst/fsst.h b/cpp/src/parquet/thirdparty/fsst/fsst.h new file mode 100644 index 00000000000..71085d57201 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst.h @@ -0,0 +1,227 @@ +/* + * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 + * + * =================================================================================================================================== + * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): + * + * Copyright 2018-2020, CWI, TU Munich, FSU Jena + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst + * =================================================================================================================================== + * + * FSST: Fast Static Symbol Table compression + * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf + * + * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. + * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual + * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is + * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. + * + * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) + * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression + * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could + * be seen as strings again and fit in whatever your program is that manipulates strings. + * + * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. + * + * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of + * unsigned char* pointers to their starts. 
A separate length array (of unsigned int) denotes how many bytes each string consists of.
+ *
+ * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program
+ * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings).
+ *
+ * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are
+ * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length.
+ */
+#ifndef FSST_INCLUDED_H
+#define FSST_INCLUDED_H
+
+#ifdef _MSC_VER
+#define __restrict__
+#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#define __ORDER_LITTLE_ENDIAN__ 2
+#include <intrin.h>
+static inline int __builtin_ctzl(unsigned long long x) {
+ unsigned long ret;
+ _BitScanForward64(&ret, x);
+ return (int)ret;
+}
+#endif
+
+#ifdef __cplusplus
+#define FSST_FALLTHROUGH [[fallthrough]]
+#include <cstring>
+extern "C" {
+#else
+#define FSST_FALLTHROUGH
+#endif
+
+#include <stddef.h>
+
+/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */
+#define FSST_ESC 255
+
+/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */
+typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */
+
+/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
+typedef struct {
+ unsigned long long version; /* version id */
+ unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */
+ unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). 
*/ + unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */ +} fsst_decoder_t; + +/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */ +fsst_encoder_t* +fsst_create( + size_t n, /* IN: number of strings in batch to sample from. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: string start pointers. */ + int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */ +); + +/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */ +fsst_encoder_t* +fsst_duplicate( + fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */ +); + +#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */ + +/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */ +unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */ +fsst_export( + fsst_encoder_t *encoder, /* IN: the symbol table to dump. */ + unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */ +); + +/* Deallocate encoder. */ +void +fsst_destroy(fsst_encoder_t*); + +/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */ +unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */ +fsst_import( + fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */ + unsigned char const *buf /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */ +); + +/* Return a decoder structure from an encoder. 
*/ +fsst_decoder_t +fsst_decoder( + fsst_encoder_t *encoder +); + +/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */ +/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */ +size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */ +fsst_compress( + fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */ + size_t nstrings, /* IN: number of strings in batch to compress. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: input string start pointers. */ + size_t outsize, /* IN: byte-length of output buffer. */ + unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */ + size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */ + unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */ +); + +/* Decompress a single string, inlined for speed. */ +inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */ +fsst_decompress( + const fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */ + size_t lenIn, /* IN: byte-length of compressed string. */ + const unsigned char *strIn, /* IN: compressed string. */ + size_t size, /* IN: byte-length of output buffer. */ + unsigned char *output /* OUT: memory buffer to put the decompressed string in. 
*/ +) { + unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len; + unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output; + unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol; + size_t code, posOut = 0, posIn = 0; +#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */ +#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long)) +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + while (posOut+32 <= size && posIn+4 <= lenIn) { + unsigned int nextBlock, escapeMask; + memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int)); + escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); + if (escapeMask == 0) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; + switch(firstEscapePos) { /* Duff's device */ + case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ + } + } + } + if (posOut+32 <= size) { // handle the possibly 3 last bytes without a loop + if (posIn+2 <= lenIn) 
{ + strOut[posOut] = strIn[posIn+1]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + posIn += 2; strOut[posOut++] = strIn[posIn-1]; + } + } else { + posIn += 2; posOut++; + } + } + if (posIn < lenIn) { // last code cannot be an escape + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } + } +#else + while (posOut+8 <= size && posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */ + FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */ + posOut += len[code]; + } else { + strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */ + posIn++; posOut++; + } +#endif +#endif + while (posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { + size_t posWrite = posOut, endWrite = posOut + len[code]; + unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite; + if ((posOut = endWrite) > size) endWrite = size; + for(; posWrite < endWrite; posWrite++) /* only write if there is room */ + strOut[posWrite] = symbolPointer[posWrite]; + } else { + if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */ + posIn++; posOut++; + } + if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0; + return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */ +} + +#ifdef __cplusplus +} +#endif +#endif /* FSST_INCLUDED_H */ diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp new file mode 100644 index 00000000000..e43d3e03652 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp @@ -0,0 +1,149 @@ +// this software is distributed under the MIT License 
(http://www.opensource.org/licenses/MIT):
+//
+// Copyright 2018-2020, CWI, TU Munich, FSU Jena
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
+// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
+// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
+#include "libfsst.hpp"
+
+#if defined(__x86_64__) || defined(_M_X64)
+#include <immintrin.h>
+
+#ifdef _WIN32
+namespace libfsst {
+bool fsst_hasAVX512() {
+ int info[4];
+ __cpuidex(info, 0x00000007, 0);
+ return (info[1]>>16)&1;
+}
+} // namespace libfsst
+#else
+#include <cpuid.h>
+namespace libfsst {
+bool fsst_hasAVX512() {
+ int info[4];
+ __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]);
+ return (info[1]>>16)&1;
+}
+} // namespace libfsst
+#endif
+#else
+namespace libfsst {
+bool fsst_hasAVX512() { return false; }
+} // namespace libfsst
+#endif
+
+namespace libfsst {
+
+// BULK COMPRESSION OF STRINGS
+//
+// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. 
+// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up. +// +// In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4 +// unroll3 performs best on my hardware +// +// In the worst case, each final encoded string occupies 512KB bytes (512*1024; with 1024=512xexception, exception = 2 bytes). +// - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs). +// +// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits) +// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order. +// +// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings. +// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process. +// - so 'processed' is the amount of strings we started processing and it is between [480,512]. +// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished. +// - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method. +// +// Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings. +// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings. +// +// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to. +// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch). 
+// +// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS +// in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_* expressions however, +// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them. +// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. +// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. + +size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { + size_t processed = 0; + // define some constants (all_x means that all 8 lanes contain 64-bits value X) +#ifdef __AVX512F__ + //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c + __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); + __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); + __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); +#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) +#define all_ONE _mm512_srli_epi64(all_MASK, 63) +#define all_M19 _mm512_srli_epi64(all_MASK, 45) +#define all_M18 _mm512_srli_epi64(all_MASK, 46) +#define all_M28 _mm512_srli_epi64(all_MASK, 36) +#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) +#define all_FFFF _mm512_srli_epi64(all_MASK, 48) +#define all_FF _mm512_srli_epi64(all_MASK, 56) + + SIMDjob *inputEnd = input+n; + assert(n >= unroll*8 && n <= 512); // should be close to 512 + __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 + __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 
2b loaded new strings bitmask per unroll + u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll + + if (unroll >= 4) { + while (input+delta1+delta2+delta3+delta4 < inputEnd) { + #include "fsst_avx512_unroll4.inc" + } + } else if (unroll == 3) { + while (input+delta1+delta2+delta3 < inputEnd) { + #include "fsst_avx512_unroll3.inc" + } + } else if (unroll == 2) { + while (input+delta1+delta2 < inputEnd) { + #include "fsst_avx512_unroll2.inc" + } + } else { + while (input+delta1 < inputEnd) { + #include "fsst_avx512_unroll1.inc" + } + } + + // flush the job states of the unfinished strings at the end of output[] + processed = n - (inputEnd - input); + u32 unfinished = 0; + if (unroll > 1) { + if (unroll > 2) { + if (unroll > 3) { + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); + unfinished += _mm_popcnt_u32((int) loadmask4); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); + unfinished += _mm_popcnt_u32((int) loadmask3); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); + unfinished += _mm_popcnt_u32((int) loadmask2); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); +#else + (void) symbolTable; + (void) codeBase; + (void) symbolBase; + (void) input; + (void) output; + (void) n; + (void) unroll; +#endif + return processed; +} +} // namespace libfsst diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc new file mode 100644 index 00000000000..0a74541dd88 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmaskX=11111111, deltaX=8). + jobX = _mm512_mask_expandloadu_epi64(jobX, loadmaskX, input); input += deltaX; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i wordX = _mm512_i64gather_epi64(_mm512_srli_epi64(jobX, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // codeX: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i codeX = _mm512_i64gather_epi64(_mm512_and_epi64(wordX, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. 
+ __m512i posX = _mm512_mullo_epi64(_mm512_and_epi64(wordX, all_FFFFFF), all_PRIME); + // hash them into a random number: posX = posX*PRIME; posX ^= posX>>SHIFT + posX = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(posX,_mm512_srli_epi64(posX,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i iclX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the writeX register (in case it turns out to be an escaped byte). + __m512i writeX = _mm512_slli_epi64(_mm512_and_epi64(wordX, all_FF), 8); + // lookup just like the iclX above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symbX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + posX = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(iclX, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 matchX = _mm512_cmpeq_epi64_mask(symbX, _mm512_and_epi64(wordX, posX)) & _mm512_cmplt_epi64_mask(iclX, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + codeX = _mm512_mask_mov_epi64(codeX, matchX, _mm512_srli_epi64(iclX, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + writeX = _mm512_or_epi64(writeX, _mm512_and_epi64(codeX, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + codeX = _mm512_and_epi64(codeX, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(jobX, all_M19), writeX, 1); + // increase the jobX.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + jobX = _mm512_add_epi64(jobX, _mm512_slli_epi64(_mm512_srli_epi64(codeX, FSST_LEN_BITS), 46)); + // increase the jobX.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + jobX = _mm512_add_epi64(jobX, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(codeX, 8), all_ONE))); + // test which lanes are done now (jobX.cur==jobX.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the jobX register) + loadmaskX = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(jobX, 46), _mm512_and_epi64(_mm512_srli_epi64(jobX, 28), all_M18)); + // calculate the amount of lanes in jobX that are done + deltaX = _mm_popcnt_u32((int) loadmaskX); + // write out the job state for the lanes that are done (we need the final 'jobX.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmaskX, jobX); output += deltaX; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc new file mode 100644 index 00000000000..f4b81c7970d --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom 
the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc new file mode 100644 index 00000000000..aa33cd7e69c --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc @@ -0,0 +1,114 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of 
this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) 
loadmask2); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc new file mode 100644 index 00000000000..e2057032abd --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc @@ -0,0 +1,171 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, 
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
+ __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). 
+ __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
+ write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, 
i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job 
state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc new file mode 100644 index 00000000000..15cca7c938b --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc @@ -0,0 +1,228 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation 
the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E4PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask4=11111111, delta4=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + job4 = _mm512_mask_expandloadu_epi64(job4, loadmask4, input); input += delta4; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + __m512i word4 = _mm512_i64gather_epi64(_mm512_srli_epi64(job4, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. 
It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code4: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code4 = _mm512_i64gather_epi64(_mm512_and_epi64(word4, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + __m512i pos4 = _mm512_mullo_epi64(_mm512_and_epi64(word4, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + // hash them into a random number: pos4 = pos4*PRIME; pos4 ^= pos4>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + pos4 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos4,_mm512_srli_epi64(pos4,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). 
+ // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write4 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + __m512i write4 = _mm512_slli_epi64(_mm512_and_epi64(word4, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl4 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + pos4 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl4, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + __mmask8 match4 = _mm512_cmpeq_epi64_mask(symb4, _mm512_and_epi64(word4, pos4)) & _mm512_cmplt_epi64_mask(icl4, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. 
+ code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + code4 = _mm512_mask_mov_epi64(code4, match4, _mm512_srli_epi64(icl4, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + write4 = _mm512_or_epi64(write4, _mm512_and_epi64(code4, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + code4 = _mm512_and_epi64(code4, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job4, all_M19), write4, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job4.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + job4 = _mm512_add_epi64(job4, _mm512_slli_epi64(_mm512_srli_epi64(code4, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job4.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = 
_mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + job4 = _mm512_add_epi64(job4, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code4, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + // test which lanes are done now (job4.cur==job4.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job4 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + loadmask4 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job4, 46), _mm512_and_epi64(_mm512_srli_epi64(job4, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + // calculate the amount of lanes in job4 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + delta4 = _mm_popcnt_u32((int) loadmask4); + // write out the job state for the lanes that are done (we 
need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job4.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; + _mm512_mask_compressstoreu_epi64(output, loadmask4, job4); output += delta4; diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.cpp b/cpp/src/parquet/thirdparty/fsst/libfsst.cpp new file mode 100644 index 00000000000..e3ba787b959 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/libfsst.cpp @@ -0,0 +1,651 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include "libfsst.hpp" + +namespace libfsst { +Symbol concat(Symbol a, Symbol b) { + Symbol s; + u32 length = a.length()+b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.store_num((b.load_num() << (8*a.length())) | a.load_num()); + return s; +} +} // namespace libfsst + +namespace std { +template <> +class hash { + public: + size_t operator()(const libfsst::QSymbol& q) const { + uint64_t k = q.symbol.load_num(); + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + uint64_t h = 0x8445d61a4e774912 ^ (8*m); + k *= m; + k ^= k >> r; + k *= m; + h ^= k; + h *= m; + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; + } +}; +} + +namespace libfsst { +bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } + +std::ostream& operator<<(std::ostream& out, const Symbol& s) { + for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { + SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); + int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) + size_t sampleFrac = 128; + + // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator + st->zeroTerminated = zeroTerminated; + if (zeroTerminated) { + st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency + } else { + u16 byteHisto[256]; + memset(byteHisto, 0, sizeof(byteHisto)); + for(size_t i=0; iterminator = 256; + while(i-- > 0) { + if (byteHisto[i] > minSize) continue; + st->terminator = i; + minSize = byteHisto[i]; + } + } + assert(st->terminator != 256); + + // a random number between 0 and 128 + auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; + + // compress sample, and compute (pair-)frequencies + auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain + int gain = 0; + + for(size_t i=0; i sampleFrac) continue; + } + if (cur < end) { + u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); + cur += st->symbols[code1].length(); + gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); + while (true) { + // count single symbol (i.e. an option is not extending it) + counters.count1Inc(code1); + + // as an alternative, consider just using the next byte.. + if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly + counters.count1Inc(*start); + + if (cur==end) { + break; + } + + // now match a new symbol + start = cur; + if (curhashTabSize-1); + Symbol s = st->hashTab[idx]; + code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) { + code2 = s.code(); + cur += s.length(); + } else if (code2 >= FSST_CODE_BASE) { + cur += 2; + } else { + code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; + cur += 1; + } + } else { + code2 = st->findLongestSymbol(cur, end); + cur += st->symbols[code2].length(); + } + + // compute compressed output size + gain += ((int) (cur-start))-(1+isEscapeCode(code2)); + + if (sampleFrac < 128) { // no need to count pairs in final round + // consider the symbol that is the concatenation of the two last symbols + counters.count2Inc(code1, code2); + + // as an alternative, consider just extending with the next byte.. + if ((cur-start) > 1) // ..but do not count single byte extensions doubly + counters.count2Inc(code1, *start); + } + code1 = code2; + } + } + } + return gain; + }; + + auto makeTable = [&](SymbolTable *st, Counters &counters) { + // hashmap of c (needed because we can generate duplicate candidates) + unordered_set cands; + + // artificially make terminater the most frequent symbol so it gets included + u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; + counters.count1Set(terminator,65535); + + auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { + if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! 
+ QSymbol q; + q.symbol = s; + q.gain = count * s.length(); + auto it = cands.find(q); + if (it != cands.end()) { + q.gain += (*it).gain; + cands.erase(*it); + } + cands.insert(q); + }; + + // add candidate symbols based on counted frequency + for (u32 pos1=0; pos1nSymbols; pos1++) { + u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! + if (!cnt1) continue; + + // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed + Symbol s1 = st->symbols[pos1]; + addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); + + if (sampleFrac >= 128 || // last round we do not create new (combined) symbols + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte + continue; + } + for (u32 pos2=0; pos2nSymbols; pos2++) { + u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! + if (!cnt2) continue; + + // create a new symbol + Symbol s2 = st->symbols[pos2]; + Symbol s3 = concat(s1, s2); + if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + addOrInc(cands, s3, cnt2); + } + } + + // insert candidates into priority queue (by gain) + auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); }; + priority_queue,decltype(cmpGn)> pq(cmpGn); + for (auto& q : cands) + pq.push(q); + + // Create new symbol map using best candidates + st->clear(); + while (st->nSymbols < 255 && !pq.empty()) { + QSymbol q = pq.top(); + pq.pop(); + st->add(q.symbol); + } + }; + + u8 bestCounters[512*sizeof(u16)]; +#ifdef NONOPT_FSST + for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { + sampleFrac = frac; +#else + for(sampleFrac=8; true; sampleFrac += 30) { +#endif + memset(&counters, 0, sizeof(Counters)); + long gain = compressCount(st, counters); + if (gain >= bestGain) 
{ // a new best solution! + counters.backup1(bestCounters); + *bestTable = *st; bestGain = gain; + } + if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) + makeTable(st, counters); + } + delete st; + counters.restore1(bestCounters); + makeTable(bestTable, counters); + bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression + return bestTable; +} + +#ifndef NONOPT_FSST +static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { + size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; + u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings + SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer + SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) + size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) + + while (curLine < nlines && outOff <= (1<<19)) { + size_t prevLine = curLine, chunk, curOff = 0; + + // bail out if the output buffer cannot hold the compressed next string fully + if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 + else budget -= (len[curLine]-curOff)*2; + + strOut[curLine] = (u8*) 0; + lenOut[curLine] = 0; + + do { + do { + chunk = len[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // large strings need to be chopped up into segments of 511 bytes + } + // create a job in this batch + SIMDjob job; + job.cur = inOff; + job.end = job.cur + chunk; + job.pos = batchPos; + job.out = outOff; + + // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) + outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) 
bytes. + if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while(curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines) || (((len[curLine])*2 + 7) > budget)) { // cannot accumulate more? + if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + 
*out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; cur += 2; + } else { + // 1 byte code or miss. + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse + cur++; + } + } + } + }; + + for(curLine=0; curLine 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + } + if ((2*chunk+7) > (size_t) (lim-out)) { + return curLine; // out of memory + } + // copy the string to the 511-byte buffer + memcpy(buf, cur, chunk); + buf[chunk] = (u8) symbolTable.terminator; + cur = buf; + end = cur + chunk; + + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true,false); + } else if (avoidBranch) { + compressVariant(false,true); + } else { + compressVariant(false, false); + } + } while((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; +} + +#define FSST_SAMPLELINE ((size_t) 512) + +// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes +vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { + size_t totSize = 0; + const size_t *lenIn = *lenRef; + vector sample; + + for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t*) encoder; +} + +/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol 
table */ +extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t*) e; +} + +// export a symbol table in compact format. +extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder*) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. + // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. 
+ // + // The version field is now there just for future-proofness, but not used yet + + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction + u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 + (((u64) e->symbolTable->suffixLim) << 24) | + (((u64) e->symbolTable->terminator) << 16) | + (((u64) e->symbolTable->nSymbols) << 8) | + FSST_ENDIAN_MARKER; // least significant byte is nonzero + + version = swap64_if_be(version); // ensure version is little-endian encoded + + /* do not assume unaligned reads here */ + memcpy(buf, &version, 8); + buf[8] = e->symbolTable->zeroTerminated; + for(u32 i=0; i<8; i++) + buf[9+i] = (u8) e->symbolTable->lenHisto[i]; + u32 pos = 17; + + // emit only the used bytes of the symbols + for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) + for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) + buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes + + return pos; // length of what was serialized +} + +#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ + +extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 const *buf) { + u64 version = 0; + u32 code, pos = 17; + u8 lenHisto[8]; + + // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) + memcpy(&version, buf, 8); + version = swap64_if_be(version); // version is always little-endian encoded + + if ((version>>32) != FSST_VERSION) return 0; + decoder->zeroTerminated = buf[8]&1; + memcpy(lenHisto, buf+9, 8); + + // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) + decoder->len[0] = 1; + decoder->symbol[0] = 0; + + // we use lenHisto[0] as 1-byte symbol run length (at the end) + code = decoder->zeroTerminated; + if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end 
+ + // now get all symbols from the buffer + for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ + for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { + decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ + decoder->symbol[code] = 0; + for(u32 j=0; jlen[code]; j++) + ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols + } + } + if (decoder->zeroTerminated) lenHisto[0]++; + + // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). + while(code<255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; +} + +// runtime check for simd +inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { +#ifndef NONOPT_FSST + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +#endif + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); +} +size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} + +// adaptive choosing of scalar compression method based on symbol length histogram +inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if 
((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} +size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +} +} // namespace libfsst + +using namespace libfsst; +// the main compression function (everything automatic) +extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { + // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) + size_t totLen = accumulate(lenIn, lenIn+nlines, 0); + int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); + return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); +} + +/* deallocate encoder */ +extern "C" void fsst_destroy(fsst_encoder_t* encoder) { + Encoder *e = (Encoder*) encoder; + delete e; +} + +/* very lazy implementation relying on export and import */ +extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { + u8 buf[sizeof(fsst_decoder_t)]; + u32 cnt1 = fsst_export(encoder, buf); + fsst_decoder_t decoder; + u32 cnt2 = fsst_import(&decoder, buf); + assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; + return decoder; +} diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp new file mode 100644 index 00000000000..f61bc0175b6 --- /dev/null +++ b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp @@ -0,0 +1,471 @@ +// 
this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "fsst.h" // the official FSST API -- also usable by C mortals + +/* unsigned integers */ +namespace libfsst { +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +} // namespace libfsst + +#if UINTPTR_MAX == 0xffffffffU +// We're on a 32-bit platform +#define NONOPT_FSST +#endif + +#define FSST_ENDIAN_MARKER ((u64) 1) +#define FSST_VERSION_20190218 20190218 +#define FSST_VERSION ((u64) FSST_VERSION_20190218) + +// "symbols" are character sequences (up to 8 bytes) +// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: +// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. + +// we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length +#define FSST_LEN_BITS 12 +#define FSST_CODE_BITS 9 +#define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ +#define FSST_CODE_MAX (1UL<=8) { + len = 8; + memcpy(val.str, input, 8); + } else { + memcpy(val.str, input, len); + } + set_code_len(FSST_CODE_MAX, len); + } + void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + + u64 load_num() const { return swap64_if_be(val.num); } + void store_num(u64 v) { val.num = swap64_if_be(v); } + + u32 length() const { return (u32) (icl >> 28); } + u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } + u32 ignoredBits() const { return (u32) icl; } + + u8 first() const { assert( length() >= 1); return 0xFF & load_num(); } + u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } + +#define FSST_HASH_LOG2SIZE 10 +#define FSST_HASH_PRIME 2971215073LL +#define FSST_SHIFT 15 +#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) + size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes +}; + +// Symbol that can be put in a queue, ordered on gain +struct QSymbol{ + Symbol symbol; + mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols + bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } +}; + +// we construct FSST symbol tables using a random sample of about 16KB (1<<14) +#define FSST_SAMPLETARGET (1<<14) +#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) + +// two phases of compression, before and after optimize(): +// +// (1) to encode values we probe (and maintain) three datastructures: +// - u16 byteCodes[256] array at the position of the next byte (s.length==1) +// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) +// - Symbol 
hashtable[1024] (keyed by the next three bytes, ie for s.length>2), +// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are +// pseudo codes representing a single byte these will become escapes) +// +// (2) when we finished looking for the best symbol table we call optimize() to reshape it: +// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) +// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes +// (allows shortcut during compression) +// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding +// to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one +// byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] +// and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction +// is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was +// hence added to make symbolTable construction faster. 
+// +// this final layout allows for the fastest compression code, only currently present in compressBulk + +// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 +#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket + +// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key +// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster +// +// the gain field is only used in the symbol queue that sorts symbols on gain + +struct SymbolTable { + static const u32 hashTabSize = 1<> (u8) s.icl)); + return true; + } + bool add(Symbol s) { + assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); + u32 len = s.length(); + s.set_code_len(FSST_CODE_BASE + nSymbols, len); + if (len == 1) { + byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + } + if (s.length() >= 2) { + u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; + if (code >= FSST_CODE_BASE) return code; + } + return byteCodes[s.first()] & FSST_CODE_MASK; + } + u16 findLongestSymbol(const u8* cur, const u8* end) const { + return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol + } + + // rationale for finalize: + // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() + // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() + // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) + // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher 
compression perf. + // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). + // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] + // Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte + // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. + // + // In all, we change the layout and coding, as follows.. + // + // before finalize(): + // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 + // - The first 256 codes are pseudo symbols (all escaped bytes) + // + // after finalize(): + // - table layout is symbols[0..nSymbols>, with nSymbols < 256. + // - Real codes are [0,nSymbols>. 8-th bit not set. + // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 + // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) + // the two-byte codes are split in two sections: + // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression + // + // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
+ // + void finalize(u8 zeroTerminated) { + assert(nSymbols <= 255); + u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); + + // compute running sum of code lengths (starting offsets for each length) + rsum[0] = byteLim; // 1-byte codes are highest + rsum[1] = zeroTerminated; + for(u32 i=1; i<7; i++) + rsum[i+1] = rsum[i] + lenHisto[i]; + + // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim) + suffixLim = rsum[1]; + symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) + + for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s + opt = 0; + } + newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim + } else + newCode[i] = rsum[len-1]++; + s1.set_code_len(newCode[i],len); + symbols[newCode[i]] = s1; + } + // renumber the codes in byteCodes[] + for(u32 i=0; i<256; i++) + if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); + else + byteCodes[i] = 511 + (1 << FSST_LEN_BITS); + + // renumber the codes in shortCodes[] + for(u32 i=0; i<65536; i++) + if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); + else + shortCodes[i] = byteCodes[i&0xFF]; + + // replace the symbols in the hash table + for(u32 i=0; i>8; + } + void count1Inc(u32 pos1) { + if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) + count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... + } + void count2Inc(u32 pos1, u32 pos2) { + if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). 
This means (high > 0) <=> (cnt > 0) + // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively + count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample) + } + u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range + // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros + u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7] + + u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes + high = (high >> (zero << 3)) & 255; // advance to nonzero counter + if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 + return 0; // all zero + + u32 low = count1Low[pos1]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range + // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros + u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15] + high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters + + u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters + high = (high >> (zero << 2)) & 15; // advance to nonzero counter + if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! 
advance pos2 + return 0UL; // all zero + + u32 low = count2Low[pos1][pos2]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + void backup1(u8 *buf) { + memcpy(buf, count1High, FSST_CODE_MAX); + memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX); + } + void restore1(u8 *buf) { + memcpy(count1High, buf, FSST_CODE_MAX); + memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); + } +}; +#endif + + +#define FSST_BUFSZ (3<<19) // 768KB + +// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression +struct Encoder { + shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) + union { + Counters counters; // for counting symbol occurences during map construction + u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) + }; +}; + +// job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) +struct SIMDjob { + u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) +}; + +extern bool +fsst_hasAVX512(); // runtime check for avx512 capability + +extern size_t +fsst_compressAVX512( + SymbolTable &symbolTable, + u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) + u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) + SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. + SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer. 
+ size_t n, // IN: size of arrays input and output (should be max 512) + size_t unroll); // IN: degree of SIMD unrolling + +// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) +size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); +size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); +} // namespace libfsst diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 7109c3d1c83..7e5324814b0 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -259,6 +259,8 @@ std::string EncodingToString(Encoding::type t) { return "RLE_DICTIONARY"; case Encoding::BYTE_STREAM_SPLIT: return "BYTE_STREAM_SPLIT"; + case Encoding::FSST: + return "FSST"; default: return "UNKNOWN"; } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index c2040e555fd..0113ec86aa1 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -531,8 +531,9 @@ struct Encoding { DELTA_BYTE_ARRAY = 7, RLE_DICTIONARY = 8, BYTE_STREAM_SPLIT = 9, + FSST = 10, // Should always be last element (except UNKNOWN) - UNDEFINED = 10, + UNDEFINED = 11, UNKNOWN = 999 }; }; diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 14cd3e363a4..86f9f27305a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1487,6 +1487,7 @@ cdef encoding_name_from_enum(ParquetEncoding encoding_): ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY', ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY', ParquetEncoding_BYTE_STREAM_SPLIT: 'BYTE_STREAM_SPLIT', + ParquetEncoding_FSST: 'FSST', }.get(encoding_, 'UNKNOWN') @@ -1499,6 +1500,7 @@ cdef encoding_enum_from_name(str encoding_name): 'DELTA_BINARY_PACKED': ParquetEncoding_DELTA_BINARY_PACKED, 
'DELTA_LENGTH_BYTE_ARRAY': ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY, 'DELTA_BYTE_ARRAY': ParquetEncoding_DELTA_BYTE_ARRAY, + 'FSST': ParquetEncoding_FSST, 'RLE_DICTIONARY': 'dict', 'PLAIN_DICTIONARY': 'dict', }.get(encoding_name, None) diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd index 42d48ba050f..3663b8bcf62 100644 --- a/python/pyarrow/includes/libparquet.pxd +++ b/python/pyarrow/includes/libparquet.pxd @@ -130,6 +130,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY" ParquetEncoding_BYTE_STREAM_SPLIT \ " parquet::Encoding::BYTE_STREAM_SPLIT" + ParquetEncoding_FSST" parquet::Encoding::FSST" enum ParquetCompression" parquet::Compression::type": ParquetCompression_UNCOMPRESSED" parquet::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 24cb586c82b..5fb3cc5b699 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -821,7 +821,7 @@ def _sanitize_table(table, new_schema, flavor): Can only be used when ``use_dictionary`` is set to False, and cannot be used in combination with ``use_byte_stream_split``. Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY', 'FSST'}. Certain encodings are only compatible with certain data types. Please refer to the encodings section of `Reading and writing Parquet files `_. 
From 9ba9672710837cee922a2c5aa7dea1b78230b9f8 Mon Sep 17 00:00:00 2001 From: arnavb Date: Sun, 23 Nov 2025 14:20:27 +0000 Subject: [PATCH 02/16] update --- cpp/src/parquet/thirdparty/fsst/libfsst.hpp | 127 ++++++++++---------- 1 file changed, 65 insertions(+), 62 deletions(-) diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp index f61bc0175b6..e4d7c0aa9b9 100644 --- a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp +++ b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp @@ -1,20 +1,20 @@ // this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// +// // Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: -// +// // - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst #include #include #include @@ -59,7 +59,7 @@ typedef uint64_t u64; // we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length #define FSST_LEN_BITS 12 -#define FSST_CODE_BITS 9 +#define FSST_CODE_BITS 9 #define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ #define FSST_CODE_MAX (1UL<= 1); return 0xFF & load_num(); } u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } -#define FSST_HASH_LOG2SIZE 10 +#define FSST_HASH_LOG2SIZE 10 #define FSST_HASH_PRIME 2971215073LL #define FSST_SHIFT 15 #define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) @@ -129,7 +129,7 @@ struct QSymbol{ bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } }; -// we construct FSST symbol tables using a random sample of about 16KB (1<<14) +// we construct FSST symbol tables using a random sample of about 16KB (1<<14) #define FSST_SAMPLETARGET (1<<14) #define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) @@ -138,15 +138,15 @@ struct QSymbol{ // (1) to encode values we probe (and maintain) three datastructures: // - u16 byteCodes[256] array at the position of the next byte (s.length==1) // - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) -// - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2), -// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are +// - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2), +// this search will yield a u16 code, it points into Symbol symbols[]. 
You always find a hit, because the first 256 codes are // pseudo codes representing a single byte these will become escapes) // // (2) when we finished looking for the best symbol table we call optimize() to reshape it: // - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) -// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes +// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes // (allows shortcut during compression) -// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding +// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding // to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one // byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] // and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction @@ -173,9 +173,9 @@ struct SymbolTable { u16 byteCodes[256]; // contains code for every 1-byte symbol, otherwise code for pseudo byte (escaped byte) // 'symbols' is the current symbol table symbol[code].symbol is the max 8-byte 'symbol' for single-byte 'code' - Symbol symbols[FSST_CODE_MAX]; // x in [0,255]: pseudo symbols representing escaped byte x; x in [FSST_CODE_BASE=256,256+nSymbols]: real symbols + Symbol symbols[FSST_CODE_MAX]; // x in [0,255]: pseudo symbols representing escaped byte x; x in [FSST_CODE_BASE=256,256+nSymbols]: real symbols - // replicate long symbols in hashTab (avoid indirection). + // replicate long symbols in hashTab (avoid indirection). 
Symbol hashTab[hashTabSize]; // used for all symbols of 3 and more bytes u16 nSymbols; // amount of symbols in the map (max 255) @@ -225,8 +225,8 @@ struct SymbolTable { u32 idx = symbols[i].hash() & (hashTabSize-1); hashTab[idx].val.num = 0; hashTab[idx].icl = FSST_ICL_FREE; //marks empty in hashtab - } - } + } + } nSymbols = 0; // no need to clean symbols[] as no symbols are used } bool hashInsert(Symbol s) { @@ -256,11 +256,11 @@ struct SymbolTable { u16 findLongestSymbol(Symbol s) const { size_t idx = s.hash() & (hashTabSize-1); if (hashTab[idx].icl <= s.icl && hashTab[idx].load_num() == (s.load_num() & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) { - return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol } if (s.length() >= 2) { u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; - if (code >= FSST_CODE_BASE) return code; + if (code >= FSST_CODE_BASE) return code; } return byteCodes[s.first()] & FSST_CODE_MASK; } @@ -273,23 +273,23 @@ struct SymbolTable { // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher compression perf. - // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). + // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] // Using shortCodes[] only makes compression faster. 
When creating the symbolTable, however, using shortCodes[] for the single-byte // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. // // In all, we change the layout and coding, as follows.. // - // before finalize(): + // before finalize(): // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 // - The first 256 codes are pseudo symbols (all escaped bytes) // - // after finalize(): - // - table layout is symbols[0..nSymbols>, with nSymbols < 256. - // - Real codes are [0,nSymbols>. 8-th bit not set. + // after finalize(): + // - table layout is symbols[0..nSymbols>, with nSymbols < 256. + // - Real codes are [0,nSymbols>. 8-th bit not set. // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) - // the two-byte codes are split in two sections: + // the two-byte codes are split in two sections: // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression // // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
@@ -298,7 +298,7 @@ struct SymbolTable { assert(nSymbols <= 255); u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); - // compute running sum of code lengths (starting offsets for each length) + // compute running sum of code lengths (starting offsets for each length) rsum[0] = byteLim; // 1-byte codes are highest rsum[1] = zeroTerminated; for(u32 i=1; i<7; i++) @@ -308,34 +308,34 @@ struct SymbolTable { suffixLim = rsum[1]; symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) - for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s opt = 0; } - newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim - } else + newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim + } else newCode[i] = rsum[len-1]++; s1.set_code_len(newCode[i],len); - symbols[newCode[i]] = s1; + symbols[newCode[i]] = s1; } - // renumber the codes in byteCodes[] - for(u32 i=0; i<256; i++) + // renumber the codes in byteCodes[] + for(u32 i=0; i<256; i++) if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); - else + else byteCodes[i] = 511 + (1 << FSST_LEN_BITS); - - // renumber the codes in shortCodes[] + + // renumber the codes in shortCodes[] for(u32 i=0; i<65536; i++) if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); - else + else shortCodes[i] = byteCodes[i&0xFF]; // replace the symbols in the hash table @@ -347,22 +347,22 @@ struct SymbolTable { #ifdef NONOPT_FSST struct Counters { - u16 count1[FSST_CODE_MAX]; // array to count frequency of symbols as they occur in the sample - u16 count2[FSST_CODE_MAX][FSST_CODE_MAX]; // array to count subsequent combinations of two symbols in the sample + u16 count1[FSST_CODE_MAX]; // array to count 
frequency of symbols as they occur in the sample + u16 count2[FSST_CODE_MAX][FSST_CODE_MAX]; // array to count subsequent combinations of two symbols in the sample - void count1Set(u32 pos1, u16 val) { + void count1Set(u32 pos1, u16 val) { count1[pos1] = val; } - void count1Inc(u32 pos1) { + void count1Inc(u32 pos1) { count1[pos1]++; } - void count2Inc(u32 pos1, u32 pos2) { + void count2Inc(u32 pos1, u32 pos2) { count2[pos1][pos2]++; } - u32 count1GetNext(u32 &pos1) { + u32 count1GetNext(u32 &pos1) { return count1[pos1]; } - u32 count2GetNext(u32 pos1, u32 &pos2) { + u32 count2GetNext(u32 pos1, u32 &pos2) { return count2[pos1][pos2]; } void backup1(u8 *buf) { @@ -383,16 +383,16 @@ struct Counters { u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high) u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX]; // its value is (count2High*256+count2Low) -- but high is 4-bits (we put two numbers in one, hence /2) // 385KB -- but hot area likely just 10 + 30*4 = 130 cache lines (=8KB) - - void count1Set(u32 pos1, u16 val) { + + void count1Set(u32 pos1, u16 val) { count1Low[pos1] = val&255; count1High[pos1] = val>>8; } - void count1Inc(u32 pos1) { + void count1Inc(u32 pos1) { if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... } - void count2Inc(u32 pos1, u32 pos2) { + void count2Inc(u32 pos1, u32 pos2) { if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. 
(4K maxval, on a 8K sample) @@ -432,32 +432,32 @@ struct Counters { memcpy(count1High, buf, FSST_CODE_MAX); memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); } -}; +}; #endif #define FSST_BUFSZ (3<<19) // 768KB -// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression +// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression struct Encoder { shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) union { Counters counters; // for counting symbol occurences during map construction - u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) + u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) }; }; // job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) struct SIMDjob { - u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) + u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) }; -extern bool +extern bool fsst_hasAVX512(); // runtime check for avx512 capability -extern size_t +extern size_t fsst_compressAVX512( - SymbolTable &symbolTable, + SymbolTable &symbolTable, u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. 
@@ -465,6 +465,9 @@ fsst_compressAVX512( size_t n, // IN: size of arrays input and output (should be max 512) size_t unroll); // IN: degree of SIMD unrolling +// Symbol manipulation +Symbol concat(Symbol a, Symbol b); + // C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); From 12eef2e8d9884ed0c4efff97c91c2b604494a360 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 08:10:50 +0000 Subject: [PATCH 03/16] cmake build --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 35 + cpp/src/parquet/CMakeLists.txt | 29 +- cpp/src/parquet/decoder.cc | 2 +- cpp/src/parquet/encoder.cc | 2 +- cpp/src/parquet/encoding_test.cc | 2 +- cpp/src/parquet/thirdparty/fsst/fsst.cpp | 200 ------ cpp/src/parquet/thirdparty/fsst/fsst.h | 227 ------ .../parquet/thirdparty/fsst/fsst_avx512.cpp | 149 ---- .../parquet/thirdparty/fsst/fsst_avx512.inc | 57 -- .../thirdparty/fsst/fsst_avx512_unroll1.inc | 57 -- .../thirdparty/fsst/fsst_avx512_unroll2.inc | 114 --- .../thirdparty/fsst/fsst_avx512_unroll3.inc | 171 ----- .../thirdparty/fsst/fsst_avx512_unroll4.inc | 228 ------ cpp/src/parquet/thirdparty/fsst/libfsst.cpp | 651 ------------------ cpp/src/parquet/thirdparty/fsst/libfsst.hpp | 474 ------------- cpp/thirdparty/versions.txt | 3 + 16 files changed, 66 insertions(+), 2335 deletions(-) delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst.cpp delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst.h delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc delete mode 100644 
cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc delete mode 100644 cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc delete mode 100644 cpp/src/parquet/thirdparty/fsst/libfsst.cpp delete mode 100644 cpp/src/parquet/thirdparty/fsst/libfsst.hpp diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4ced2a66bf5..8d67849ea47 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -49,6 +49,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES Boost Brotli BZip2 + fsst c-ares gflags glog @@ -183,6 +184,8 @@ macro(build_dependency DEPENDENCY_NAME) build_brotli() elseif("${DEPENDENCY_NAME}" STREQUAL "BZip2") build_bzip2() + elseif("${DEPENDENCY_NAME}" STREQUAL "fsst") + build_fsst() elseif("${DEPENDENCY_NAME}" STREQUAL "c-ares") build_cares() elseif("${DEPENDENCY_NAME}" STREQUAL "gflags") @@ -382,6 +385,7 @@ endif() if(ARROW_PARQUET) set(ARROW_WITH_RAPIDJSON ON) set(ARROW_WITH_THRIFT ON) + set(ARROW_WITH_FSST ON) endif() if(ARROW_WITH_THRIFT) @@ -637,6 +641,14 @@ else() ) endif() +if(DEFINED ENV{ARROW_FSST_URL}) + set(FSST_SOURCE_URL "$ENV{ARROW_FSST_URL}") +else() + set_urls(FSST_SOURCE_URL + "https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz" + "${THIRDPARTY_MIRROR_URL}/fsst-${ARROW_FSST_BUILD_VERSION}.tar.gz") +endif() + if(DEFINED ENV{ARROW_GBENCHMARK_URL}) set(GBENCHMARK_SOURCE_URL "$ENV{ARROW_GBENCHMARK_URL}") else() @@ -2604,6 +2616,29 @@ if(ARROW_USE_XSIMD) endif() endif() +function(build_fsst) + message(STATUS "Building FSST from source using FetchContent") + + fetchcontent_declare(fsst + URL ${FSST_SOURCE_URL} + URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") + + prepare_fetchcontent() + fetchcontent_makeavailable(fsst) + + set(ARROW_FSST_INCLUDE_DIR + "${fsst_SOURCE_DIR}" + CACHE 
INTERNAL "FSST include directory") + set(ARROW_FSST_SOURCES + "${fsst_SOURCE_DIR}/libfsst.cpp;${fsst_SOURCE_DIR}/fsst_avx512.cpp" + CACHE INTERNAL "FSST source files") + set(FSST_VENDORED TRUE CACHE INTERNAL "Whether FSST is built from source") +endfunction() + +if(ARROW_WITH_FSST) + resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) +endif() + macro(build_zlib) message(STATUS "Building ZLIB from source") diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e06d13d1884..a2d300d684d 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -42,6 +42,10 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) ${ARGN}) set(TEST_ARGUMENTS PREFIX "parquet" LABELS "parquet-tests") + set(_PARQUET_TEST_EXTRA_ARGS ${ARG_UNPARSED_ARGUMENTS}) + if(PARQUET_TEST_EXTRA_INCLUDES) + list(APPEND _PARQUET_TEST_EXTRA_ARGS EXTRA_INCLUDES ${PARQUET_TEST_EXTRA_INCLUDES}) + endif() if(ARROW_TEST_LINKAGE STREQUAL "static") add_test_case(${REL_TEST_NAME} @@ -49,14 +53,14 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) parquet_static ${PARQUET_TEST_LINK_LIBS} ${TEST_ARGUMENTS} - ${ARG_UNPARSED_ARGUMENTS}) + ${_PARQUET_TEST_EXTRA_ARGS}) else() add_test_case(${REL_TEST_NAME} STATIC_LINK_LIBS parquet_shared ${PARQUET_TEST_LINK_LIBS} ${TEST_ARGUMENTS} - ${ARG_UNPARSED_ARGUMENTS}) + ${_PARQUET_TEST_EXTRA_ARGS}) endif() endfunction() @@ -134,6 +138,9 @@ elseif(NOT MSVC) list(APPEND PARQUET_TEST_LINK_LIBS ${CMAKE_DL_LIBS}) endif() +set(PARQUET_TEST_EXTRA_INCLUDES) +set(PARQUET_PRIVATE_INCLUDE_DIRS) + # # Generated Thrift sources set(PARQUET_THRIFT_SOURCE_DIR "${ARROW_SOURCE_DIR}/src/generated/") @@ -171,8 +178,6 @@ set(PARQUET_SRCS encryption/internal_file_encryptor.cc exception.cc file_reader.cc - thirdparty/fsst/libfsst.cpp - thirdparty/fsst/fsst_avx512.cpp file_writer.cc geospatial/statistics.cc geospatial/util_internal.cc @@ -193,6 +198,14 @@ set(PARQUET_SRCS stream_writer.cc types.cc) +if(DEFINED ARROW_FSST_SOURCES) + list(APPEND PARQUET_SRCS 
${ARROW_FSST_SOURCES}) +endif() +if(DEFINED ARROW_FSST_INCLUDE_DIR) + list(APPEND PARQUET_PRIVATE_INCLUDE_DIRS ${ARROW_FSST_INCLUDE_DIR}) + list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) +endif() + if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. list(APPEND PARQUET_SRCS level_comparison_avx2.cc level_conversion_bmi2.cc) @@ -308,6 +321,14 @@ add_arrow_lib(parquet STATIC_INSTALL_INTERFACE_LIBS ${PARQUET_STATIC_INSTALL_INTERFACE_LIBS}) +if(PARQUET_PRIVATE_INCLUDE_DIRS) + foreach(_parquet_target parquet_objlib parquet_shared parquet_static) + if(TARGET ${_parquet_target}) + target_include_directories(${_parquet_target} PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) + endif() + endforeach() +endif() + if(WIN32 AND NOT (ARROW_TEST_LINKAGE STREQUAL "static")) add_library(parquet_test_support STATIC "${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp") diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index 77bc3c760ff..42838fd059f 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -53,7 +53,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "parquet/thirdparty/fsst/fsst.h" +#include "fsst.h" #include "parquet/types.h" #ifdef _MSC_VER diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 61b1bb9606c..00daa366f6a 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -50,7 +50,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "parquet/thirdparty/fsst/fsst.h" +#include "fsst.h" #include "parquet/types.h" #ifdef _MSC_VER diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index ca51d8e1b24..a43bafd0960 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -53,7 +53,7 @@ #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/test_util.h" -#include "parquet/thirdparty/fsst/fsst.h" 
+#include "fsst.h" #include "parquet/types.h" using arrow::default_memory_pool; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst.cpp b/cpp/src/parquet/thirdparty/fsst/fsst.cpp deleted file mode 100644 index c1a9cc34980..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2019, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -#ifdef FSST12 -#include "fsst12.h" // the official FSST API -- also usable by C mortals -#else -#include "fsst.h" // the official FSST API -- also usable by C mortals -#endif -#include -#include -#include -#include -#include -#include -using namespace std; - -// Utility to compress and decompress (-d) data with FSST (using stdin and stdout). 
-// -// The utility has a poor-man's async I/O in that it uses double buffering for input and output, -// and two background pthreads for reading and writing. The idea is to make the CPU overlap with I/O. -// -// The data format is quite simple. A FSST compressed file is a sequence of blocks, each with format: -// (1) 3-byte block length field (max blocksize is hence 16MB). This byte-length includes (1), (2) and (3). -// (2) FSST dictionary as produced by fst_export(). -// (3) the FSST compressed data. -// -// The natural strength of FSST is in fact not block-based compression, but rather the compression and -// *individual* decompression of many small strings separately. Think of compressed databases and (column-store) -// data formats. But, this utility is to serve as an apples-to-apples comparison point with utilities like lz4. - -namespace { - -class BinarySemaphore { - private: - mutex m; - condition_variable cv; - bool value; - - public: - explicit BinarySemaphore(bool initialValue = false) : value(initialValue) {} - void wait() { - unique_lock lock(m); - while (!value) cv.wait(lock); - value = false; - } - void post() { - { unique_lock lock(m); value = true; } - cv.notify_one(); - } -}; - -bool stopThreads = false; -BinarySemaphore srcDoneIO[2], dstDoneIO[2], srcDoneCPU[2], dstDoneCPU[2]; -unsigned char *srcBuf[2] = { NULL, NULL }; -unsigned char *dstBuf[2] = { NULL, NULL }; -unsigned char *dstMem[2] = { NULL, NULL }; -size_t srcLen[2] = { 0, 0 }; -size_t dstLen[2] = { 0, 0 }; - -#define FSST_MEMBUF (1ULL<<22) -int decompress = 0; -size_t blksz = FSST_MEMBUF-(1+FSST_MAXHEADER/2); // block size of compression (max compressed size must fit 3 bytes) - -#define DESERIALIZE(p) (((unsigned long long) (p)[0]) << 16) | (((unsigned long long) (p)[1]) << 8) | ((unsigned long long) (p)[2]) -#define SERIALIZE(l,p) { (p)[0] = ((l)>>16)&255; (p)[1] = ((l)>>8)&255; (p)[2] = (l)&255; } - -void reader(ifstream& src) { - for(int swap=0; true; swap = 1-swap) { - 
srcDoneCPU[swap].wait(); - if (stopThreads) break; - src.read((char*) srcBuf[swap], blksz); - srcLen[swap] = (unsigned long) src.gcount(); - if (decompress) { - if (blksz && srcLen[swap] == blksz) { - blksz = DESERIALIZE(srcBuf[swap]+blksz-3); // read size of next block - srcLen[swap] -= 3; // cut off size bytes - } else { - blksz = 0; - } - } - srcDoneIO[swap].post(); - } -} - -void writer(ofstream& dst) { - for(int swap=0; true; swap = 1-swap) { - dstDoneCPU[swap].wait(); - if (!dstLen[swap]) break; - dst.write((char*) dstBuf[swap], dstLen[swap]); - dstDoneIO[swap].post(); - } - for(int swap=0; swap<2; swap++) - dstDoneIO[swap].post(); -} - -} - -#ifdef FSST_STANDALONE -int main(int argc, char* argv[]) { - size_t srcTot = 0, dstTot = 0; - if (argc < 2 || argc > 4 || (argc == 4 && (argv[1][0] != '-' || argv[1][1] != 'd' || argv[1][2]))) { - cerr << "usage: " << argv[0] << " -d infile outfile" << endl; - cerr << " " << argv[0] << " infile outfile" << endl; - cerr << " " << argv[0] << " infile" << endl; - return -1; - } - decompress = (argc == 4); - string srcfile(argv[1+decompress]), dstfile; - if (argc == 2) { - dstfile = srcfile + ".fsst"; - } else { - dstfile = argv[2+decompress]; - } - ifstream src; - ofstream dst; - src.open(srcfile, ios::binary); - dst.open(dstfile, ios::binary); - dst.exceptions(ios_base::failbit); - dst.exceptions(ios_base::badbit); - src.exceptions(ios_base::badbit); - if (decompress) { - unsigned char tmp[3]; - src.read((char*) tmp, 3); - if (src.gcount() != 3) { - cerr << "failed to open input." 
<< endl; - return -1; - } - blksz = DESERIALIZE(tmp); // read first block size - } - vector buffer(FSST_MEMBUF*6); - srcBuf[0] = buffer.data(); - srcBuf[1] = srcBuf[0] + (FSST_MEMBUF*(1ULL+decompress)); - dstMem[0] = srcBuf[1] + (FSST_MEMBUF*(1ULL+decompress)); - dstMem[1] = dstMem[0] + (FSST_MEMBUF*(2ULL-decompress)); - - for(int swap=0; swap<2; swap++) { - srcDoneCPU[swap].post(); // input buffer is not being processed initially - dstDoneIO[swap].post(); // output buffer is not being written initially - } - thread readerThread([&src]{ reader(src); }); - thread writerThread([&dst]{ writer(dst); }); - - for(int swap=0; true; swap = 1-swap) { - srcDoneIO[swap].wait(); // wait until input buffer is available (i.e. done reading) - dstDoneIO[swap].wait(); // wait until output buffer is ready writing hence free for use - if (srcLen[swap] == 0) { - dstLen[swap] = 0; - break; - } - if (decompress) { - fsst_decoder_t decoder; - size_t hdr = fsst_import(&decoder, srcBuf[swap]); - dstLen[swap] = fsst_decompress(&decoder, srcLen[swap] - hdr, srcBuf[swap] + hdr, FSST_MEMBUF, dstBuf[swap] = dstMem[swap]); - } else { - unsigned char tmp[FSST_MAXHEADER]; - fsst_encoder_t* encoder = fsst_create(1, &srcLen[swap], const_cast(&srcBuf[swap]), 0); - size_t hdr = fsst_export(encoder, tmp); - if (fsst_compress(encoder, 1, &srcLen[swap], const_cast(&srcBuf[swap]), - FSST_MEMBUF * 2, dstMem[swap] + FSST_MAXHEADER + 3, - &dstLen[swap], &dstBuf[swap]) < 1) - return -1; - dstLen[swap] += 3 + hdr; - dstBuf[swap] -= 3 + hdr; - SERIALIZE(dstLen[swap],dstBuf[swap]); // block starts with size - copy(tmp, tmp+hdr, dstBuf[swap]+3); // then the header (followed by the compressed bytes which are already there) - fsst_destroy(encoder); - } - srcTot += srcLen[swap]; - dstTot += dstLen[swap]; - srcDoneCPU[swap].post(); // input buffer may be re-used by the reader for the next block - dstDoneCPU[swap].post(); // output buffer is ready for writing out - } - cerr << (decompress?"Dec":"C") << "ompressed " << 
srcTot << " bytes into " << dstTot << " bytes ==> " << (int) ((100*dstTot)/srcTot) << "%" << endl; - - // force wait until all background writes finished - stopThreads = true; - for(int swap=0; swap<2; swap++) { - srcDoneCPU[swap].post(); - dstDoneCPU[swap].post(); - } - dstDoneIO[0].wait(); - dstDoneIO[1].wait(); - readerThread.join(); - writerThread.join(); -} -#endif // FSST_STANDALONE diff --git a/cpp/src/parquet/thirdparty/fsst/fsst.h b/cpp/src/parquet/thirdparty/fsst/fsst.h deleted file mode 100644 index 71085d57201..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 - * - * =================================================================================================================================== - * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): - * - * Copyright 2018-2020, CWI, TU Munich, FSU Jena - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files - * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR - * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst - * =================================================================================================================================== - * - * FSST: Fast Static Symbol Table compression - * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf - * - * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. - * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual - * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is - * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. - * - * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) - * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression - * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could - * be seen as strings again and fit in whatever your program is that manipulates strings. - * - * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. - * - * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of - * unsigned char* pointers to their starts. 
A seperate length array (of unsigned int) denotes how many bytes each string consists of. - * - * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program - * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings). - * - * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are - * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length. - */ -#ifndef FSST_INCLUDED_H -#define FSST_INCLUDED_H - -#ifdef _MSC_VER -#define __restrict__ -#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ -#define __ORDER_LITTLE_ENDIAN__ 2 -#include -static inline int __builtin_ctzl(unsigned long long x) { - unsigned long ret; - _BitScanForward64(&ret, x); - return (int)ret; -} -#endif - -#ifdef __cplusplus -#define FSST_FALLTHROUGH [[fallthrough]] -#include -extern "C" { -#else -#define FSST_FALLTHROUGH -#endif - -#include - -/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */ -#define FSST_ESC 255 - -/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */ -typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */ - -/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */ -typedef struct { - unsigned long long version; /* version id */ - unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */ - unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). 
*/ - unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */ -} fsst_decoder_t; - -/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */ -fsst_encoder_t* -fsst_create( - size_t n, /* IN: number of strings in batch to sample from. */ - const size_t lenIn[], /* IN: byte-lengths of the inputs */ - const unsigned char *strIn[], /* IN: string start pointers. */ - int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */ -); - -/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */ -fsst_encoder_t* -fsst_duplicate( - fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */ -); - -#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */ - -/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */ -unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */ -fsst_export( - fsst_encoder_t *encoder, /* IN: the symbol table to dump. */ - unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */ -); - -/* Deallocate encoder. */ -void -fsst_destroy(fsst_encoder_t*); - -/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */ -unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */ -fsst_import( - fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */ - unsigned char const *buf /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */ -); - -/* Return a decoder structure from an encoder. 
*/ -fsst_decoder_t -fsst_decoder( - fsst_encoder_t *encoder -); - -/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */ -/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */ -size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */ -fsst_compress( - fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */ - size_t nstrings, /* IN: number of strings in batch to compress. */ - const size_t lenIn[], /* IN: byte-lengths of the inputs */ - const unsigned char *strIn[], /* IN: input string start pointers. */ - size_t outsize, /* IN: byte-length of output buffer. */ - unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */ - size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */ - unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */ -); - -/* Decompress a single string, inlined for speed. */ -inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */ -fsst_decompress( - const fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */ - size_t lenIn, /* IN: byte-length of compressed string. */ - const unsigned char *strIn, /* IN: compressed string. */ - size_t size, /* IN: byte-length of output buffer. */ - unsigned char *output /* OUT: memory buffer to put the decompressed string in. 
*/ -) { - unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len; - unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output; - unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol; - size_t code, posOut = 0, posIn = 0; -#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */ -#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long)) -#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - while (posOut+32 <= size && posIn+4 <= lenIn) { - unsigned int nextBlock, escapeMask; - memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int)); - escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); - if (escapeMask == 0) { - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - } else { - unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; - switch(firstEscapePos) { /* Duff's device */ - case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - // fall through - case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - // fall through - case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - // fall through - case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ - } - } - } - if (posOut+32 <= size) { // handle the possibly 3 last bytes without a loop - if (posIn+2 <= lenIn) 
{ - strOut[posOut] = strIn[posIn+1]; - if (strIn[posIn] != FSST_ESC) { - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - if (strIn[posIn] != FSST_ESC) { - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - } else { - posIn += 2; strOut[posOut++] = strIn[posIn-1]; - } - } else { - posIn += 2; posOut++; - } - } - if (posIn < lenIn) { // last code cannot be an escape - code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; - } - } -#else - while (posOut+8 <= size && posIn < lenIn) - if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */ - FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */ - posOut += len[code]; - } else { - strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */ - posIn++; posOut++; - } -#endif -#endif - while (posIn < lenIn) - if ((code = strIn[posIn++]) < FSST_ESC) { - size_t posWrite = posOut, endWrite = posOut + len[code]; - unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite; - if ((posOut = endWrite) > size) endWrite = size; - for(; posWrite < endWrite; posWrite++) /* only write if there is room */ - strOut[posWrite] = symbolPointer[posWrite]; - } else { - if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */ - posIn++; posOut++; - } - if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0; - return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */ -} - -#ifdef __cplusplus -} -#endif -#endif /* FSST_INCLUDED_H */ diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp deleted file mode 100644 index e43d3e03652..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// this software is distributed under the MIT License 
(http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -#include "libfsst.hpp" - -#if defined(__x86_64__) || defined(_M_X64) -#include - -#ifdef _WIN32 -namespace libfsst { -bool fsst_hasAVX512() { - int info[4]; - __cpuidex(info, 0x00000007, 0); - return (info[1]>>16)&1; -} -} // namespace libfsst -#else -#include -namespace libfsst { -bool fsst_hasAVX512() { - int info[4]; - __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]); - return (info[1]>>16)&1; -} -} // namespace libfsst -#endif -#else -namespace libfsst { -bool fsst_hasAVX512() { return false; } -} // namespace libfsst -#endif - -namespace libfsst { - -// BULK COMPRESSION OF STRINGS -// -// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. 
-// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up. -// -// In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4 -// unroll3 performs best on my hardware -// -// In the worst case, each final encoded string occupies 512KB bytes (512*1024; with 1024=512xexception, exception = 2 bytes). -// - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs). -// -// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits) -// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order. -// -// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings. -// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process. -// - so 'processed' is the amount of strings we started processing and it is between [480,512]. -// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished. -// - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method. -// -// Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings. -// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings. -// -// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to. -// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch). 
-// -// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS -// in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_* expressions however, -// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them. -// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. -// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. - -size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { - size_t processed = 0; - // define some constants (all_x means that all 8 lanes contain 64-bits value X) -#ifdef __AVX512F__ - //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c - __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); - __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); - __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); -#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) -#define all_ONE _mm512_srli_epi64(all_MASK, 63) -#define all_M19 _mm512_srli_epi64(all_MASK, 45) -#define all_M18 _mm512_srli_epi64(all_MASK, 46) -#define all_M28 _mm512_srli_epi64(all_MASK, 36) -#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) -#define all_FFFF _mm512_srli_epi64(all_MASK, 48) -#define all_FF _mm512_srli_epi64(all_MASK, 56) - - SIMDjob *inputEnd = input+n; - assert(n >= unroll*8 && n <= 512); // should be close to 512 - __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 - __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 
2b loaded new strings bitmask per unroll - u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll - - if (unroll >= 4) { - while (input+delta1+delta2+delta3+delta4 < inputEnd) { - #include "fsst_avx512_unroll4.inc" - } - } else if (unroll == 3) { - while (input+delta1+delta2+delta3 < inputEnd) { - #include "fsst_avx512_unroll3.inc" - } - } else if (unroll == 2) { - while (input+delta1+delta2 < inputEnd) { - #include "fsst_avx512_unroll2.inc" - } - } else { - while (input+delta1 < inputEnd) { - #include "fsst_avx512_unroll1.inc" - } - } - - // flush the job states of the unfinished strings at the end of output[] - processed = n - (inputEnd - input); - u32 unfinished = 0; - if (unroll > 1) { - if (unroll > 2) { - if (unroll > 3) { - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); - unfinished += _mm_popcnt_u32((int) loadmask4); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); - unfinished += _mm_popcnt_u32((int) loadmask3); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); - unfinished += _mm_popcnt_u32((int) loadmask2); - } - _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); -#else - (void) symbolTable; - (void) codeBase; - (void) symbolBase; - (void) input; - (void) output; - (void) n; - (void) unroll; -#endif - return processed; -} -} // namespace libfsst diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc deleted file mode 100644 index 0a74541dd88..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512.inc +++ /dev/null @@ -1,57 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmaskX=11111111, deltaX=8). - jobX = _mm512_mask_expandloadu_epi64(jobX, loadmaskX, input); input += deltaX; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - __m512i wordX = _mm512_i64gather_epi64(_mm512_srli_epi64(jobX, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // codeX: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i codeX = _mm512_i64gather_epi64(_mm512_and_epi64(wordX, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. 
- __m512i posX = _mm512_mullo_epi64(_mm512_and_epi64(wordX, all_FFFFFF), all_PRIME); - // hash them into a random number: posX = posX*PRIME; posX ^= posX>>SHIFT - posX = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(posX,_mm512_srli_epi64(posX,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - __m512i iclX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the writeX register (in case it turns out to be an escaped byte). - __m512i writeX = _mm512_slli_epi64(_mm512_and_epi64(wordX, all_FF), 8); - // lookup just like the iclX above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symbX = _mm512_i64gather_epi64(posX, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - posX = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(iclX, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - __mmask8 matchX = _mm512_cmpeq_epi64_mask(symbX, _mm512_and_epi64(wordX, posX)) & _mm512_cmplt_epi64_mask(iclX, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - codeX = _mm512_mask_mov_epi64(codeX, matchX, _mm512_srli_epi64(iclX, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - writeX = _mm512_or_epi64(writeX, _mm512_and_epi64(codeX, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - codeX = _mm512_and_epi64(codeX, all_FFFF); - // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(jobX, all_M19), writeX, 1); - // increase the jobX.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - jobX = _mm512_add_epi64(jobX, _mm512_slli_epi64(_mm512_srli_epi64(codeX, FSST_LEN_BITS), 46)); - // increase the jobX.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - jobX = _mm512_add_epi64(jobX, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(codeX, 8), all_ONE))); - // test which lanes are done now (jobX.cur==jobX.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the jobX register) - loadmaskX = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(jobX, 46), _mm512_and_epi64(_mm512_srli_epi64(jobX, 28), all_M18)); - // calculate the amount of lanes in jobX that are done - deltaX = _mm_popcnt_u32((int) loadmaskX); - // write out the job state for the lanes that are done (we need the final 'jobX.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmaskX, jobX); output += deltaX; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc deleted file mode 100644 index f4b81c7970d..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll1.inc +++ /dev/null @@ -1,57 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to 
whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). - job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. 
- __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); - // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT - pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). - __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); - // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - code1 = _mm512_and_epi64(code1, all_FFFF); - // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); - // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); - // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); - // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) - loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); - // calculate the amount of lanes in job1 that are done - delta1 = _mm_popcnt_u32((int) loadmask1); - // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc deleted file mode 100644 index aa33cd7e69c..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll2.inc +++ /dev/null @@ -1,114 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a 
copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-// -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). - job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; - job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); - __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. - // get the first three bytes of the string. 
- __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); - __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); - // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT - // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT - pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); - pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). - __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); - __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); - // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
- pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); - pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); - __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); - code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); - write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - code1 = _mm512_and_epi64(code1, all_FFFF); - code2 = _mm512_and_epi64(code2, all_FFFF); - // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); - // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); - job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); - // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); - job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); - // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) - // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) - loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); - loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); - // calculate the amount of lanes in job1 that are done - // calculate the amount of lanes in job2 that are done - delta1 = _mm_popcnt_u32((int) loadmask1); - delta2 = _mm_popcnt_u32((int) 
loadmask2); - // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; - _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc deleted file mode 100644 index e2057032abd..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll3.inc +++ /dev/null @@ -1,171 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, 
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// -// -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// -// -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). - job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; - job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; - job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
- __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); - __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); - __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. - // get the first three bytes of the string. - // get the first three bytes of the string. 
- __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); - __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); - __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); - // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT - // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT - // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT - pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); - pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); - pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). - __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); - __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); - __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); - // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. - // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); - pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); - pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). 
- __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); - __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); - __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); - code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); - code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
- write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); - write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); - write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - code1 = _mm512_and_epi64(code1, all_FFFF); - code2 = _mm512_and_epi64(code2, all_FFFF); - code3 = _mm512_and_epi64(code3, all_FFFF); - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); - // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); - job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); - job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); - // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, 
i.e the 8th) - // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); - job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); - job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); - // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) - // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) - // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) - loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); - loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); - loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); - // calculate the amount of lanes in job1 that are done - // calculate the amount of lanes in job2 that are done - // calculate the amount of lanes in job3 that are done - delta1 = _mm_popcnt_u32((int) loadmask1); - delta2 = _mm_popcnt_u32((int) loadmask2); - delta3 = _mm_popcnt_u32((int) loadmask3); - // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) - // write out the job 
state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; - _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; - _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; diff --git a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc b/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc deleted file mode 100644 index 15cca7c938b..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/fsst_avx512_unroll4.inc +++ /dev/null @@ -1,228 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// -// -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// -// -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without 
limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// furnished to do so, subject to the following conditions: -// -// -// -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-// -// -// -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E4PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-// -// -// -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -// -// -// -// - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). - // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask4=11111111, delta4=8). - job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; - job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; - job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; - job4 = _mm512_mask_expandloadu_epi64(job4, loadmask4, input); input += delta4; - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - // load the next 8 input string bytes (uncompressed data, aka 'symbols'). - __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); - __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); - __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); - __m512i word4 = _mm512_i64gather_epi64(_mm512_srli_epi64(job4, 46), symbolBase, 1); - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. 
It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. - // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - // code4: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). - __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - __m512i code4 = _mm512_i64gather_epi64(_mm512_and_epi64(word4, all_FFFF), symbolTable.shortCodes, sizeof(u16)); - // get the first three bytes of the string. - // get the first three bytes of the string. - // get the first three bytes of the string. - // get the first three bytes of the string. 
- __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); - __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); - __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); - __m512i pos4 = _mm512_mullo_epi64(_mm512_and_epi64(word4, all_FFFFFF), all_PRIME); - // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT - // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT - // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT - // hash them into a random number: pos4 = pos4*PRIME; pos4 ^= pos4>>SHIFT - pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); - pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); - pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); - pos4 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos4,_mm512_srli_epi64(pos4,FSST_SHIFT)), all_HASH), 4); - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - // lookup in the 3-byte-prefix keyed hash table - __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); - __m512i icl4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 8), 1); - // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). 
- // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). - // speculatively store the first input byte into the second position of the write4 register (in case it turns out to be an escaped byte). - __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); - __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); - __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); - __m512i write4 = _mm512_slli_epi64(_mm512_and_epi64(word4, all_FF), 8); - // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - // lookup just like the icl4 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. - __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); - __m512i symb4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 0), 1); - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). - // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
- pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); - pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); - pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); - pos4 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl4, all_FF)); - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). - __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); - __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); - __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); - __mmask8 match4 = _mm512_cmpeq_epi64_mask(symb4, _mm512_and_epi64(word4, pos4)) & _mm512_cmplt_epi64_mask(icl4, all_ICL_FREE); - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. - // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. 
- code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); - code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); - code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); - code4 = _mm512_mask_mov_epi64(code4, match4, _mm512_srli_epi64(icl4, 16)); - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. - write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); - write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); - write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); - write4 = _mm512_or_epi64(write4, _mm512_and_epi64(code4, all_FF)); - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) - code1 = _mm512_and_epi64(code1, all_FFFF); - code2 = _mm512_and_epi64(code2, all_FFFF); - code3 = _mm512_and_epi64(code3, all_FFFF); - code4 = _mm512_and_epi64(code4, all_FFFF); - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); - _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job4, all_M19), write4, 1); - // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - // increase the job4.cur field in the job with the symbol length (for this, shift away 12 bits from the code) - job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); - job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); - job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); - job4 = _mm512_add_epi64(job4, _mm512_slli_epi64(_mm512_srli_epi64(code4, FSST_LEN_BITS), 46)); - // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - // increase the job4.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) - job1 = 
_mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); - job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); - job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); - job4 = _mm512_add_epi64(job4, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code4, 8), all_ONE))); - // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) - // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) - // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) - // test which lanes are done now (job4.cur==job4.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job4 register) - loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); - loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); - loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); - loadmask4 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job4, 46), _mm512_and_epi64(_mm512_srli_epi64(job4, 28), all_M18)); - // calculate the amount of lanes in job1 that are done - // calculate the amount of lanes in job2 that are done - // calculate the amount of lanes in job3 that are done - // calculate the amount of lanes in job4 that are done - delta1 = _mm_popcnt_u32((int) loadmask1); - delta2 = _mm_popcnt_u32((int) loadmask2); - delta3 = _mm_popcnt_u32((int) loadmask3); - delta4 = _mm_popcnt_u32((int) loadmask4); - // write out the job state for the lanes that are done (we 
need the final 'job1.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) - // write out the job state for the lanes that are done (we need the final 'job4.out' value to compute the compressed string length) - _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; - _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; - _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; - _mm512_mask_compressstoreu_epi64(output, loadmask4, job4); output += delta4; diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.cpp b/cpp/src/parquet/thirdparty/fsst/libfsst.cpp deleted file mode 100644 index e3ba787b959..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/libfsst.cpp +++ /dev/null @@ -1,651 +0,0 @@ -// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -#include "libfsst.hpp" - -namespace libfsst { -Symbol concat(Symbol a, Symbol b) { - Symbol s; - u32 length = a.length()+b.length(); - if (length > Symbol::maxLength) length = Symbol::maxLength; - s.set_code_len(FSST_CODE_MASK, length); - s.store_num((b.load_num() << (8*a.length())) | a.load_num()); - return s; -} -} // namespace libfsst - -namespace std { -template <> -class hash { - public: - size_t operator()(const libfsst::QSymbol& q) const { - uint64_t k = q.symbol.load_num(); - const uint64_t m = 0xc6a4a7935bd1e995; - const int r = 47; - uint64_t h = 0x8445d61a4e774912 ^ (8*m); - k *= m; - k ^= k >> r; - k *= m; - h ^= k; - h *= m; - h ^= h >> r; - h *= m; - h ^= h >> r; - return h; - } -}; -} - -namespace libfsst { -bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } - -std::ostream& operator<<(std::ostream& out, const Symbol& s) { - for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { - SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); - int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) - size_t sampleFrac = 128; - - // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator - st->zeroTerminated = zeroTerminated; - if (zeroTerminated) { - st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency - } else { - u16 byteHisto[256]; - memset(byteHisto, 0, sizeof(byteHisto)); - for(size_t i=0; iterminator = 256; - while(i-- > 0) { - if (byteHisto[i] > minSize) continue; - st->terminator = i; - minSize = byteHisto[i]; - } - } - assert(st->terminator != 256); - - // a random number between 0 and 128 - auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; - - // compress sample, and compute (pair-)frequencies - auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain - int gain = 0; - - for(size_t i=0; i sampleFrac) continue; - } - if (cur < end) { - u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); - cur += st->symbols[code1].length(); - gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); - while (true) { - // count single symbol (i.e. an option is not extending it) - counters.count1Inc(code1); - - // as an alternative, consider just using the next byte.. - if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly - counters.count1Inc(*start); - - if (cur==end) { - break; - } - - // now match a new symbol - start = cur; - if (curhashTabSize-1); - Symbol s = st->hashTab[idx]; - code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; - word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) { - code2 = s.code(); - cur += s.length(); - } else if (code2 >= FSST_CODE_BASE) { - cur += 2; - } else { - code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; - cur += 1; - } - } else { - code2 = st->findLongestSymbol(cur, end); - cur += st->symbols[code2].length(); - } - - // compute compressed output size - gain += ((int) (cur-start))-(1+isEscapeCode(code2)); - - if (sampleFrac < 128) { // no need to count pairs in final round - // consider the symbol that is the concatenation of the two last symbols - counters.count2Inc(code1, code2); - - // as an alternative, consider just extending with the next byte.. - if ((cur-start) > 1) // ..but do not count single byte extensions doubly - counters.count2Inc(code1, *start); - } - code1 = code2; - } - } - } - return gain; - }; - - auto makeTable = [&](SymbolTable *st, Counters &counters) { - // hashmap of c (needed because we can generate duplicate candidates) - unordered_set cands; - - // artificially make terminater the most frequent symbol so it gets included - u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; - counters.count1Set(terminator,65535); - - auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { - if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! 
- QSymbol q; - q.symbol = s; - q.gain = count * s.length(); - auto it = cands.find(q); - if (it != cands.end()) { - q.gain += (*it).gain; - cands.erase(*it); - } - cands.insert(q); - }; - - // add candidate symbols based on counted frequency - for (u32 pos1=0; pos1nSymbols; pos1++) { - u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! - if (!cnt1) continue; - - // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed - Symbol s1 = st->symbols[pos1]; - addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); - - if (sampleFrac >= 128 || // last round we do not create new (combined) symbols - s1.length() == Symbol::maxLength || // symbol cannot be extended - s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte - continue; - } - for (u32 pos2=0; pos2nSymbols; pos2++) { - u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! - if (!cnt2) continue; - - // create a new symbol - Symbol s2 = st->symbols[pos2]; - Symbol s3 = concat(s1, s2); - if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte - addOrInc(cands, s3, cnt2); - } - } - - // insert candidates into priority queue (by gain) - auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); }; - priority_queue,decltype(cmpGn)> pq(cmpGn); - for (auto& q : cands) - pq.push(q); - - // Create new symbol map using best candidates - st->clear(); - while (st->nSymbols < 255 && !pq.empty()) { - QSymbol q = pq.top(); - pq.pop(); - st->add(q.symbol); - } - }; - - u8 bestCounters[512*sizeof(u16)]; -#ifdef NONOPT_FSST - for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { - sampleFrac = frac; -#else - for(sampleFrac=8; true; sampleFrac += 30) { -#endif - memset(&counters, 0, sizeof(Counters)); - long gain = compressCount(st, counters); - if (gain >= bestGain) 
{ // a new best solution! - counters.backup1(bestCounters); - *bestTable = *st; bestGain = gain; - } - if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) - makeTable(st, counters); - } - delete st; - counters.restore1(bestCounters); - makeTable(bestTable, counters); - bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression - return bestTable; -} - -#ifndef NONOPT_FSST -static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { - size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; - u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings - SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer - SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) - size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) - - while (curLine < nlines && outOff <= (1<<19)) { - size_t prevLine = curLine, chunk, curOff = 0; - - // bail out if the output buffer cannot hold the compressed next string fully - if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 - else budget -= (len[curLine]-curOff)*2; - - strOut[curLine] = (u8*) 0; - lenOut[curLine] = 0; - - do { - do { - chunk = len[curLine] - curOff; - if (chunk > 511) { - chunk = 511; // large strings need to be chopped up into segments of 511 bytes - } - // create a job in this batch - SIMDjob job; - job.cur = inOff; - job.end = job.cur + chunk; - job.pos = batchPos; - job.out = outOff; - - // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) - outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) 
bytes. - if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk - - // register job in this batch - input[batchPos] = job; - jobLine[batchPos] = curLine; - - if (chunk == 0) { - empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out - } else { - // copy string chunk into temp buffer - memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); - inOff += chunk; - curOff += chunk; - symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded - } - if (++batchPos == 512) break; - } while(curOff < len[curLine]); - - if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines) || (((len[curLine])*2 + 7) > budget)) { // cannot accumulate more? - if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) - // radix-sort jobs on length (longest string first) - // -- this provides best load balancing and allows to skip empty jobs at the end - u16 sortpos[513]; - memset(sortpos, 0, sizeof(sortpos)); - - // calculate length histo - for(size_t i=0; i> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { - *out++ = (u8) s.code(); cur += s.length(); - } else { - // could be a 2-byte or 1-byte code, or miss - // handle everything with predication - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); - cur += (code>>FSST_LEN_BITS); - } - } - job.out = out - codeBase; - } - // postprocess job info - job.cur = 0; - job.end = job.out - input[job.pos].out; // misuse .end field as compressed size - job.out = input[job.pos].out; // reset offset to start of encoded string - input[job.pos] = job; - } - - // copy out the result data - for(size_t i=0; i> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { - *out++ = (u8) s.code(); cur += s.length(); - } else if (avoidBranch) { - // could be a 2-byte or 1-byte code, or miss - // handle everything with predication - 
*out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); - cur += (code>>FSST_LEN_BITS); - } else if ((u8) code < byteLim) { - // 2 byte code after checking there is no longer pattern - *out++ = (u8) code; cur += 2; - } else { - // 1 byte code or miss. - *out = (u8) code; - out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse - cur++; - } - } - } - }; - - for(curLine=0; curLine 511) { - chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST - } - if ((2*chunk+7) > (size_t) (lim-out)) { - return curLine; // out of memory - } - // copy the string to the 511-byte buffer - memcpy(buf, cur, chunk); - buf[chunk] = (u8) symbolTable.terminator; - cur = buf; - end = cur + chunk; - - // based on symboltable stats, choose a variant that is nice to the branch predictor - if (noSuffixOpt) { - compressVariant(true,false); - } else if (avoidBranch) { - compressVariant(false,true); - } else { - compressVariant(false, false); - } - } while((curOff += chunk) < lenIn[curLine]); - lenOut[curLine] = (size_t) (out - strOut[curLine]); - } - return curLine; -} - -#define FSST_SAMPLELINE ((size_t) 512) - -// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes -vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { - size_t totSize = 0; - const size_t *lenIn = *lenRef; - vector sample; - - for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample - Encoder *encoder = new Encoder(); - encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); - if (sampleLen != lenIn) delete[] sampleLen; - delete[] sampleBuf; - return (fsst_encoder_t*) encoder; -} - -/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol 
table */ -extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { - Encoder *e = new Encoder(); - e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr - return (fsst_encoder_t*) e; -} - -// export a symbol table in compact format. -extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { - Encoder *e = (Encoder*) encoder; - // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. - // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t - // (such functionality could be useful to append compressed data to an existing block). - // - // However, the hash function in the encoder hash table is endian-sensitive, and given its - // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. - // Doing a endian-conversion during hashing will be slow and self-defeating. - // - // Overall, we could support reconstructing an encoder for incremental compression, but - // should enforce equal-endianness. Bit of a bummer. Not going there now. 
- // - // The version field is now there just for future-proofness, but not used yet - - // version allows keeping track of fsst versions, track endianness, and encoder reconstruction - u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 - (((u64) e->symbolTable->suffixLim) << 24) | - (((u64) e->symbolTable->terminator) << 16) | - (((u64) e->symbolTable->nSymbols) << 8) | - FSST_ENDIAN_MARKER; // least significant byte is nonzero - - version = swap64_if_be(version); // ensure version is little-endian encoded - - /* do not assume unaligned reads here */ - memcpy(buf, &version, 8); - buf[8] = e->symbolTable->zeroTerminated; - for(u32 i=0; i<8; i++) - buf[9+i] = (u8) e->symbolTable->lenHisto[i]; - u32 pos = 17; - - // emit only the used bytes of the symbols - for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) - for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) - buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes - - return pos; // length of what was serialized -} - -#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ - -extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 const *buf) { - u64 version = 0; - u32 code, pos = 17; - u8 lenHisto[8]; - - // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) - memcpy(&version, buf, 8); - version = swap64_if_be(version); // version is always little-endian encoded - - if ((version>>32) != FSST_VERSION) return 0; - decoder->zeroTerminated = buf[8]&1; - memcpy(lenHisto, buf+9, 8); - - // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) - decoder->len[0] = 1; - decoder->symbol[0] = 0; - - // we use lenHisto[0] as 1-byte symbol run length (at the end) - code = decoder->zeroTerminated; - if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end 
- - // now get all symbols from the buffer - for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ - for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { - decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ - decoder->symbol[code] = 0; - for(u32 j=0; jlen[code]; j++) - ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols - } - } - if (decoder->zeroTerminated) lenHisto[0]++; - - // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). - while(code<255) { - decoder->symbol[code] = FSST_CORRUPT; - decoder->len[code++] = 8; - } - return pos; -} - -// runtime check for simd -inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { -#ifndef NONOPT_FSST - if (simd && fsst_hasAVX512()) - return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); -#endif - (void) simd; - return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); -} -size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { - return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); -} - -// adaptive choosing of scalar compression method based on symbol length histogram -inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { - bool avoidBranch = false, noSuffixOpt = false; - if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { - noSuffixOpt = true; - } else if 
((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && - (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && - (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { - avoidBranch = true; - } - return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); -} -size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { - return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); -} -} // namespace libfsst - -using namespace libfsst; -// the main compression function (everything automatic) -extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { - // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) - size_t totLen = accumulate(lenIn, lenIn+nlines, 0); - int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); - return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); -} - -/* deallocate encoder */ -extern "C" void fsst_destroy(fsst_encoder_t* encoder) { - Encoder *e = (Encoder*) encoder; - delete e; -} - -/* very lazy implementation relying on export and import */ -extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { - u8 buf[sizeof(fsst_decoder_t)]; - u32 cnt1 = fsst_export(encoder, buf); - fsst_decoder_t decoder; - u32 cnt2 = fsst_import(&decoder, buf); - assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; - return decoder; -} diff --git a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp b/cpp/src/parquet/thirdparty/fsst/libfsst.hpp deleted file mode 100644 index e4d7c0aa9b9..00000000000 --- a/cpp/src/parquet/thirdparty/fsst/libfsst.hpp +++ /dev/null @@ -1,474 +0,0 @@ 
-// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): -// -// Copyright 2018-2020, CWI, TU Munich, FSU Jena -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files -// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, -// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-// -// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -#include "fsst.h" // the official FSST API -- also usable by C mortals - -/* unsigned integers */ -namespace libfsst { -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; -} // namespace libfsst - -#if UINTPTR_MAX == 0xffffffffU -// We're on a 32-bit platform -#define NONOPT_FSST -#endif - -#define FSST_ENDIAN_MARKER ((u64) 1) -#define FSST_VERSION_20190218 20190218 -#define FSST_VERSION ((u64) FSST_VERSION_20190218) - -// "symbols" are character sequences (up to 8 bytes) -// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: -// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. - -// we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length -#define FSST_LEN_BITS 12 -#define FSST_CODE_BITS 9 -#define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ -#define FSST_CODE_MAX (1UL<=8) { - len = 8; - memcpy(val.str, input, 8); - } else { - memcpy(val.str, input, len); - } - set_code_len(FSST_CODE_MAX, len); - } - void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } - - u64 load_num() const { return swap64_if_be(val.num); } - void store_num(u64 v) { val.num = swap64_if_be(v); } - - u32 length() const { return (u32) (icl >> 28); } - u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } - u32 ignoredBits() const { return (u32) icl; } - - u8 first() const { assert( length() >= 1); return 0xFF & load_num(); } - u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } - -#define FSST_HASH_LOG2SIZE 10 -#define FSST_HASH_PRIME 2971215073LL -#define FSST_SHIFT 15 -#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) - size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes -}; - -// Symbol that can be put in a queue, ordered on gain -struct QSymbol{ - Symbol symbol; - mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols - bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } -}; - -// we construct FSST symbol tables using a random sample of about 16KB (1<<14) -#define FSST_SAMPLETARGET (1<<14) -#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) - -// two phases of compression, before and after optimize(): -// -// (1) to encode values we probe (and maintain) three datastructures: -// - u16 byteCodes[256] array at the position of the next byte (s.length==1) -// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) -// - Symbol 
hashtable[1024] (keyed by the next three bytes, ie for s.length>2), -// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are -// pseudo codes representing a single byte these will become escapes) -// -// (2) when we finished looking for the best symbol table we call optimize() to reshape it: -// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) -// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes -// (allows shortcut during compression) -// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding -// to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one -// byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] -// and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction -// is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was -// hence added to make symbolTable construction faster. 
-// -// this final layout allows for the fastest compression code, only currently present in compressBulk - -// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 -#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket - -// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key -// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster -// -// the gain field is only used in the symbol queue that sorts symbols on gain - -struct SymbolTable { - static const u32 hashTabSize = 1<> (u8) s.icl)); - return true; - } - bool add(Symbol s) { - assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); - u32 len = s.length(); - s.set_code_len(FSST_CODE_BASE + nSymbols, len); - if (len == 1) { - byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { - return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol - } - if (s.length() >= 2) { - u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; - if (code >= FSST_CODE_BASE) return code; - } - return byteCodes[s.first()] & FSST_CODE_MASK; - } - u16 findLongestSymbol(const u8* cur, const u8* end) const { - return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol - } - - // rationale for finalize: - // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() - // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() - // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) - // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher 
compression perf. - // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). - // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] - // Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte - // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. - // - // In all, we change the layout and coding, as follows.. - // - // before finalize(): - // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 - // - The first 256 codes are pseudo symbols (all escaped bytes) - // - // after finalize(): - // - table layout is symbols[0..nSymbols>, with nSymbols < 256. - // - Real codes are [0,nSymbols>. 8-th bit not set. - // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 - // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) - // the two-byte codes are split in two sections: - // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression - // - // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
- // - void finalize(u8 zeroTerminated) { - assert(nSymbols <= 255); - u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); - - // compute running sum of code lengths (starting offsets for each length) - rsum[0] = byteLim; // 1-byte codes are highest - rsum[1] = zeroTerminated; - for(u32 i=1; i<7; i++) - rsum[i+1] = rsum[i] + lenHisto[i]; - - // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim) - suffixLim = rsum[1]; - symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) - - for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s - opt = 0; - } - newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim - } else - newCode[i] = rsum[len-1]++; - s1.set_code_len(newCode[i],len); - symbols[newCode[i]] = s1; - } - // renumber the codes in byteCodes[] - for(u32 i=0; i<256; i++) - if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) - byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); - else - byteCodes[i] = 511 + (1 << FSST_LEN_BITS); - - // renumber the codes in shortCodes[] - for(u32 i=0; i<65536; i++) - if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) - shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); - else - shortCodes[i] = byteCodes[i&0xFF]; - - // replace the symbols in the hash table - for(u32 i=0; i>8; - } - void count1Inc(u32 pos1) { - if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) - count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... - } - void count2Inc(u32 pos1, u32 pos2) { - if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). 
This means (high > 0) <=> (cnt > 0) - // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively - count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample) - } - u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range - // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros - u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7] - - u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes - high = (high >> (zero << 3)) & 255; // advance to nonzero counter - if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 - return 0; // all zero - - u32 low = count1Low[pos1]; - if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) - return (u32) ((high << 8) + low); - } - u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range - // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros - u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15] - high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters - - u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters - high = (high >> (zero << 2)) & 15; // advance to nonzero counter - if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! 
advance pos2 - return 0UL; // all zero - - u32 low = count2Low[pos1][pos2]; - if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) - return (u32) ((high << 8) + low); - } - void backup1(u8 *buf) { - memcpy(buf, count1High, FSST_CODE_MAX); - memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX); - } - void restore1(u8 *buf) { - memcpy(count1High, buf, FSST_CODE_MAX); - memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); - } -}; -#endif - - -#define FSST_BUFSZ (3<<19) // 768KB - -// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression -struct Encoder { - shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) - union { - Counters counters; // for counting symbol occurences during map construction - u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) - }; -}; - -// job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) -struct SIMDjob { - u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) -}; - -extern bool -fsst_hasAVX512(); // runtime check for avx512 capability - -extern size_t -fsst_compressAVX512( - SymbolTable &symbolTable, - u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) - u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) - SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. - SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer. 
- size_t n, // IN: size of arrays input and output (should be max 512) - size_t unroll); // IN: degree of SIMD unrolling - -// Symbol manipulation -Symbol concat(Symbol a, Symbol b); - -// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) -size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); -size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); -} // namespace libfsst diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 7ba1f4f876b..ad84ffc98ba 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -66,6 +66,8 @@ ARROW_CARES_BUILD_VERSION=1.17.2 ARROW_CARES_BUILD_SHA256_CHECKSUM=4803c844ce20ce510ef0eb83f8ea41fa24ecaae9d280c468c582d2bb25b3913d ARROW_CRC32C_BUILD_VERSION=1.1.2 ARROW_CRC32C_BUILD_SHA256_CHECKSUM=ac07840513072b7fcebda6e821068aa04889018f24e10e46181068fb214d7e56 +ARROW_FSST_BUILD_VERSION=89f49c580c6388acf3b6ed2a49e1bfde6c05e616 +ARROW_FSST_BUILD_SHA256_CHECKSUM=5921d2b837800e1c886903d18971994587328b3e5d08d32eb8e80c00890c304d ARROW_GBENCHMARK_BUILD_VERSION=v1.8.3 ARROW_GBENCHMARK_BUILD_SHA256_CHECKSUM=6bc180a57d23d4d9515519f92b0c83d61b05b5bab188961f36ac7b06b0d9e9ce ARROW_GFLAGS_BUILD_VERSION=v2.2.2 @@ -144,6 +146,7 @@ DEPENDENCIES=( "ARROW_BZIP2_URL bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" "ARROW_CARES_URL cares-${ARROW_CARES_BUILD_VERSION}.tar.gz https://github.com/c-ares/c-ares/releases/download/cares-${ARROW_CARES_BUILD_VERSION//./_}/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" "ARROW_CRC32C_URL crc32c-${ARROW_CRC32C_BUILD_VERSION}.tar.gz https://github.com/google/crc32c/archive/refs/tags/${ARROW_CRC32C_BUILD_VERSION}.tar.gz" + "ARROW_FSST_URL 
fsst-${ARROW_FSST_BUILD_VERSION}.tar.gz https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz" "ARROW_GBENCHMARK_URL gbenchmark-${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz https://github.com/google/benchmark/archive/${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz" "ARROW_GFLAGS_URL gflags-${ARROW_GFLAGS_BUILD_VERSION}.tar.gz https://github.com/gflags/gflags/archive/${ARROW_GFLAGS_BUILD_VERSION}.tar.gz" "ARROW_GLOG_URL glog-${ARROW_GLOG_BUILD_VERSION}.tar.gz https://github.com/google/glog/archive/${ARROW_GLOG_BUILD_VERSION}.tar.gz" From 68cc6f236b0f3a8f1f4996800d556207827fe2aa Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 08:36:45 +0000 Subject: [PATCH 04/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 8d67849ea47..2ce366f6c9a 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2624,15 +2624,16 @@ function(build_fsst) URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") prepare_fetchcontent() - fetchcontent_makeavailable(fsst) + fetchcontent_getproperties(fsst) + if(NOT fsst_POPULATED) + fetchcontent_populate(fsst) + endif() - set(ARROW_FSST_INCLUDE_DIR - "${fsst_SOURCE_DIR}" - CACHE INTERNAL "FSST include directory") + set(ARROW_FSST_INCLUDE_DIR "${fsst_SOURCE_DIR}" PARENT_SCOPE) set(ARROW_FSST_SOURCES "${fsst_SOURCE_DIR}/libfsst.cpp;${fsst_SOURCE_DIR}/fsst_avx512.cpp" - CACHE INTERNAL "FSST source files") - set(FSST_VENDORED TRUE CACHE INTERNAL "Whether FSST is built from source") + PARENT_SCOPE) + set(FSST_VENDORED TRUE PARENT_SCOPE) endfunction() if(ARROW_WITH_FSST) From 05506fab80de3293dc1363366694e9c2792c4ecd Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 08:54:13 +0000 Subject: [PATCH 05/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 3 +++ 1 file changed, 3 
insertions(+) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 2ce366f6c9a..ada1654484a 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2637,6 +2637,9 @@ function(build_fsst) endfunction() if(ARROW_WITH_FSST) + if("${fsst_SOURCE}" STREQUAL "") + set(fsst_SOURCE "BUNDLED") + endif() resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() From 5935b9e47abba492a5f013c7aa3b1cfb53ffbf61 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 09:31:16 +0000 Subject: [PATCH 06/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ada1654484a..ceb2c20ac49 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2637,9 +2637,7 @@ function(build_fsst) endfunction() if(ARROW_WITH_FSST) - if("${fsst_SOURCE}" STREQUAL "") - set(fsst_SOURCE "BUNDLED") - endif() + set(fsst_SOURCE "BUNDLED" CACHE STRING "Source of fsst dependency" FORCE) resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() From ff3b8f73ebfab93a30333d6a8bd8c00fb85d5593 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 10:32:46 +0000 Subject: [PATCH 07/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ceb2c20ac49..245868948b8 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -73,6 +73,8 @@ set(ARROW_THIRDPARTY_DEPENDENCIES ZLIB zstd) +set(fsst_SOURCE "BUNDLED" CACHE STRING "Source of fsst dependency") + # For backward compatibility. 
We use "BOOST_SOURCE" if "Boost_SOURCE" # isn't specified and "BOOST_SOURCE" is specified. # We renamed "BOOST" dependency name to "Boost" in 3.0.0 because @@ -2637,7 +2639,9 @@ function(build_fsst) endfunction() if(ARROW_WITH_FSST) - set(fsst_SOURCE "BUNDLED" CACHE STRING "Source of fsst dependency" FORCE) + if(NOT fsst_SOURCE STREQUAL "BUNDLED") + message(FATAL_ERROR "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED.") + endif() resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() From f5e2bdba8fbd6e5a21bc786d2474108a0873cf8c Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 11:12:49 +0000 Subject: [PATCH 08/16] update --- cpp/src/parquet/CMakeLists.txt | 4 ++++ cpp/src/parquet/encoding_test.cc | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index a2d300d684d..66c9d08cb76 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -205,6 +205,10 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) list(APPEND PARQUET_PRIVATE_INCLUDE_DIRS ${ARROW_FSST_INCLUDE_DIR}) list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) endif() +if(DEFINED ARROW_FSST_SOURCES) + set_source_files_properties(${ARROW_FSST_SOURCES} + PROPERTIES COMPILE_OPTIONS "-Wno-error=shorten-64-to-32") +endif() if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. 
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index a43bafd0960..ebc86f14853 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -2886,7 +2886,7 @@ TEST(TestFsstEncoding, HeavyNullsDecodeSpaced) { } writer.Finish(); - const int null_count = binary.null_count(); + const int null_count = static_cast(binary.null_count()); ASSERT_EQ(static_cast(values->length() - null_count), decoder->DecodeSpaced(decoded.data(), static_cast(values->length()), null_count, valid_bits.data(), 0)); From 7c105d4d04b56101a1294971f170d90cc5dd38f1 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 11:47:43 +0000 Subject: [PATCH 09/16] update --- cpp/src/parquet/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 66c9d08cb76..38eb0dc59f1 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -207,7 +207,9 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) endif() if(DEFINED ARROW_FSST_SOURCES) set_source_files_properties(${ARROW_FSST_SOURCES} - PROPERTIES COMPILE_OPTIONS "-Wno-error=shorten-64-to-32") + PROPERTIES + COMPILE_OPTIONS + "$<$:-Wno-error=shorten-64-to-32>") endif() if(ARROW_HAVE_RUNTIME_AVX2) From 511776f0b43697af94d12de760061a107c99368a Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 14:22:07 +0000 Subject: [PATCH 10/16] update --- cpp/src/parquet/CMakeLists.txt | 9 +++++---- cpp/src/parquet/fsst_compat.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 cpp/src/parquet/fsst_compat.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 38eb0dc59f1..e2b1db09633 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -206,10 +206,11 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) endif() if(DEFINED 
ARROW_FSST_SOURCES) - set_source_files_properties(${ARROW_FSST_SOURCES} - PROPERTIES - COMPILE_OPTIONS - "$<$:-Wno-error=shorten-64-to-32>") + set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS + "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" + "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" + "$<$:/wd4244>" + "$<$,$>>:-include${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") endif() if(ARROW_HAVE_RUNTIME_AVX2) diff --git a/cpp/src/parquet/fsst_compat.h b/cpp/src/parquet/fsst_compat.h new file mode 100644 index 00000000000..2da27d27301 --- /dev/null +++ b/cpp/src/parquet/fsst_compat.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +// Provide minimal compatibility shims so third-party FSST sources +// can be compiled with the compilers Arrow supports. + +#if defined(_WIN32) && !defined(_MSC_VER) +#include + +// MinGW does not provide __cpuidex, but FSST only needs the CPUID +// leaf/sub-leaf variant that __cpuid_count implements. 
+static inline void arrow_fsst_cpuidex(int info[4], int function_id, int subfunction_id) { + __cpuid_count(function_id, subfunction_id, info[0], info[1], info[2], info[3]); +} +#define __cpuidex(info, function_id, subfunction_id) \ + arrow_fsst_cpuidex(info, function_id, subfunction_id) +#endif From 0344d45478bf593af40c9c2543065ed6014dbd41 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 14:41:34 +0000 Subject: [PATCH 11/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 245868948b8..39bda3d2668 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -647,8 +647,7 @@ if(DEFINED ENV{ARROW_FSST_URL}) set(FSST_SOURCE_URL "$ENV{ARROW_FSST_URL}") else() set_urls(FSST_SOURCE_URL - "https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz" - "${THIRDPARTY_MIRROR_URL}/fsst-${ARROW_FSST_BUILD_VERSION}.tar.gz") + "https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz") endif() if(DEFINED ENV{ARROW_GBENCHMARK_URL}) From f9b43c48f2a1b596cb080378dc7fe146a7f30e80 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 24 Nov 2025 15:52:53 +0000 Subject: [PATCH 12/16] update --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 26 +++++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 39bda3d2668..33db56e1969 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -643,11 +643,14 @@ else() ) endif() +set(FSST_SOURCE_URL "") +set(FSST_GIT_REPOSITORY "") if(DEFINED ENV{ARROW_FSST_URL}) set(FSST_SOURCE_URL "$ENV{ARROW_FSST_URL}") +elseif(DEFINED ENV{ARROW_FSST_GIT_REPOSITORY}) + set(FSST_GIT_REPOSITORY "$ENV{ARROW_FSST_GIT_REPOSITORY}") else() - 
set_urls(FSST_SOURCE_URL - "https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz") + set(FSST_GIT_REPOSITORY "https://github.com/cwida/fsst.git") endif() if(DEFINED ENV{ARROW_GBENCHMARK_URL}) @@ -2620,9 +2623,22 @@ endif() function(build_fsst) message(STATUS "Building FSST from source using FetchContent") - fetchcontent_declare(fsst - URL ${FSST_SOURCE_URL} - URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") + if(FSST_SOURCE_URL) + fetchcontent_declare(fsst + ${FC_DECLARE_COMMON_OPTIONS} + URL ${FSST_SOURCE_URL} + URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") + else() + if(NOT FSST_GIT_REPOSITORY) + message(FATAL_ERROR "FSST_GIT_REPOSITORY is not set and no FSST_SOURCE_URL override was provided.") + endif() + fetchcontent_declare(fsst + ${FC_DECLARE_COMMON_OPTIONS} + GIT_REPOSITORY ${FSST_GIT_REPOSITORY} + GIT_TAG ${ARROW_FSST_BUILD_VERSION} + GIT_SHALLOW TRUE + GIT_PROGRESS TRUE) + endif() prepare_fetchcontent() fetchcontent_getproperties(fsst) From c49f19c771eaee4d46a56eaecde0df625adfd58f Mon Sep 17 00:00:00 2001 From: arnavb Date: Tue, 25 Nov 2025 07:31:32 +0000 Subject: [PATCH 13/16] vendor fsst --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 39 +- cpp/thirdparty/fsst/fsst.h | 227 +++++++ cpp/thirdparty/fsst/fsst_avx512.cpp | 150 +++++ cpp/thirdparty/fsst/fsst_avx512_unroll1.inc | 57 ++ cpp/thirdparty/fsst/fsst_avx512_unroll2.inc | 114 ++++ cpp/thirdparty/fsst/fsst_avx512_unroll3.inc | 171 +++++ cpp/thirdparty/fsst/fsst_avx512_unroll4.inc | 228 +++++++ cpp/thirdparty/fsst/libfsst.cpp | 651 ++++++++++++++++++++ cpp/thirdparty/fsst/libfsst.hpp | 471 ++++++++++++++ 9 files changed, 2072 insertions(+), 36 deletions(-) create mode 100644 cpp/thirdparty/fsst/fsst.h create mode 100644 cpp/thirdparty/fsst/fsst_avx512.cpp create mode 100644 cpp/thirdparty/fsst/fsst_avx512_unroll1.inc create mode 100644 cpp/thirdparty/fsst/fsst_avx512_unroll2.inc create mode 100644 cpp/thirdparty/fsst/fsst_avx512_unroll3.inc create mode 100644 
cpp/thirdparty/fsst/fsst_avx512_unroll4.inc create mode 100644 cpp/thirdparty/fsst/libfsst.cpp create mode 100644 cpp/thirdparty/fsst/libfsst.hpp diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 33db56e1969..c0c84511c1d 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -643,16 +643,6 @@ else() ) endif() -set(FSST_SOURCE_URL "") -set(FSST_GIT_REPOSITORY "") -if(DEFINED ENV{ARROW_FSST_URL}) - set(FSST_SOURCE_URL "$ENV{ARROW_FSST_URL}") -elseif(DEFINED ENV{ARROW_FSST_GIT_REPOSITORY}) - set(FSST_GIT_REPOSITORY "$ENV{ARROW_FSST_GIT_REPOSITORY}") -else() - set(FSST_GIT_REPOSITORY "https://github.com/cwida/fsst.git") -endif() - if(DEFINED ENV{ARROW_GBENCHMARK_URL}) set(GBENCHMARK_SOURCE_URL "$ENV{ARROW_GBENCHMARK_URL}") else() @@ -2621,34 +2611,11 @@ if(ARROW_USE_XSIMD) endif() function(build_fsst) - message(STATUS "Building FSST from source using FetchContent") - - if(FSST_SOURCE_URL) - fetchcontent_declare(fsst - ${FC_DECLARE_COMMON_OPTIONS} - URL ${FSST_SOURCE_URL} - URL_HASH "SHA256=${ARROW_FSST_BUILD_SHA256_CHECKSUM}") - else() - if(NOT FSST_GIT_REPOSITORY) - message(FATAL_ERROR "FSST_GIT_REPOSITORY is not set and no FSST_SOURCE_URL override was provided.") - endif() - fetchcontent_declare(fsst - ${FC_DECLARE_COMMON_OPTIONS} - GIT_REPOSITORY ${FSST_GIT_REPOSITORY} - GIT_TAG ${ARROW_FSST_BUILD_VERSION} - GIT_SHALLOW TRUE - GIT_PROGRESS TRUE) - endif() - - prepare_fetchcontent() - fetchcontent_getproperties(fsst) - if(NOT fsst_POPULATED) - fetchcontent_populate(fsst) - endif() + message(STATUS "Configuring vendored FSST sources") - set(ARROW_FSST_INCLUDE_DIR "${fsst_SOURCE_DIR}" PARENT_SCOPE) + set(ARROW_FSST_INCLUDE_DIR "${ARROW_SOURCE_DIR}/thirdparty/fsst" PARENT_SCOPE) set(ARROW_FSST_SOURCES - "${fsst_SOURCE_DIR}/libfsst.cpp;${fsst_SOURCE_DIR}/fsst_avx512.cpp" + 
"${ARROW_SOURCE_DIR}/thirdparty/fsst/libfsst.cpp;${ARROW_SOURCE_DIR}/thirdparty/fsst/fsst_avx512.cpp" PARENT_SCOPE) set(FSST_VENDORED TRUE PARENT_SCOPE) endfunction() diff --git a/cpp/thirdparty/fsst/fsst.h b/cpp/thirdparty/fsst/fsst.h new file mode 100644 index 00000000000..71085d57201 --- /dev/null +++ b/cpp/thirdparty/fsst/fsst.h @@ -0,0 +1,227 @@ +/* + * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 + * + * =================================================================================================================================== + * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): + * + * Copyright 2018-2020, CWI, TU Munich, FSU Jena + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst + * =================================================================================================================================== + * + * FSST: Fast Static Symbol Table compression + * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf + * + * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. + * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual + * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is + * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. + * + * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) + * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression + * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could + * be seen as strings again and fit in whatever your program is that manipulates strings. + * + * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. + * + * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of + * unsigned char* pointers to their starts. A seperate length array (of unsigned int) denotes how many bytes each string consists of. 
+ * + * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program + * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings). + * + * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are + * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length. + */ +#ifndef FSST_INCLUDED_H +#define FSST_INCLUDED_H + +#ifdef _MSC_VER +#define __restrict__ +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#define __ORDER_LITTLE_ENDIAN__ 2 +#include +static inline int __builtin_ctzl(unsigned long long x) { + unsigned long ret; + _BitScanForward64(&ret, x); + return (int)ret; +} +#endif + +#ifdef __cplusplus +#define FSST_FALLTHROUGH [[fallthrough]] +#include +extern "C" { +#else +#define FSST_FALLTHROUGH +#endif + +#include + +/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */ +#define FSST_ESC 255 + +/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. */ +typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */ + +/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */ +typedef struct { + unsigned long long version; /* version id */ + unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */ + unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). */ + unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). 
*/ +} fsst_decoder_t; + +/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */ +fsst_encoder_t* +fsst_create( + size_t n, /* IN: number of strings in batch to sample from. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: string start pointers. */ + int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */ +); + +/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */ +fsst_encoder_t* +fsst_duplicate( + fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */ +); + +#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */ + +/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */ +unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */ +fsst_export( + fsst_encoder_t *encoder, /* IN: the symbol table to dump. */ + unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */ +); + +/* Deallocate encoder. */ +void +fsst_destroy(fsst_encoder_t*); + +/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */ +unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */ +fsst_import( + fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */ + unsigned char const *buf /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */ +); + +/* Return a decoder structure from an encoder. */ +fsst_decoder_t +fsst_decoder( + fsst_encoder_t *encoder +); + +/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). 
*/ +/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */ +size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */ +fsst_compress( + fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */ + size_t nstrings, /* IN: number of strings in batch to compress. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: input string start pointers. */ + size_t outsize, /* IN: byte-length of output buffer. */ + unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */ + size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */ + unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */ +); + +/* Decompress a single string, inlined for speed. */ +inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */ +fsst_decompress( + const fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */ + size_t lenIn, /* IN: byte-length of compressed string. */ + const unsigned char *strIn, /* IN: compressed string. */ + size_t size, /* IN: byte-length of output buffer. */ + unsigned char *output /* OUT: memory buffer to put the decompressed string in. 
*/ +) { + unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len; + unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output; + unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol; + size_t code, posOut = 0, posIn = 0; +#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */ +#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long)) +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + while (posOut+32 <= size && posIn+4 <= lenIn) { + unsigned int nextBlock, escapeMask; + memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int)); + escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); + if (escapeMask == 0) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; + switch(firstEscapePos) { /* Duff's device */ + case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ + } + } + } + if (posOut+32 <= size) { // handle the possibly 3 last bytes without a loop + if (posIn+2 <= lenIn) 
{ + strOut[posOut] = strIn[posIn+1]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + posIn += 2; strOut[posOut++] = strIn[posIn-1]; + } + } else { + posIn += 2; posOut++; + } + } + if (posIn < lenIn) { // last code cannot be an escape + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } + } +#else + while (posOut+8 <= size && posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */ + FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */ + posOut += len[code]; + } else { + strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */ + posIn++; posOut++; + } +#endif +#endif + while (posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { + size_t posWrite = posOut, endWrite = posOut + len[code]; + unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite; + if ((posOut = endWrite) > size) endWrite = size; + for(; posWrite < endWrite; posWrite++) /* only write if there is room */ + strOut[posWrite] = symbolPointer[posWrite]; + } else { + if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */ + posIn++; posOut++; + } + if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0; + return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */ +} + +#ifdef __cplusplus +} +#endif +#endif /* FSST_INCLUDED_H */ diff --git a/cpp/thirdparty/fsst/fsst_avx512.cpp b/cpp/thirdparty/fsst/fsst_avx512.cpp new file mode 100644 index 00000000000..150683d2f9a --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512.cpp @@ -0,0 +1,150 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 
2018-2020, CWI, TU Munich, FSU Jena
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
+// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
+// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
+#include "libfsst.hpp"
+
+#if defined(__x86_64__) || defined(_M_X64)
+#include <immintrin.h>
+
+#ifdef _WIN32
+namespace libfsst {
+bool fsst_hasAVX512() {
+  int info[4];
+  __cpuidex(info, 0x00000007, 0);
+  return (info[1]>>16)&1;
+}
+} // namespace libfsst
+#else
+#include <cpuid.h>
+namespace libfsst {
+bool fsst_hasAVX512() {
+  int info[4];
+  __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]);
+  return (info[1]>>16)&1;
+}
+} // namespace libfsst
+#endif
+#else
+namespace libfsst {
+bool fsst_hasAVX512() { return false; }
+} // namespace libfsst
+#endif
+
+namespace libfsst {
+
+// BULK COMPRESSION OF STRINGS
+//
+// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. 
+// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up. +// +// In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4 +// unroll3 performs best on my hardware +// +// In the worst case, each final encoded string occupies 512KB bytes (512*1024; with 1024=512xexception, exception = 2 bytes). +// - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs). +// +// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits) +// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order. +// +// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings. +// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process. +// - so 'processed' is the amount of strings we started processing and it is between [480,512]. +// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished. +// - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method. +// +// Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings. +// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings. +// +// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to. +// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch). 
+// +// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS +// in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_* expressions however, +// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them. +// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. +// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. + +size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { + size_t processed = 0; + // define some constants (all_x means that all 8 lanes contain 64-bits value X) +#ifdef __AVX512F__ + //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c + __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); + __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); + __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); +#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) +#define all_ONE _mm512_srli_epi64(all_MASK, 63) +#define all_M19 _mm512_srli_epi64(all_MASK, 45) +#define all_M18 _mm512_srli_epi64(all_MASK, 46) +#define all_M28 _mm512_srli_epi64(all_MASK, 36) +#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) +#define all_FFFF _mm512_srli_epi64(all_MASK, 48) +#define all_FF _mm512_srli_epi64(all_MASK, 56) + + SIMDjob *inputEnd = input+n; + assert(n >= unroll*8 && n <= 512); // should be close to 512 + __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 + __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 
2b loaded new strings bitmask per unroll + u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll + + if (unroll >= 4) { + while (input+delta1+delta2+delta3+delta4 < inputEnd) { + #include "fsst_avx512_unroll4.inc" + } + } else if (unroll == 3) { + while (input+delta1+delta2+delta3 < inputEnd) { + #include "fsst_avx512_unroll3.inc" + } + } else if (unroll == 2) { + while (input+delta1+delta2 < inputEnd) { + #include "fsst_avx512_unroll2.inc" + } + } else { + while (input+delta1 < inputEnd) { + #include "fsst_avx512_unroll1.inc" + } + } + + // flush the job states of the unfinished strings at the end of output[] + processed = n - (inputEnd - input); + u32 unfinished = 0; + if (unroll > 1) { + if (unroll > 2) { + if (unroll > 3) { + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); + unfinished += _mm_popcnt_u32((int) loadmask4); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); + unfinished += _mm_popcnt_u32((int) loadmask3); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); + unfinished += _mm_popcnt_u32((int) loadmask2); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); +#else + (void) symbolTable; + (void) codeBase; + (void) symbolBase; + (void) input; + (void) output; + (void) n; + (void) unroll; +#endif + return processed; +} +} // namespace libfsst + diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc new file mode 100644 index 00000000000..f4b81c7970d --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc new file mode 100644 index 00000000000..aa33cd7e69c --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc @@ -0,0 +1,114 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) 
loadmask2); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc new file mode 100644 index 00000000000..e2057032abd --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc @@ -0,0 +1,171 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the 
Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
+ __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). 
+ __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
+ write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, 
i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job 
state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc new file mode 100644 index 00000000000..15cca7c938b --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc @@ -0,0 +1,228 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// 
(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E4PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask4=11111111, delta4=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + job4 = _mm512_mask_expandloadu_epi64(job4, loadmask4, input); input += delta4; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + __m512i word4 = _mm512_i64gather_epi64(_mm512_srli_epi64(job4, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. 
It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code4: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code4 = _mm512_i64gather_epi64(_mm512_and_epi64(word4, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + __m512i pos4 = _mm512_mullo_epi64(_mm512_and_epi64(word4, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + // hash them into a random number: pos4 = pos4*PRIME; pos4 ^= pos4>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + pos4 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos4,_mm512_srli_epi64(pos4,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). 
+ // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write4 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + __m512i write4 = _mm512_slli_epi64(_mm512_and_epi64(word4, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl4 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + pos4 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl4, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + __mmask8 match4 = _mm512_cmpeq_epi64_mask(symb4, _mm512_and_epi64(word4, pos4)) & _mm512_cmplt_epi64_mask(icl4, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. 
+ code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + code4 = _mm512_mask_mov_epi64(code4, match4, _mm512_srli_epi64(icl4, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + write4 = _mm512_or_epi64(write4, _mm512_and_epi64(code4, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + code4 = _mm512_and_epi64(code4, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job4, all_M19), write4, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job4.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + job4 = _mm512_add_epi64(job4, _mm512_slli_epi64(_mm512_srli_epi64(code4, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job4.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = 
_mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + job4 = _mm512_add_epi64(job4, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code4, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + // test which lanes are done now (job4.cur==job4.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job4 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + loadmask4 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job4, 46), _mm512_and_epi64(_mm512_srli_epi64(job4, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + // calculate the amount of lanes in job4 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + delta4 = _mm_popcnt_u32((int) loadmask4); + // write out the job state for the lanes that are done (we 
need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job4.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; + _mm512_mask_compressstoreu_epi64(output, loadmask4, job4); output += delta4; diff --git a/cpp/thirdparty/fsst/libfsst.cpp b/cpp/thirdparty/fsst/libfsst.cpp new file mode 100644 index 00000000000..e3ba787b959 --- /dev/null +++ b/cpp/thirdparty/fsst/libfsst.cpp @@ -0,0 +1,651 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include "libfsst.hpp" + +namespace libfsst { +Symbol concat(Symbol a, Symbol b) { + Symbol s; + u32 length = a.length()+b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.store_num((b.load_num() << (8*a.length())) | a.load_num()); + return s; +} +} // namespace libfsst + +namespace std { +template <> +class hash { + public: + size_t operator()(const libfsst::QSymbol& q) const { + uint64_t k = q.symbol.load_num(); + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + uint64_t h = 0x8445d61a4e774912 ^ (8*m); + k *= m; + k ^= k >> r; + k *= m; + h ^= k; + h *= m; + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; + } +}; +} + +namespace libfsst { +bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } + +std::ostream& operator<<(std::ostream& out, const Symbol& s) { + for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { + SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); + int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) + size_t sampleFrac = 128; + + // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator + st->zeroTerminated = zeroTerminated; + if (zeroTerminated) { + st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency + } else { + u16 byteHisto[256]; + memset(byteHisto, 0, sizeof(byteHisto)); + for(size_t i=0; iterminator = 256; + while(i-- > 0) { + if (byteHisto[i] > minSize) continue; + st->terminator = i; + minSize = byteHisto[i]; + } + } + assert(st->terminator != 256); + + // a random number between 0 and 128 + auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; + + // compress sample, and compute (pair-)frequencies + auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain + int gain = 0; + + for(size_t i=0; i sampleFrac) continue; + } + if (cur < end) { + u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); + cur += st->symbols[code1].length(); + gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); + while (true) { + // count single symbol (i.e. an option is not extending it) + counters.count1Inc(code1); + + // as an alternative, consider just using the next byte.. + if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly + counters.count1Inc(*start); + + if (cur==end) { + break; + } + + // now match a new symbol + start = cur; + if (curhashTabSize-1); + Symbol s = st->hashTab[idx]; + code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) { + code2 = s.code(); + cur += s.length(); + } else if (code2 >= FSST_CODE_BASE) { + cur += 2; + } else { + code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; + cur += 1; + } + } else { + code2 = st->findLongestSymbol(cur, end); + cur += st->symbols[code2].length(); + } + + // compute compressed output size + gain += ((int) (cur-start))-(1+isEscapeCode(code2)); + + if (sampleFrac < 128) { // no need to count pairs in final round + // consider the symbol that is the concatenation of the two last symbols + counters.count2Inc(code1, code2); + + // as an alternative, consider just extending with the next byte.. + if ((cur-start) > 1) // ..but do not count single byte extensions doubly + counters.count2Inc(code1, *start); + } + code1 = code2; + } + } + } + return gain; + }; + + auto makeTable = [&](SymbolTable *st, Counters &counters) { + // hashmap of c (needed because we can generate duplicate candidates) + unordered_set cands; + + // artificially make terminater the most frequent symbol so it gets included + u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; + counters.count1Set(terminator,65535); + + auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { + if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! 
+ QSymbol q; + q.symbol = s; + q.gain = count * s.length(); + auto it = cands.find(q); + if (it != cands.end()) { + q.gain += (*it).gain; + cands.erase(*it); + } + cands.insert(q); + }; + + // add candidate symbols based on counted frequency + for (u32 pos1=0; pos1nSymbols; pos1++) { + u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! + if (!cnt1) continue; + + // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed + Symbol s1 = st->symbols[pos1]; + addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); + + if (sampleFrac >= 128 || // last round we do not create new (combined) symbols + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte + continue; + } + for (u32 pos2=0; pos2nSymbols; pos2++) { + u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! + if (!cnt2) continue; + + // create a new symbol + Symbol s2 = st->symbols[pos2]; + Symbol s3 = concat(s1, s2); + if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + addOrInc(cands, s3, cnt2); + } + } + + // insert candidates into priority queue (by gain) + auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); }; + priority_queue,decltype(cmpGn)> pq(cmpGn); + for (auto& q : cands) + pq.push(q); + + // Create new symbol map using best candidates + st->clear(); + while (st->nSymbols < 255 && !pq.empty()) { + QSymbol q = pq.top(); + pq.pop(); + st->add(q.symbol); + } + }; + + u8 bestCounters[512*sizeof(u16)]; +#ifdef NONOPT_FSST + for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { + sampleFrac = frac; +#else + for(sampleFrac=8; true; sampleFrac += 30) { +#endif + memset(&counters, 0, sizeof(Counters)); + long gain = compressCount(st, counters); + if (gain >= bestGain) 
{ // a new best solution! + counters.backup1(bestCounters); + *bestTable = *st; bestGain = gain; + } + if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) + makeTable(st, counters); + } + delete st; + counters.restore1(bestCounters); + makeTable(bestTable, counters); + bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression + return bestTable; +} + +#ifndef NONOPT_FSST +static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { + size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; + u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings + SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer + SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) + size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) + + while (curLine < nlines && outOff <= (1<<19)) { + size_t prevLine = curLine, chunk, curOff = 0; + + // bail out if the output buffer cannot hold the compressed next string fully + if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 + else budget -= (len[curLine]-curOff)*2; + + strOut[curLine] = (u8*) 0; + lenOut[curLine] = 0; + + do { + do { + chunk = len[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // large strings need to be chopped up into segments of 511 bytes + } + // create a job in this batch + SIMDjob job; + job.cur = inOff; + job.end = job.cur + chunk; + job.pos = batchPos; + job.out = outOff; + + // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) + outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) 
bytes. + if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while(curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines) || (((len[curLine])*2 + 7) > budget)) { // cannot accumulate more? + if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + 
*out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; cur += 2; + } else { + // 1 byte code or miss. + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse + cur++; + } + } + } + }; + + for(curLine=0; curLine 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + } + if ((2*chunk+7) > (size_t) (lim-out)) { + return curLine; // out of memory + } + // copy the string to the 511-byte buffer + memcpy(buf, cur, chunk); + buf[chunk] = (u8) symbolTable.terminator; + cur = buf; + end = cur + chunk; + + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true,false); + } else if (avoidBranch) { + compressVariant(false,true); + } else { + compressVariant(false, false); + } + } while((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; +} + +#define FSST_SAMPLELINE ((size_t) 512) + +// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes +vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { + size_t totSize = 0; + const size_t *lenIn = *lenRef; + vector sample; + + for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t*) encoder; +} + +/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol 
table */ +extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t*) e; +} + +// export a symbol table in compact format. +extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder*) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. + // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. 
+ // + // The version field is now there just for future-proofness, but not used yet + + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction + u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 + (((u64) e->symbolTable->suffixLim) << 24) | + (((u64) e->symbolTable->terminator) << 16) | + (((u64) e->symbolTable->nSymbols) << 8) | + FSST_ENDIAN_MARKER; // least significant byte is nonzero + + version = swap64_if_be(version); // ensure version is little-endian encoded + + /* do not assume unaligned reads here */ + memcpy(buf, &version, 8); + buf[8] = e->symbolTable->zeroTerminated; + for(u32 i=0; i<8; i++) + buf[9+i] = (u8) e->symbolTable->lenHisto[i]; + u32 pos = 17; + + // emit only the used bytes of the symbols + for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) + for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) + buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes + + return pos; // length of what was serialized +} + +#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ + +extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 const *buf) { + u64 version = 0; + u32 code, pos = 17; + u8 lenHisto[8]; + + // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) + memcpy(&version, buf, 8); + version = swap64_if_be(version); // version is always little-endian encoded + + if ((version>>32) != FSST_VERSION) return 0; + decoder->zeroTerminated = buf[8]&1; + memcpy(lenHisto, buf+9, 8); + + // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) + decoder->len[0] = 1; + decoder->symbol[0] = 0; + + // we use lenHisto[0] as 1-byte symbol run length (at the end) + code = decoder->zeroTerminated; + if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end 
+ + // now get all symbols from the buffer + for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ + for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { + decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ + decoder->symbol[code] = 0; + for(u32 j=0; jlen[code]; j++) + ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols + } + } + if (decoder->zeroTerminated) lenHisto[0]++; + + // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). + while(code<255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; +} + +// runtime check for simd +inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { +#ifndef NONOPT_FSST + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +#endif + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); +} +size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} + +// adaptive choosing of scalar compression method based on symbol length histogram +inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if 
((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} +size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +} +} // namespace libfsst + +using namespace libfsst; +// the main compression function (everything automatic) +extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { + // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) + size_t totLen = accumulate(lenIn, lenIn+nlines, 0); + int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); + return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); +} + +/* deallocate encoder */ +extern "C" void fsst_destroy(fsst_encoder_t* encoder) { + Encoder *e = (Encoder*) encoder; + delete e; +} + +/* very lazy implementation relying on export and import */ +extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { + u8 buf[sizeof(fsst_decoder_t)]; + u32 cnt1 = fsst_export(encoder, buf); + fsst_decoder_t decoder; + u32 cnt2 = fsst_import(&decoder, buf); + assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; + return decoder; +} diff --git a/cpp/thirdparty/fsst/libfsst.hpp b/cpp/thirdparty/fsst/libfsst.hpp new file mode 100644 index 00000000000..f61bc0175b6 --- /dev/null +++ b/cpp/thirdparty/fsst/libfsst.hpp @@ -0,0 +1,471 @@ +// this software is distributed under the 
MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "fsst.h" // the official FSST API -- also usable by C mortals + +/* unsigned integers */ +namespace libfsst { +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +} // namespace libfsst + +#if UINTPTR_MAX == 0xffffffffU +// We're on a 32-bit platform +#define NONOPT_FSST +#endif + +#define FSST_ENDIAN_MARKER ((u64) 1) +#define FSST_VERSION_20190218 20190218 +#define FSST_VERSION ((u64) FSST_VERSION_20190218) + +// "symbols" are character sequences (up to 8 bytes) +// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: +// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. + +// we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length +#define FSST_LEN_BITS 12 +#define FSST_CODE_BITS 9 +#define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ +#define FSST_CODE_MAX (1UL<=8) { + len = 8; + memcpy(val.str, input, 8); + } else { + memcpy(val.str, input, len); + } + set_code_len(FSST_CODE_MAX, len); + } + void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + + u64 load_num() const { return swap64_if_be(val.num); } + void store_num(u64 v) { val.num = swap64_if_be(v); } + + u32 length() const { return (u32) (icl >> 28); } + u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } + u32 ignoredBits() const { return (u32) icl; } + + u8 first() const { assert( length() >= 1); return 0xFF & load_num(); } + u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } + +#define FSST_HASH_LOG2SIZE 10 +#define FSST_HASH_PRIME 2971215073LL +#define FSST_SHIFT 15 +#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) + size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes +}; + +// Symbol that can be put in a queue, ordered on gain +struct QSymbol{ + Symbol symbol; + mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols + bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } +}; + +// we construct FSST symbol tables using a random sample of about 16KB (1<<14) +#define FSST_SAMPLETARGET (1<<14) +#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) + +// two phases of compression, before and after optimize(): +// +// (1) to encode values we probe (and maintain) three datastructures: +// - u16 byteCodes[256] array at the position of the next byte (s.length==1) +// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) +// - Symbol 
hashtable[1024] (keyed by the next three bytes, ie for s.length>2), +// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are +// pseudo codes representing a single byte these will become escapes) +// +// (2) when we finished looking for the best symbol table we call optimize() to reshape it: +// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) +// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes +// (allows shortcut during compression) +// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding +// to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one +// byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] +// and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction +// is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was +// hence added to make symbolTable construction faster. 
+// +// this final layout allows for the fastest compression code, only currently present in compressBulk + +// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 +#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket + +// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key +// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster +// +// the gain field is only used in the symbol queue that sorts symbols on gain + +struct SymbolTable { + static const u32 hashTabSize = 1<> (u8) s.icl)); + return true; + } + bool add(Symbol s) { + assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); + u32 len = s.length(); + s.set_code_len(FSST_CODE_BASE + nSymbols, len); + if (len == 1) { + byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + } + if (s.length() >= 2) { + u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; + if (code >= FSST_CODE_BASE) return code; + } + return byteCodes[s.first()] & FSST_CODE_MASK; + } + u16 findLongestSymbol(const u8* cur, const u8* end) const { + return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol + } + + // rationale for finalize: + // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() + // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() + // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) + // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher 
compression perf. + // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). + // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] + // Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte + // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. + // + // In all, we change the layout and coding, as follows.. + // + // before finalize(): + // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 + // - The first 256 codes are pseudo symbols (all escaped bytes) + // + // after finalize(): + // - table layout is symbols[0..nSymbols>, with nSymbols < 256. + // - Real codes are [0,nSymbols>. 8-th bit not set. + // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 + // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) + // the two-byte codes are split in two sections: + // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression + // + // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
+ // + void finalize(u8 zeroTerminated) { + assert(nSymbols <= 255); + u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); + + // compute running sum of code lengths (starting offsets for each length) + rsum[0] = byteLim; // 1-byte codes are highest + rsum[1] = zeroTerminated; + for(u32 i=1; i<7; i++) + rsum[i+1] = rsum[i] + lenHisto[i]; + + // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim) + suffixLim = rsum[1]; + symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) + + for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s + opt = 0; + } + newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim + } else + newCode[i] = rsum[len-1]++; + s1.set_code_len(newCode[i],len); + symbols[newCode[i]] = s1; + } + // renumber the codes in byteCodes[] + for(u32 i=0; i<256; i++) + if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); + else + byteCodes[i] = 511 + (1 << FSST_LEN_BITS); + + // renumber the codes in shortCodes[] + for(u32 i=0; i<65536; i++) + if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); + else + shortCodes[i] = byteCodes[i&0xFF]; + + // replace the symbols in the hash table + for(u32 i=0; i>8; + } + void count1Inc(u32 pos1) { + if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) + count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... + } + void count2Inc(u32 pos1, u32 pos2) { + if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). 
This means (high > 0) <=> (cnt > 0) + // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively + count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample) + } + u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range + // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros + u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7] + + u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes + high = (high >> (zero << 3)) & 255; // advance to nonzero counter + if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 + return 0; // all zero + + u32 low = count1Low[pos1]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range + // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros + u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15] + high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters + + u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters + high = (high >> (zero << 2)) & 15; // advance to nonzero counter + if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! 
advance pos2 + return 0UL; // all zero + + u32 low = count2Low[pos1][pos2]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + void backup1(u8 *buf) { + memcpy(buf, count1High, FSST_CODE_MAX); + memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX); + } + void restore1(u8 *buf) { + memcpy(count1High, buf, FSST_CODE_MAX); + memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); + } +}; +#endif + + +#define FSST_BUFSZ (3<<19) // 768KB + +// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression +struct Encoder { + shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) + union { + Counters counters; // for counting symbol occurences during map construction + u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) + }; +}; + +// job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) +struct SIMDjob { + u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) +}; + +extern bool +fsst_hasAVX512(); // runtime check for avx512 capability + +extern size_t +fsst_compressAVX512( + SymbolTable &symbolTable, + u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) + u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) + SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. + SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer. 
+ size_t n, // IN: size of arrays input and output (should be max 512) + size_t unroll); // IN: degree of SIMD unrolling + +// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) +size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); +size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); +} // namespace libfsst From 603a076d567a7711a4085f80f626fd9104e917c9 Mon Sep 17 00:00:00 2001 From: arnavb Date: Tue, 25 Nov 2025 08:52:21 +0000 Subject: [PATCH 14/16] update --- NOTICE.txt | 4 ++++ cpp/src/parquet/CMakeLists.txt | 6 ++++-- cpp/src/parquet/decoder.cc | 2 +- cpp/src/parquet/encoder.cc | 21 +++++++++++++++++---- cpp/src/parquet/encoding_test.cc | 4 ++-- cpp/thirdparty/fsst/LICENSE | 21 +++++++++++++++++++++ dev/release/rat_exclude_files.txt | 1 + 7 files changed, 50 insertions(+), 9 deletions(-) create mode 100644 cpp/thirdparty/fsst/LICENSE diff --git a/NOTICE.txt b/NOTICE.txt index 9b98364d2ab..a27486d2a5d 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -21,6 +21,10 @@ This product includes software from the mman-win32 project * Copyright https://code.google.com/p/mman-win32/ * Licensed under the MIT License; +This product includes software from the Fast Static Symbol Table (FSST) project (MIT) + * Copyright (c) 2018-2020 CWI, TU Munich, FSU Jena + * https://github.com/cwida/fsst + This product includes software from the LevelDB project * Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
* Use of this source code is governed by a BSD-style license that can be diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e2b1db09633..843b90c5fd4 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -209,8 +209,10 @@ if(DEFINED ARROW_FSST_SOURCES) set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" - "$<$:/wd4244>" - "$<$,$>>:-include${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") + "$<$:/wd4244>") + set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS + "$<$,$>>:-include>" + "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") endif() if(ARROW_HAVE_RUNTIME_AVX2) diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index 42838fd059f..db4058b6838 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -53,7 +53,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "fsst.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/types.h" #ifdef _MSC_VER diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 00daa366f6a..852fdece61a 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -50,7 +50,7 @@ #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "fsst.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/types.h" #ifdef _MSC_VER @@ -1778,9 +1778,11 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(sizeof(fsst_decoder_t)); const int64_t length_prefix_bytes = - static_cast(unencoded_values_.size()) * static_cast(sizeof(uint32_t)); + static_cast(unencoded_values_.size()) * + static_cast(sizeof(uint32_t)); const int64_t estimated_buffer_size = - decoder_bytes + total_input_size * kFsstCompressionExpansion + 
length_prefix_bytes; + decoder_bytes + total_input_size * kFsstCompressionExpansion + + length_prefix_bytes; PARQUET_ASSIGN_OR_THROW(auto output_buffer, AllocateResizableBuffer(estimated_buffer_size, pool_)); @@ -1819,6 +1821,10 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoderResize(total_output_size)); UpdateCompressionStats(total_input_size, total_output_size); + if (encoder_ != nullptr) { + fsst_destroy(encoder_); + encoder_ = nullptr; + } unencoded_values_.clear(); pending_unencoded_bytes_ = 0; unencoded_byte_array_data_bytes_ = 0; @@ -1860,7 +1866,9 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(buffer->mutable_data()); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, buffer_ptr); @@ -1872,6 +1880,11 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder Date: Tue, 25 Nov 2025 09:27:27 +0000 Subject: [PATCH 15/16] lint --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 16 ++++++++++---- cpp/src/parquet/CMakeLists.txt | 23 ++++++++++++++------- cpp/src/parquet/decoder.cc | 2 +- cpp/src/parquet/fsst_compat.h | 6 +++--- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index c0c84511c1d..978c14abb9f 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -73,7 +73,9 @@ set(ARROW_THIRDPARTY_DEPENDENCIES ZLIB zstd) -set(fsst_SOURCE "BUNDLED" CACHE STRING "Source of fsst dependency") +set(fsst_SOURCE + "BUNDLED" + CACHE STRING "Source of fsst dependency") # For backward compatibility. We use "BOOST_SOURCE" if "Boost_SOURCE" # isn't specified and "BOOST_SOURCE" is specified. 
@@ -2613,16 +2615,22 @@ endif() function(build_fsst) message(STATUS "Configuring vendored FSST sources") - set(ARROW_FSST_INCLUDE_DIR "${ARROW_SOURCE_DIR}/thirdparty/fsst" PARENT_SCOPE) + set(ARROW_FSST_INCLUDE_DIR + "${ARROW_SOURCE_DIR}/thirdparty/fsst" + PARENT_SCOPE) set(ARROW_FSST_SOURCES "${ARROW_SOURCE_DIR}/thirdparty/fsst/libfsst.cpp;${ARROW_SOURCE_DIR}/thirdparty/fsst/fsst_avx512.cpp" PARENT_SCOPE) - set(FSST_VENDORED TRUE PARENT_SCOPE) + set(FSST_VENDORED + TRUE + PARENT_SCOPE) endfunction() if(ARROW_WITH_FSST) if(NOT fsst_SOURCE STREQUAL "BUNDLED") - message(FATAL_ERROR "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED.") + message( + FATAL_ERROR + "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED.") endif() resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 843b90c5fd4..c9ade80a7f5 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -206,13 +206,19 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) endif() if(DEFINED ARROW_FSST_SOURCES) - set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS - "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" - "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" - "$<$:/wd4244>") - set_property(SOURCE ${ARROW_FSST_SOURCES} APPEND PROPERTY COMPILE_OPTIONS - "$<$,$>>:-include>" - "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") + set_property( + SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" + "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" + "$<$:/wd4244>") + set_property( + SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>>:-include>" + "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") endif() if(ARROW_HAVE_RUNTIME_AVX2) @@ -333,7 
+339,8 @@ add_arrow_lib(parquet if(PARQUET_PRIVATE_INCLUDE_DIRS) foreach(_parquet_target parquet_objlib parquet_shared parquet_static) if(TARGET ${_parquet_target}) - target_include_directories(${_parquet_target} PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) + target_include_directories( + ${_parquet_target} PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) endif() endforeach() endif() diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index db4058b6838..ae82f3a78a6 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,6 @@ #include #include #include -#include #include "arrow/array.h" #include "arrow/array/builder_binary.h" diff --git a/cpp/src/parquet/fsst_compat.h b/cpp/src/parquet/fsst_compat.h index 2da27d27301..1c3ef3fae0b 100644 --- a/cpp/src/parquet/fsst_compat.h +++ b/cpp/src/parquet/fsst_compat.h @@ -21,13 +21,13 @@ // can be compiled with the compilers Arrow supports. #if defined(_WIN32) && !defined(_MSC_VER) -#include +# include // MinGW does not provide __cpuidex, but FSST only needs the CPUID // leaf/sub-leaf variant that __cpuid_count implements. 
static inline void arrow_fsst_cpuidex(int info[4], int function_id, int subfunction_id) { __cpuid_count(function_id, subfunction_id, info[0], info[1], info[2], info[3]); } -#define __cpuidex(info, function_id, subfunction_id) \ - arrow_fsst_cpuidex(info, function_id, subfunction_id) +# define __cpuidex(info, function_id, subfunction_id) \ + arrow_fsst_cpuidex(info, function_id, subfunction_id) #endif From d4176ae0885ca22d5ce99a1388b7f6bbf82de590 Mon Sep 17 00:00:00 2001 From: arnavb Date: Tue, 25 Nov 2025 09:53:07 +0000 Subject: [PATCH 16/16] lint --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 5 +- cpp/src/parquet/CMakeLists.txt | 29 +++++---- cpp/src/parquet/decoder.cc | 69 ++++++++++----------- cpp/src/parquet/encoder.cc | 26 ++++---- cpp/src/parquet/encoding_test.cc | 22 +++---- 5 files changed, 70 insertions(+), 81 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 978c14abb9f..dddbf2271bb 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2628,9 +2628,8 @@ endfunction() if(ARROW_WITH_FSST) if(NOT fsst_SOURCE STREQUAL "BUNDLED") - message( - FATAL_ERROR - "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED.") + message(FATAL_ERROR "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED." 
+ ) endif() resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) endif() diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index c9ade80a7f5..a38705cadf8 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -206,19 +206,18 @@ if(DEFINED ARROW_FSST_INCLUDE_DIR) list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) endif() if(DEFINED ARROW_FSST_SOURCES) - set_property( - SOURCE ${ARROW_FSST_SOURCES} - APPEND - PROPERTY COMPILE_OPTIONS - "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" - "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" - "$<$:/wd4244>") - set_property( - SOURCE ${ARROW_FSST_SOURCES} - APPEND - PROPERTY COMPILE_OPTIONS - "$<$,$>>:-include>" - "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>") + set_property(SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" + "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" + "$<$:/wd4244>") + set_property(SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>>:-include>" + "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>" + ) endif() if(ARROW_HAVE_RUNTIME_AVX2) @@ -339,8 +338,8 @@ add_arrow_lib(parquet if(PARQUET_PRIVATE_INCLUDE_DIRS) foreach(_parquet_target parquet_objlib parquet_shared parquet_static) if(TARGET ${_parquet_target}) - target_include_directories( - ${_parquet_target} PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) + target_include_directories(${_parquet_target} + PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) endif() endforeach() endif() diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index ae82f3a78a6..03105e4fea4 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -50,10 +50,10 @@ #include "arrow/util/ubsan.h" #include "arrow/visit_data_inline.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/exception.h" #include 
"parquet/platform.h" #include "parquet/schema.h" -#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/types.h" #ifdef _MSC_VER @@ -2378,8 +2378,7 @@ class FsstDecoder : public DecoderImpl, virtual public TypedDecoder::Accumulator* builder) override { int values_decoded = 0; - PARQUET_THROW_NOT_OK( - DecodeArrowDense(num_values, null_count, valid_bits, valid_bits_offset, builder, - &values_decoded)); + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, builder, &values_decoded)); return values_decoded; } - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - typename EncodingTraits::DictAccumulator* builder) override { + int DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override { int values_decoded = 0; - PARQUET_THROW_NOT_OK( - DecodeArrowDict(num_values, null_count, valid_bits, valid_bits_offset, builder, - &values_decoded)); + PARQUET_THROW_NOT_OK(DecodeArrowDict(num_values, null_count, valid_bits, + valid_bits_offset, builder, &values_decoded)); return values_decoded; } @@ -2482,32 +2480,31 @@ class FsstDecoder : public DecoderImpl, virtual public TypedDecoderReserve(num_values)); int value_index = 0; - RETURN_NOT_OK(VisitBitRuns( - valid_bits, valid_bits_offset, num_values, - [&](int64_t position, int64_t run_length, bool is_valid) { - if (is_valid) { - for (int64_t i = 0; i < run_length; ++i) { - const auto& value = temp_values_[value_index++]; - RETURN_NOT_OK(builder->Append(value.ptr, static_cast(value.len))); - } - } else { - RETURN_NOT_OK(builder->AppendNulls(run_length)); - } - return Status::OK(); - })); + RETURN_NOT_OK(VisitBitRuns(valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = 
temp_values_[value_index++]; + RETURN_NOT_OK(builder->Append( + value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(builder->AppendNulls(run_length)); + } + return Status::OK(); + })); *out_values_decoded = decoded; return Status::OK(); } uint8_t* EnsureDecodeBuffer(int64_t capacity) { - const int64_t min_capacity = - std::max(capacity, kInitialDecodeBufferSize); + const int64_t min_capacity = std::max(capacity, kInitialDecodeBufferSize); const int64_t target = ::arrow::bit_util::NextPower2(min_capacity); if (!decode_buffer_) { - PARQUET_ASSIGN_OR_THROW( - decode_buffer_, ::arrow::AllocateResizableBuffer(target, pool_)); + PARQUET_ASSIGN_OR_THROW(decode_buffer_, + ::arrow::AllocateResizableBuffer(target, pool_)); } else if (decode_buffer_->size() < target) { PARQUET_THROW_NOT_OK(decode_buffer_->Resize(target, false)); } @@ -2516,26 +2513,24 @@ class FsstDecoder : public DecoderImpl, virtual public TypedDecodermutable_data() + decode_buffer_size_; const size_t available = static_cast(decode_buffer_->size() - decode_buffer_size_); - const size_t decompressed = - fsst_decompress(&decoder_, compressed_len, compressed_ptr, available, - destination); + const size_t decompressed = fsst_decompress(&decoder_, compressed_len, + compressed_ptr, available, destination); if (decompressed > 0 || compressed_len == 0) { *value_ptr = destination; return decompressed; } - int64_t new_capacity = std::max( - decode_buffer_->size() * 2, - decode_buffer_size_ + OutputUpperBound(compressed_len)); + int64_t new_capacity = + std::max(decode_buffer_->size() * 2, + decode_buffer_size_ + OutputUpperBound(compressed_len)); if (new_capacity <= decode_buffer_->size()) { throw ParquetException("FSST decompression failed"); } diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 852fdece61a..e0a2f1ec826 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -47,10 +47,10 @@ #include "arrow/util/ubsan.h" #include "arrow/visit_data_inline.h" 
+#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" -#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/types.h" #ifdef _MSC_VER @@ -1758,10 +1758,8 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(total_size) * compression_ratio_hint_; - const int64_t estimated_payload = - static_cast(std::ceil(scaled)); + const double scaled = static_cast(total_size) * compression_ratio_hint_; + const int64_t estimated_payload = static_cast(std::ceil(scaled)); return static_cast(sizeof(fsst_decoder_t)) + std::max(0, estimated_payload); } @@ -1777,12 +1775,11 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(sizeof(fsst_decoder_t)); - const int64_t length_prefix_bytes = - static_cast(unencoded_values_.size()) * - static_cast(sizeof(uint32_t)); - const int64_t estimated_buffer_size = - decoder_bytes + total_input_size * kFsstCompressionExpansion + - length_prefix_bytes; + const int64_t length_prefix_bytes = static_cast(unencoded_values_.size()) * + static_cast(sizeof(uint32_t)); + const int64_t estimated_buffer_size = decoder_bytes + + total_input_size * kFsstCompressionExpansion + + length_prefix_bytes; PARQUET_ASSIGN_OR_THROW(auto output_buffer, AllocateResizableBuffer(estimated_buffer_size, pool_)); @@ -1867,8 +1864,7 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(buffer->mutable_data()); int num_valid_values = ::arrow::util::internal::SpacedCompress( src, num_values, valid_bits, valid_bits_offset, buffer_ptr); @@ -1897,8 +1893,8 @@ class FsstEncoder : public EncoderImpl, virtual public TypedEncoder(Encoding::FSST); ASSERT_NO_THROW(encoder->Put(*values)); @@ -2711,13 +2711,13 @@ TEST(TestFsstEncoding, MultiPageRoundTrip) { ->build(); std::unique_ptr writer; - ASSERT_NO_THROW(writer = - ParquetFileWriter::Open(output_stream, parquet_schema, writer_props)); + ASSERT_NO_THROW( + writer = 
ParquetFileWriter::Open(output_stream, parquet_schema, writer_props)); ASSERT_NE(nullptr, writer); auto* row_group_writer = writer->AppendRowGroup(); ASSERT_NE(nullptr, row_group_writer); - auto* column_writer = static_cast*>( - row_group_writer->NextColumn()); + auto* column_writer = + static_cast*>(row_group_writer->NextColumn()); ASSERT_NE(nullptr, column_writer); auto write_page = [&](const std::vector& source) { @@ -2760,16 +2760,16 @@ TEST(TestFsstEncoding, MultiPageRoundTrip) { ASSERT_NO_THROW(reader = make_reader()); ASSERT_NE(nullptr, reader); auto row_group_reader = reader->RowGroup(0); - auto column_reader = - std::static_pointer_cast>(row_group_reader->Column(0)); + auto column_reader = std::static_pointer_cast>( + row_group_reader->Column(0)); std::vector decoded(kTotalValues); int64_t values_read = 0; while (values_read < kTotalValues) { int64_t batch_length = std::min(1024, kTotalValues - values_read); int64_t batch_read = 0; - column_reader->ReadBatch(batch_length, nullptr, nullptr, - decoded.data() + values_read, &batch_read); + column_reader->ReadBatch(batch_length, nullptr, nullptr, decoded.data() + values_read, + &batch_read); ASSERT_GT(batch_read, 0); values_read += batch_read; }