diff --git a/NOTICE.txt b/NOTICE.txt index 9b98364d2ab..a27486d2a5d 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -21,6 +21,10 @@ This product includes software from the mman-win32 project * Copyright https://code.google.com/p/mman-win32/ * Licensed under the MIT License; +This product includes software from the Fast Static Symbol Table (FSST) project (MIT) + * Copyright (c) 2018-2020 CWI, TU Munich, FSU Jena + * https://github.com/cwida/fsst + This product includes software from the LevelDB project * Copyright (c) 2011 The LevelDB Authors. All rights reserved. * Use of this source code is governed by a BSD-style license that can be diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4ced2a66bf5..dddbf2271bb 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -49,6 +49,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES Boost Brotli BZip2 + fsst c-ares gflags glog @@ -72,6 +73,10 @@ set(ARROW_THIRDPARTY_DEPENDENCIES ZLIB zstd) +set(fsst_SOURCE + "BUNDLED" + CACHE STRING "Source of fsst dependency") + # For backward compatibility. We use "BOOST_SOURCE" if "Boost_SOURCE" # isn't specified and "BOOST_SOURCE" is specified. 
# We renamed "BOOST" dependency name to "Boost" in 3.0.0 because @@ -183,6 +188,8 @@ macro(build_dependency DEPENDENCY_NAME) build_brotli() elseif("${DEPENDENCY_NAME}" STREQUAL "BZip2") build_bzip2() + elseif("${DEPENDENCY_NAME}" STREQUAL "fsst") + build_fsst() elseif("${DEPENDENCY_NAME}" STREQUAL "c-ares") build_cares() elseif("${DEPENDENCY_NAME}" STREQUAL "gflags") @@ -382,6 +389,7 @@ endif() if(ARROW_PARQUET) set(ARROW_WITH_RAPIDJSON ON) set(ARROW_WITH_THRIFT ON) + set(ARROW_WITH_FSST ON) endif() if(ARROW_WITH_THRIFT) @@ -2604,6 +2612,28 @@ if(ARROW_USE_XSIMD) endif() endif() +function(build_fsst) + message(STATUS "Configuring vendored FSST sources") + + set(ARROW_FSST_INCLUDE_DIR + "${ARROW_SOURCE_DIR}/thirdparty/fsst" + PARENT_SCOPE) + set(ARROW_FSST_SOURCES + "${ARROW_SOURCE_DIR}/thirdparty/fsst/libfsst.cpp;${ARROW_SOURCE_DIR}/thirdparty/fsst/fsst_avx512.cpp" + PARENT_SCOPE) + set(FSST_VENDORED + TRUE + PARENT_SCOPE) +endfunction() + +if(ARROW_WITH_FSST) + if(NOT fsst_SOURCE STREQUAL "BUNDLED") + message(FATAL_ERROR "FSST must currently be built from source. Set fsst_SOURCE=BUNDLED." 
+ ) + endif() + resolve_dependency(fsst IS_RUNTIME_DEPENDENCY FALSE) +endif() + macro(build_zlib) message(STATUS "Building ZLIB from source") diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index dc7d40d2a38..a38705cadf8 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -42,6 +42,10 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) ${ARGN}) set(TEST_ARGUMENTS PREFIX "parquet" LABELS "parquet-tests") + set(_PARQUET_TEST_EXTRA_ARGS ${ARG_UNPARSED_ARGUMENTS}) + if(PARQUET_TEST_EXTRA_INCLUDES) + list(APPEND _PARQUET_TEST_EXTRA_ARGS EXTRA_INCLUDES ${PARQUET_TEST_EXTRA_INCLUDES}) + endif() if(ARROW_TEST_LINKAGE STREQUAL "static") add_test_case(${REL_TEST_NAME} @@ -49,14 +53,14 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) parquet_static ${PARQUET_TEST_LINK_LIBS} ${TEST_ARGUMENTS} - ${ARG_UNPARSED_ARGUMENTS}) + ${_PARQUET_TEST_EXTRA_ARGS}) else() add_test_case(${REL_TEST_NAME} STATIC_LINK_LIBS parquet_shared ${PARQUET_TEST_LINK_LIBS} ${TEST_ARGUMENTS} - ${ARG_UNPARSED_ARGUMENTS}) + ${_PARQUET_TEST_EXTRA_ARGS}) endif() endfunction() @@ -134,6 +138,9 @@ elseif(NOT MSVC) list(APPEND PARQUET_TEST_LINK_LIBS ${CMAKE_DL_LIBS}) endif() +set(PARQUET_TEST_EXTRA_INCLUDES) +set(PARQUET_PRIVATE_INCLUDE_DIRS) + # # Generated Thrift sources set(PARQUET_THRIFT_SOURCE_DIR "${ARROW_SOURCE_DIR}/src/generated/") @@ -191,6 +198,28 @@ set(PARQUET_SRCS stream_writer.cc types.cc) +if(DEFINED ARROW_FSST_SOURCES) + list(APPEND PARQUET_SRCS ${ARROW_FSST_SOURCES}) +endif() +if(DEFINED ARROW_FSST_INCLUDE_DIR) + list(APPEND PARQUET_PRIVATE_INCLUDE_DIRS ${ARROW_FSST_INCLUDE_DIR}) + list(APPEND PARQUET_TEST_EXTRA_INCLUDES ${ARROW_FSST_INCLUDE_DIR}) +endif() +if(DEFINED ARROW_FSST_SOURCES) + set_property(SOURCE ${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>:-Wno-error=shorten-64-to-32;-Wno-shorten-64-to-32>" + "$<$,$,$>:-Wno-error=missing-declarations;-Wno-missing-declarations>" + "$<$:/wd4244>") + set_property(SOURCE 
${ARROW_FSST_SOURCES} + APPEND + PROPERTY COMPILE_OPTIONS + "$<$,$>>:-include>" + "$<$,$>>:${CMAKE_CURRENT_SOURCE_DIR}/fsst_compat.h>" + ) +endif() + if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. list(APPEND PARQUET_SRCS level_comparison_avx2.cc level_conversion_bmi2.cc) @@ -306,6 +335,15 @@ add_arrow_lib(parquet STATIC_INSTALL_INTERFACE_LIBS ${PARQUET_STATIC_INSTALL_INTERFACE_LIBS}) +if(PARQUET_PRIVATE_INCLUDE_DIRS) + foreach(_parquet_target parquet_objlib parquet_shared parquet_static) + if(TARGET ${_parquet_target}) + target_include_directories(${_parquet_target} + PRIVATE ${PARQUET_PRIVATE_INCLUDE_DIRS}) + endif() + endforeach() +endif() + if(WIN32 AND NOT (ARROW_TEST_LINKAGE STREQUAL "static")) add_library(parquet_test_support STATIC "${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp") diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 8ecb774022f..59c3b9934a2 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -862,7 +862,8 @@ class ColumnReaderImplBase { case Encoding::RLE: case Encoding::DELTA_BINARY_PACKED: case Encoding::DELTA_BYTE_ARRAY: - case Encoding::DELTA_LENGTH_BYTE_ARRAY: { + case Encoding::DELTA_LENGTH_BYTE_ARRAY: + case Encoding::FSST: { auto decoder = MakeTypedDecoder(encoding, descr_, pool_); current_decoder_ = decoder.get(); decoders_[static_cast(encoding)] = std::move(decoder); diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index d0a857dd22a..03105e4fea4 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -48,6 +50,7 @@ #include "arrow/util/ubsan.h" #include "arrow/visit_data_inline.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" @@ -2307,6 +2310,253 @@ class ByteStreamSplitDecoder : public ByteStreamSplitDecoderBase { + 
public: + using Base = DecoderImpl; + using Base::num_values_; + + explicit FsstDecoder(const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + : DecoderImpl(descr, Encoding::FSST), pool_(pool) {} + + void SetData(int num_values, const uint8_t* data, int len) override { + const auto header_size = static_cast(sizeof(fsst_decoder_t)); + if (len < header_size) { + throw ParquetException("FSST page too small to contain decoder header"); + } + num_values_ = num_values; + memcpy(&decoder_, data, sizeof(fsst_decoder_t)); + next_ = data + header_size; + remaining_bytes_ = len - header_size; + decode_buffer_size_ = 0; + } + + int Decode(ByteArray* buffer, int max_values) override { + max_values = std::min(max_values, num_values_); + if (max_values == 0) { + return 0; + } + + const int64_t estimated_output_size = + decode_buffer_size_ + + static_cast(remaining_bytes_) * kMaxDecompressionExpansion; + EnsureDecodeBuffer(estimated_output_size); + + int decoded = 0; + + while (decoded < max_values) { + if (ARROW_PREDICT_FALSE(remaining_bytes_ < static_cast(kLengthPrefixSize))) { + throw ParquetException("FSST data truncated before length prefix"); + } + + const uint32_t compressed_len = SafeLoadAs(next_); + next_ += kLengthPrefixSize; + remaining_bytes_ -= static_cast(kLengthPrefixSize); + + if (ARROW_PREDICT_FALSE(compressed_len > static_cast(remaining_bytes_))) { + throw ParquetException("FSST compressed length exceeds available data"); + } + + const uint8_t* compressed_ptr = next_; + next_ += compressed_len; + remaining_bytes_ -= static_cast(compressed_len); + + uint8_t* value_ptr = nullptr; + const size_t decompressed_len = + DecompressValue(compressed_ptr, compressed_len, &value_ptr); + + buffer[decoded].ptr = value_ptr; + buffer[decoded].len = static_cast(decompressed_len); + decode_buffer_size_ += static_cast(decompressed_len); + ++decoded; + } + + num_values_ -= decoded; + return decoded; + } + + int DecodeSpaced(ByteArray* buffer, 
int num_values, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset) override { + if (null_count == 0) { + return Decode(buffer, num_values); + } + + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", decoded, " values."); + } + + int value_index = 0; + for (int i = 0; i < num_values; ++i) { + if (bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + buffer[i] = temp_values_[value_index++]; + } else { + buffer[i].ptr = nullptr; + buffer[i].len = 0; + } + } + + return value_index; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) override { + int values_decoded = 0; + PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits, + valid_bits_offset, builder, &values_decoded)); + return values_decoded; + } + + int DecodeArrow( + int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder) override { + int values_decoded = 0; + PARQUET_THROW_NOT_OK(DecodeArrowDict(num_values, null_count, valid_bits, + valid_bits_offset, builder, &values_decoded)); + return values_decoded; + } + + private: + Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* out, + int* out_values_decoded) { + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", 
decoded, " values."); + } + + auto visit_binary_helper = [&](auto* helper) { + auto* values_ptr = temp_values_.data(); + int value_index = 0; + + RETURN_NOT_OK( + VisitBitRuns(valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = values_ptr[value_index++]; + RETURN_NOT_OK(helper->AppendValue( + value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(helper->AppendNulls(run_length)); + } + return Status::OK(); + })); + + *out_values_decoded = decoded; + return Status::OK(); + }; + + return DispatchArrowBinaryHelper( + out, num_values, /*estimated_data_length=*/{}, visit_binary_helper); + } + + Status DecodeArrowDict(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::DictAccumulator* builder, + int* out_values_decoded) { + const int values_to_decode = num_values - null_count; + temp_values_.resize(values_to_decode); + + const int decoded = Decode(temp_values_.data(), values_to_decode); + if (ARROW_PREDICT_FALSE(decoded != values_to_decode)) { + throw ParquetException("Expected to decode ", values_to_decode, + " values but decoded ", decoded, " values."); + } + + RETURN_NOT_OK(builder->Reserve(num_values)); + + int value_index = 0; + RETURN_NOT_OK(VisitBitRuns(valid_bits, valid_bits_offset, num_values, + [&](int64_t position, int64_t run_length, bool is_valid) { + if (is_valid) { + for (int64_t i = 0; i < run_length; ++i) { + const auto& value = temp_values_[value_index++]; + RETURN_NOT_OK(builder->Append( + value.ptr, static_cast(value.len))); + } + } else { + RETURN_NOT_OK(builder->AppendNulls(run_length)); + } + return Status::OK(); + })); + + *out_values_decoded = decoded; + return Status::OK(); + } + + uint8_t* EnsureDecodeBuffer(int64_t capacity) { + const int64_t min_capacity = std::max(capacity, kInitialDecodeBufferSize); + const int64_t target = 
::arrow::bit_util::NextPower2(min_capacity); + + if (!decode_buffer_) { + PARQUET_ASSIGN_OR_THROW(decode_buffer_, + ::arrow::AllocateResizableBuffer(target, pool_)); + } else if (decode_buffer_->size() < target) { + PARQUET_THROW_NOT_OK(decode_buffer_->Resize(target, false)); + } + return decode_buffer_->mutable_data(); + } + + size_t DecompressValue(const uint8_t* compressed_ptr, uint32_t compressed_len, + uint8_t** value_ptr) { + EnsureDecodeBuffer(decode_buffer_size_ + OutputUpperBound(compressed_len)); + + while (true) { + uint8_t* destination = decode_buffer_->mutable_data() + decode_buffer_size_; + const size_t available = + static_cast(decode_buffer_->size() - decode_buffer_size_); + + const size_t decompressed = fsst_decompress(&decoder_, compressed_len, + compressed_ptr, available, destination); + + if (decompressed > 0 || compressed_len == 0) { + *value_ptr = destination; + return decompressed; + } + + int64_t new_capacity = + std::max(decode_buffer_->size() * 2, + decode_buffer_size_ + OutputUpperBound(compressed_len)); + if (new_capacity <= decode_buffer_->size()) { + throw ParquetException("FSST decompression failed"); + } + EnsureDecodeBuffer(new_capacity); + } + } + + static int64_t OutputUpperBound(uint32_t compressed_len) { + const int64_t expanded = + static_cast(compressed_len) * kMaxDecompressionExpansion; + return std::max(expanded, kInitialDecodeBufferSize); + } + + static constexpr size_t kLengthPrefixSize = sizeof(uint32_t); + static constexpr int64_t kInitialDecodeBufferSize = 1024; + static constexpr int64_t kMaxDecompressionExpansion = 8; + + fsst_decoder_t decoder_{}; + const uint8_t* next_ = nullptr; + int remaining_bytes_ = 0; + ::arrow::MemoryPool* pool_; + std::shared_ptr<::arrow::ResizableBuffer> decode_buffer_; + int64_t decode_buffer_size_ = 0; + std::vector temp_values_; +}; + } // namespace // ---------------------------------------------------------------------- @@ -2373,6 +2623,13 @@ std::unique_ptr MakeDecoder(Type::type 
type_num, Encoding::type encodin throw ParquetException( "DELTA_BYTE_ARRAY only supports BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"); } + } else if (encoding == Encoding::FSST) { + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique(descr, pool); + default: + throw ParquetException("FSST encoding only supports BYTE_ARRAY"); + } } else if (encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) { if (type_num == Type::BYTE_ARRAY) { return std::make_unique(descr, pool); diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index f9367555d97..e0a2f1ec826 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -18,8 +18,11 @@ #include "parquet/encoding.h" #include +#include #include #include +#include +#include #include #include #include @@ -44,6 +47,7 @@ #include "arrow/util/ubsan.h" #include "arrow/visit_data_inline.h" +#include "fsst.h" // NOLINT(build/include_subdir) #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/schema.h" @@ -101,7 +105,7 @@ class EncoderImpl : virtual public Encoder { MemoryPool* memory_pool() const override { return pool_; } int64_t ReportUnencodedDataBytes() override { - if (descr_->physical_type() != Type::BYTE_ARRAY) { + if (descr_ != nullptr && descr_->physical_type() != Type::BYTE_ARRAY) { throw ParquetException("ReportUnencodedDataBytes is only supported for BYTE_ARRAY"); } int64_t bytes = unencoded_byte_array_data_bytes_; @@ -1737,6 +1741,182 @@ std::shared_ptr RleBooleanEncoder::FlushValues() { return buffer; } +// ---------------------------------------------------------------------- + +class FsstEncoder : public EncoderImpl, virtual public TypedEncoder { + public: + explicit FsstEncoder(const ColumnDescriptor* descr, MemoryPool* pool) + : EncoderImpl(descr, Encoding::FSST, pool), + encoder_(nullptr), + unencoded_values_() {} + + ~FsstEncoder() override { + if (encoder_) { + fsst_destroy(encoder_); + } + } + + int64_t EstimatedDataEncodedSize() override { + const 
int64_t total_size = pending_unencoded_bytes_; + const double scaled = static_cast<double>(total_size) * compression_ratio_hint_; + const int64_t estimated_payload = static_cast<int64_t>(std::ceil(scaled)); + return static_cast<int64_t>(sizeof(fsst_decoder_t)) + + std::max<int64_t>(0, estimated_payload); + } + + std::shared_ptr<Buffer> FlushValues() override { + if (unencoded_values_.empty()) { + PARQUET_ASSIGN_OR_THROW(auto buffer, AllocateBuffer(0, pool_)); + return buffer; + } + + BuildSymbolTable(); + + const int64_t total_input_size = pending_unencoded_bytes_; + + const int64_t decoder_bytes = static_cast<int64_t>(sizeof(fsst_decoder_t)); + const int64_t length_prefix_bytes = static_cast<int64_t>(unencoded_values_.size()) * + static_cast<int64_t>(sizeof(uint32_t)); + const int64_t estimated_buffer_size = decoder_bytes + + total_input_size * kFsstCompressionExpansion + + length_prefix_bytes; + + PARQUET_ASSIGN_OR_THROW(auto output_buffer, + AllocateResizableBuffer(estimated_buffer_size, pool_)); + uint8_t* out_ptr = output_buffer->mutable_data(); + + fsst_decoder_t decoder = fsst_decoder(encoder_); + memcpy(out_ptr, &decoder, sizeof(fsst_decoder_t)); + out_ptr += sizeof(fsst_decoder_t); + + int64_t total_output_size = sizeof(fsst_decoder_t); + std::vector<size_t> out_lengths(1); + std::vector<unsigned char*> out_ptrs(1); + + for (size_t i = 0; i < unencoded_values_.size(); ++i) { + const int64_t remaining_capacity = + estimated_buffer_size - total_output_size - sizeof(uint32_t); + if (ARROW_PREDICT_FALSE(remaining_capacity <= 0)) { + throw ParquetException("FSST compression buffer exhausted"); + } + size_t available = static_cast<size_t>(remaining_capacity); + size_t input_length = static_cast<size_t>(unencoded_values_[i].len); + const unsigned char* input_ptr = unencoded_values_[i].ptr; + size_t num_compressed = + fsst_compress(encoder_, 1, &input_length, &input_ptr, available, + out_ptr + sizeof(uint32_t), out_lengths.data(), out_ptrs.data()); + + if (num_compressed == 0) { + throw ParquetException("FSST compression buffer too small"); + } + + uint32_t len = 
static_cast(out_lengths[0]); + memcpy(out_ptr, &len, sizeof(uint32_t)); + out_ptr += sizeof(uint32_t) + out_lengths[0]; + total_output_size += sizeof(uint32_t) + out_lengths[0]; + } + + PARQUET_THROW_NOT_OK(output_buffer->Resize(total_output_size)); + UpdateCompressionStats(total_input_size, total_output_size); + if (encoder_ != nullptr) { + fsst_destroy(encoder_); + encoder_ = nullptr; + } + unencoded_values_.clear(); + pending_unencoded_bytes_ = 0; + unencoded_byte_array_data_bytes_ = 0; + return output_buffer; + } + + using TypedEncoder::Put; + + void Put(const ByteArray* src, int num_values) override { + for (int i = 0; i < num_values; ++i) { + unencoded_values_.push_back(src[i]); + const int64_t length = static_cast(src[i].len); + unencoded_byte_array_data_bytes_ += length; + pending_unencoded_bytes_ += length; + } + } + + void Put(const ::arrow::Array& values) override { + AssertVarLengthBinary(values); + + PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<::arrow::BinaryType>( + *values.data(), + [&](::std::string_view view) { + if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) { + return Status::Invalid("Parquet cannot store strings with size 2GB or more"); + } + ByteArray val; + val.len = static_cast(view.size()); + val.ptr = reinterpret_cast(view.data()); + unencoded_values_.push_back(val); + const int64_t length = static_cast(val.len); + unencoded_byte_array_data_bytes_ += length; + pending_unencoded_bytes_ += length; + return Status::OK(); + }, + []() { return Status::OK(); })); + } + + void PutSpaced(const ByteArray* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + if (valid_bits != NULLPTR) { + PARQUET_ASSIGN_OR_THROW( + auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(ByteArray), pool_)); + auto buffer_ptr = reinterpret_cast(buffer->mutable_data()); + int num_valid_values = ::arrow::util::internal::SpacedCompress( + src, num_values, valid_bits, valid_bits_offset, buffer_ptr); + Put(buffer_ptr, 
num_valid_values); + } else { + Put(src, num_values); + } + } + + private: + void BuildSymbolTable() { + if (encoder_ != nullptr) { + fsst_destroy(encoder_); + encoder_ = nullptr; + } + + if (unencoded_values_.empty()) { + return; + } + + std::vector input_lengths; + std::vector input_ptrs; + + for (const auto& val : unencoded_values_) { + input_lengths.push_back(val.len); + input_ptrs.push_back(val.ptr); + } + + encoder_ = + fsst_create(unencoded_values_.size(), input_lengths.data(), input_ptrs.data(), 0); + + if (!encoder_) { + throw ParquetException("Failed to create FSST encoder"); + } + } + + void UpdateCompressionStats(int64_t input_bytes, int64_t payload_bytes) { + const double ratio = static_cast(std::max(0, payload_bytes)) / + static_cast(std::max(int64_t{1}, input_bytes)); + compression_ratio_hint_ = std::max(kMinimumCompressionRatio, ratio); + } + + static constexpr double kDefaultCompressionRatio = 1.0; + static constexpr double kMinimumCompressionRatio = 0.01; + // Allocate worst-case 4x expansion. 
+ static constexpr int64_t kFsstCompressionExpansion = 4; + fsst_encoder_t* encoder_; + std::vector unencoded_values_; + double compression_ratio_hint_ = kDefaultCompressionRatio; + int64_t pending_unencoded_bytes_ = 0; +}; + } // namespace // ---------------------------------------------------------------------- @@ -1821,6 +2001,13 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin default: throw ParquetException("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY"); } + } else if (encoding == Encoding::FSST) { + switch (type_num) { + case Type::BYTE_ARRAY: + return std::make_unique(descr, pool); + default: + throw ParquetException("FSST encoding only supports BYTE_ARRAY"); + } } else if (encoding == Encoding::RLE) { switch (type_num) { case Type::BOOLEAN: diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 66a3f7647fa..38f21d112c5 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,9 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_dict.h" #include "arrow/array/concatenate.h" +#include "arrow/buffer.h" #include "arrow/compute/cast.h" +#include "arrow/io/memory.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" @@ -41,7 +44,13 @@ #include "arrow/util/endian.h" #include "arrow/util/span.h" #include "arrow/util/string.h" +#include "fsst.h" // NOLINT(build/include_subdir) +#include "parquet/column_page.h" +#include "parquet/column_reader.h" #include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" #include "parquet/platform.h" #include "parquet/schema.h" #include "parquet/test_util.h" @@ -2593,4 +2602,305 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) { } } +// ---------------------------------------------------------------------- + 
+TEST(TestFsstEncoding, BasicRoundTrip) { + ::arrow::random::RandomArrayGenerator rag(0); + constexpr int64_t kNumValues = 2048; + constexpr int64_t kNumUnique = 128; + constexpr int32_t kMinLength = 4; + constexpr int32_t kMaxLength = 48; + + auto values = + rag.BinaryWithRepeats(kNumValues, kNumUnique, kMinLength, kMaxLength, 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + ASSERT_GT(encoded->size(), 0); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + std::vector decoded(values->length()); + ASSERT_EQ(static_cast(values->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + const auto& binary = checked_cast(*values); + for (int64_t i = 0; i < values->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } +} + +TEST(TestFsstEncoding, FlushEmptyReturnsEmptyBuffer) { + auto encoder = MakeTypedEncoder(Encoding::FSST); + auto buffer = encoder->FlushValues(); + ASSERT_NE(nullptr, buffer); + ASSERT_EQ(0, buffer->size()); +} + +TEST(TestFsstEncoding, ReportUnencodedBytesResetsCounter) { + ::arrow::random::RandomArrayGenerator rag(42); + auto values = rag.String(256, 1, 24, 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + + const auto& binary = checked_cast(*values); + const int64_t expected = binary.total_values_length(); + EXPECT_EQ(expected, encoder->ReportUnencodedDataBytes()); + EXPECT_EQ(0, encoder->ReportUnencodedDataBytes()); + + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + 
static_cast(encoded->size())); + + std::vector decoded(values->length()); + ASSERT_EQ(static_cast(values->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + for (int64_t i = 0; i < values->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } + + EXPECT_EQ(0, encoder->ReportUnencodedDataBytes()); +} + +TEST(TestFsstEncoding, MultiPageRoundTrip) { + constexpr int64_t kStringsPerPage = 10000; + constexpr int64_t kNumPages = 2; + constexpr int64_t kTotalValues = kStringsPerPage * kNumPages; + constexpr int64_t kPageSizeBytes = 512 * 1024; + + std::vector page1; + std::vector page2; + page1.reserve(kStringsPerPage); + page2.reserve(kStringsPerPage); + + for (int64_t i = 0; i < kStringsPerPage; ++i) { + page1.push_back("https://www.example.com/user/profile/page" + std::to_string(i) + + "/settings/preferences/advanced/options"); + page2.push_back("ftp://ftp.downloads.org/pub/software/release" + std::to_string(i) + + "/package/installer/binaries/archive"); + } + + std::vector expected; + expected.reserve(kTotalValues); + expected.insert(expected.end(), page1.begin(), page1.end()); + expected.insert(expected.end(), page2.begin(), page2.end()); + + ASSERT_OK_AND_ASSIGN(auto output_stream, ::arrow::io::BufferOutputStream::Create()); + + schema::NodeVector fields; + fields.push_back(schema::PrimitiveNode::Make("url", Repetition::REQUIRED, + Type::BYTE_ARRAY, ConvertedType::UTF8)); + auto parquet_schema = std::static_pointer_cast( + schema::GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + auto writer_props = WriterProperties::Builder() + .encoding(Encoding::FSST) + ->data_pagesize(kPageSizeBytes) + ->build(); + + std::unique_ptr writer; + ASSERT_NO_THROW( + writer = ParquetFileWriter::Open(output_stream, parquet_schema, writer_props)); + ASSERT_NE(nullptr, writer); + auto* row_group_writer = 
writer->AppendRowGroup(); + ASSERT_NE(nullptr, row_group_writer); + auto* column_writer = + static_cast*>(row_group_writer->NextColumn()); + ASSERT_NE(nullptr, column_writer); + + auto write_page = [&](const std::vector& source) { + std::vector batch; + batch.reserve(source.size()); + for (const auto& value : source) { + batch.emplace_back(value); + } + column_writer->WriteBatch(static_cast(batch.size()), nullptr, nullptr, + batch.data()); + }; + + write_page(page1); + write_page(page2); + + column_writer->Close(); + row_group_writer->Close(); + ASSERT_NO_THROW(writer->Close()); + + ASSERT_OK_AND_ASSIGN(auto buffer, output_stream->Finish()); + + auto make_reader = [&buffer]() -> std::unique_ptr { + auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(buffer); + return ParquetFileReader::Open(buffer_reader); + }; + + std::unique_ptr page_reader_file; + ASSERT_NO_THROW(page_reader_file = make_reader()); + ASSERT_NE(nullptr, page_reader_file); + auto page_row_group = page_reader_file->RowGroup(0); + auto page_reader = page_row_group->GetColumnPageReader(0); + + int data_page_count = 0; + while (page_reader->NextPage() != nullptr) { + ++data_page_count; + } + EXPECT_GT(data_page_count, 1); + + std::unique_ptr reader; + ASSERT_NO_THROW(reader = make_reader()); + ASSERT_NE(nullptr, reader); + auto row_group_reader = reader->RowGroup(0); + auto column_reader = std::static_pointer_cast>( + row_group_reader->Column(0)); + + std::vector decoded(kTotalValues); + int64_t values_read = 0; + while (values_read < kTotalValues) { + int64_t batch_length = std::min(1024, kTotalValues - values_read); + int64_t batch_read = 0; + column_reader->ReadBatch(batch_length, nullptr, nullptr, decoded.data() + values_read, + &batch_read); + ASSERT_GT(batch_read, 0); + values_read += batch_read; + } + ASSERT_EQ(values_read, kTotalValues); + + for (int64_t i = 0; i < kTotalValues; ++i) { + ASSERT_EQ(expected[i].size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, 
memcmp(expected[i].data(), decoded[i].ptr, expected[i].size())) << i; + } +} + +TEST(TestFsstEncoding, MultipleFlushesProduceIndependentPages) { + ::arrow::random::RandomArrayGenerator rag(99); + auto encoder = MakeTypedEncoder(Encoding::FSST); + + const std::vector> batches = { + {64, 3, 12}, {512, 2, 24}, {17, 1, 8}}; + + for (const auto& [size, min_len, max_len] : batches) { + auto batch = rag.String(size, min_len, max_len, 0.0); + + ASSERT_NO_THROW(encoder->Put(*batch)); + auto buffer = encoder->FlushValues(); + ASSERT_NE(nullptr, buffer); + ASSERT_GT(buffer->size(), 0); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(batch->length()), buffer->data(), + static_cast(buffer->size())); + + std::vector decoded(batch->length()); + ASSERT_EQ(static_cast(batch->length()), + decoder->Decode(decoded.data(), static_cast(decoded.size()))); + + const auto& binary = checked_cast(*batch); + for (int64_t i = 0; i < batch->length(); ++i) { + auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } + + encoder->ReportUnencodedDataBytes(); + } +} + +TEST(TestFsstEncoding, DecodeRejectsOverstatedLength) { + ::arrow::random::RandomArrayGenerator rag(123); + auto values = rag.String(64, 3, 32, 0.0); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + ASSERT_GT(encoded->size(), + static_cast(sizeof(fsst_decoder_t) + sizeof(uint32_t))); + + ASSERT_OK_AND_ASSIGN(auto corrupted, + ::arrow::AllocateBuffer(encoded->size(), default_memory_pool())); + std::memcpy(corrupted->mutable_data(), encoded->data(), encoded->size()); + + auto* length_ptr = + reinterpret_cast(corrupted->mutable_data() + sizeof(fsst_decoder_t)); + *length_ptr = static_cast(encoded->size()); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + 
decoder->SetData(static_cast(values->length()), corrupted->data(), + static_cast(corrupted->size())); + + std::vector decoded(values->length()); + EXPECT_THROW(decoder->Decode(decoded.data(), static_cast(decoded.size())), + ParquetException); +} + +TEST(TestFsstEncoding, SetDataRejectsShortHeader) { + auto decoder = MakeTypedDecoder(Encoding::FSST); + + const int num_values = 1; + const int header_bytes = static_cast(sizeof(fsst_decoder_t)); + std::vector truncated(static_cast(header_bytes - 1), 0); + + EXPECT_THROW( + decoder->SetData(num_values, truncated.data(), static_cast(truncated.size())), + ParquetException); +} + +TEST(TestFsstEncoding, HeavyNullsDecodeSpaced) { + ::arrow::random::RandomArrayGenerator rag(321); + constexpr int64_t kNumValues = 4096; + constexpr double kNullProb = 0.85; + + auto values = rag.String(kNumValues, 1, 24, kNullProb); + + auto encoder = MakeTypedEncoder(Encoding::FSST); + ASSERT_NO_THROW(encoder->Put(*values)); + auto encoded = encoder->FlushValues(); + ASSERT_NE(nullptr, encoded); + + auto decoder = MakeTypedDecoder(Encoding::FSST); + decoder->SetData(static_cast(values->length()), encoded->data(), + static_cast(encoded->size())); + + const auto& binary = checked_cast(*values); + std::vector decoded(values->length()); + std::vector valid_bits(::arrow::bit_util::BytesForBits(values->length()), 0); + + ::arrow::internal::BitmapWriter writer(valid_bits.data(), 0, values->length()); + for (int64_t i = 0; i < values->length(); ++i) { + if (!binary.IsNull(i)) { + writer.Set(); + } else { + writer.Clear(); + } + writer.Next(); + } + writer.Finish(); + + const int null_count = static_cast(binary.null_count()); + ASSERT_EQ(static_cast(values->length() - null_count), + decoder->DecodeSpaced(decoded.data(), static_cast(values->length()), + null_count, valid_bits.data(), 0)); + + for (int64_t i = 0; i < values->length(); ++i) { + if (binary.IsNull(i)) { + EXPECT_EQ(nullptr, decoded[i].ptr); + EXPECT_EQ(0u, decoded[i].len); + continue; + } + 
auto view = binary.GetView(i); + ASSERT_EQ(view.size(), static_cast(decoded[i].len)) << i; + ASSERT_EQ(0, memcmp(decoded[i].ptr, view.data(), view.size())) << i; + } +} + } // namespace parquet::test diff --git a/cpp/src/parquet/fsst_compat.h b/cpp/src/parquet/fsst_compat.h new file mode 100644 index 00000000000..1c3ef3fae0b --- /dev/null +++ b/cpp/src/parquet/fsst_compat.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +// Provide minimal compatibility shims so third-party FSST sources +// can be compiled with the compilers Arrow supports. + +#if defined(_WIN32) && !defined(_MSC_VER) +# include + +// MinGW does not provide __cpuidex, but FSST only needs the CPUID +// leaf/sub-leaf variant that __cpuid_count implements. 
+static inline void arrow_fsst_cpuidex(int info[4], int function_id, int subfunction_id) { + __cpuid_count(function_id, subfunction_id, info[0], info[1], info[2], info[3]); +} +# define __cpuidex(info, function_id, subfunction_id) \ + arrow_fsst_cpuidex(info, function_id, subfunction_id) +#endif diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index e3cc5adb964..b94868ec378 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -629,6 +629,13 @@ enum Encoding { Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. */ BYTE_STREAM_SPLIT = 9; + + /** Fast Static Symbol Table (FSST) encoding for BYTE_ARRAY data. + FSST compresses strings using a symbol table that maps 1-8 byte sequences + to single-byte codes. It allows random access to compressed data and works + well with dictionary encoding by compressing dictionary values. + */ + FSST = 10; } /** diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 7109c3d1c83..7e5324814b0 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -259,6 +259,8 @@ std::string EncodingToString(Encoding::type t) { return "RLE_DICTIONARY"; case Encoding::BYTE_STREAM_SPLIT: return "BYTE_STREAM_SPLIT"; + case Encoding::FSST: + return "FSST"; default: return "UNKNOWN"; } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index c2040e555fd..0113ec86aa1 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -531,8 +531,9 @@ struct Encoding { DELTA_BYTE_ARRAY = 7, RLE_DICTIONARY = 8, BYTE_STREAM_SPLIT = 9, + FSST = 10, // Should always be last element (except UNKNOWN) - UNDEFINED = 10, + UNDEFINED = 11, UNKNOWN = 999 }; }; diff --git a/cpp/thirdparty/fsst/LICENSE b/cpp/thirdparty/fsst/LICENSE new file mode 100644 index 00000000000..0cd2c8c4652 --- /dev/null +++ b/cpp/thirdparty/fsst/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018-2020 CWI, TU Munich, FSU Jena + +Permission is hereby granted, free of 
charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/cpp/thirdparty/fsst/fsst.h b/cpp/thirdparty/fsst/fsst.h new file mode 100644 index 00000000000..71085d57201 --- /dev/null +++ b/cpp/thirdparty/fsst/fsst.h @@ -0,0 +1,227 @@ +/* + * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 + * + * =================================================================================================================================== + * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): + * + * Copyright 2018-2020, CWI, TU Munich, FSU Jena + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished 
to do so, subject to the following conditions: + * + * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst + * =================================================================================================================================== + * + * FSST: Fast Static Symbol Table compression + * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf + * + * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. + * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual + * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is + * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. + * + * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) + * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression + * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. 
These shorter byte-sequences could + * be seen as strings again and fit in whatever your program is that manipulates strings. + * + * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. + * + * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of + * unsigned char* pointers to their starts. A seperate length array (of unsigned int) denotes how many bytes each string consists of. + * + * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program + * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings). + * + * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are + * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length. + */ +#ifndef FSST_INCLUDED_H +#define FSST_INCLUDED_H + +#ifdef _MSC_VER +#define __restrict__ +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#define __ORDER_LITTLE_ENDIAN__ 2 +#include +static inline int __builtin_ctzl(unsigned long long x) { + unsigned long ret; + _BitScanForward64(&ret, x); + return (int)ret; +} +#endif + +#ifdef __cplusplus +#define FSST_FALLTHROUGH [[fallthrough]] +#include +extern "C" { +#else +#define FSST_FALLTHROUGH +#endif + +#include + +/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */ +#define FSST_ESC 255 + +/* Data structure needed for compressing strings - use fsst_duplicate() to create thread-local copies. Use fsst_destroy() to free. 
*/ +typedef void* fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */ + +/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */ +typedef struct { + unsigned long long version; /* version id */ + unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols */ + unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 < len[x] <= 8). */ + unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */ +} fsst_decoder_t; + +/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */ +fsst_encoder_t* +fsst_create( + size_t n, /* IN: number of strings in batch to sample from. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: string start pointers. */ + int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */ +); + +/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */ +fsst_encoder_t* +fsst_duplicate( + fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */ +); + +#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by fsst_export() resp. fsst_import() */ + +/* Space-efficient symbol table serialization (smaller than sizeof(fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */ +unsigned int /* OUT: number of bytes written in buf, at most sizeof(fsst_decoder_t) */ +fsst_export( + fsst_encoder_t *encoder, /* IN: the symbol table to dump. */ + unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table. */ +); + +/* Deallocate encoder. 
*/ +void +fsst_destroy(fsst_encoder_t*); + +/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */ +unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */ +fsst_import( + fsst_decoder_t *decoder, /* IN: this symbol table will be overwritten. */ + unsigned char const *buf /* IN: pointer to a byte-buffer where fsst_export() serialized this symbol table. */ +); + +/* Return a decoder structure from an encoder. */ +fsst_decoder_t +fsst_decoder( + fsst_encoder_t *encoder +); + +/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */ +/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */ +size_t /* OUT: the number of compressed strings (<=n) that fit the output buffer. */ +fsst_compress( + fsst_encoder_t *encoder, /* IN: encoder obtained from fsst_create(). */ + size_t nstrings, /* IN: number of strings in batch to compress. */ + const size_t lenIn[], /* IN: byte-lengths of the inputs */ + const unsigned char *strIn[], /* IN: input string start pointers. */ + size_t outsize, /* IN: byte-length of output buffer. */ + unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */ + size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */ + unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+size). */ +); + +/* Decompress a single string, inlined for speed. */ +inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */ +fsst_decompress( + const fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */ + size_t lenIn, /* IN: byte-length of compressed string. */ + const unsigned char *strIn, /* IN: compressed string. */ + size_t size, /* IN: byte-length of output buffer. 
*/ + unsigned char *output /* OUT: memory buffer to put the decompressed string in. */ +) { + unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len; + unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output; + unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol; + size_t code, posOut = 0, posIn = 0; +#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */ +#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long)) +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + while (posOut+32 <= size && posIn+4 <= lenIn) { + unsigned int nextBlock, escapeMask; + memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int)); + escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); + if (escapeMask == 0) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; + switch(firstEscapePos) { /* Duff's device */ + case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + // fall through + case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ + } + } + } + if (posOut+32 <= 
size) { // handle the possibly 3 last bytes without a loop + if (posIn+2 <= lenIn) { + strOut[posOut] = strIn[posIn+1]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + if (strIn[posIn] != FSST_ESC) { + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } else { + posIn += 2; strOut[posOut++] = strIn[posIn-1]; + } + } else { + posIn += 2; posOut++; + } + } + if (posIn < lenIn) { // last code cannot be an escape + code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; + } + } +#else + while (posOut+8 <= size && posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */ + FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */ + posOut += len[code]; + } else { + strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */ + posIn++; posOut++; + } +#endif +#endif + while (posIn < lenIn) + if ((code = strIn[posIn++]) < FSST_ESC) { + size_t posWrite = posOut, endWrite = posOut + len[code]; + unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite; + if ((posOut = endWrite) > size) endWrite = size; + for(; posWrite < endWrite; posWrite++) /* only write if there is room */ + strOut[posWrite] = symbolPointer[posWrite]; + } else { + if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */ + posIn++; posOut++; + } + if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0; + return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */ +} + +#ifdef __cplusplus +} +#endif +#endif /* FSST_INCLUDED_H */ diff --git a/cpp/thirdparty/fsst/fsst_avx512.cpp b/cpp/thirdparty/fsst/fsst_avx512.cpp new file mode 100644 index 00000000000..150683d2f9a --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512.cpp @@ -0,0 +1,150 @@ +// this software is distributed 
under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include "libfsst.hpp" + +#if defined(__x86_64__) || defined(_M_X64) +#include + +#ifdef _WIN32 +namespace libfsst { +bool fsst_hasAVX512() { + int info[4]; + __cpuidex(info, 0x00000007, 0); + return (info[1]>>16)&1; +} +} // namespace libfsst +#else +#include +namespace libfsst { +bool fsst_hasAVX512() { + int info[4]; + __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]); + return (info[1]>>16)&1; +} +} // namespace libfsst +#endif +#else +namespace libfsst { +bool fsst_hasAVX512() { return false; } +} // namespace libfsst +#endif + +namespace libfsst { + +// BULK COMPRESSION OF STRINGS +// +// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. 
+// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up. +// +// In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4 +// unroll3 performs best on my hardware +// +// In the worst case, each final encoded string occupies 512KB bytes (512*1024; with 1024=512xexception, exception = 2 bytes). +// - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs). +// +// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits) +// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order. +// +// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings. +// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process. +// - so 'processed' is the amount of strings we started processing and it is between [480,512]. +// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished. +// - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method. +// +// Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings. +// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings. +// +// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to. +// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch). 
+// +// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS +// in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_* expressions however, +// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them. +// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. +// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. + +size_t fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { + size_t processed = 0; + // define some constants (all_x means that all 8 lanes contain 64-bits value X) +#ifdef __AVX512F__ + //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c + __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); + __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); + __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); +#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) +#define all_ONE _mm512_srli_epi64(all_MASK, 63) +#define all_M19 _mm512_srli_epi64(all_MASK, 45) +#define all_M18 _mm512_srli_epi64(all_MASK, 46) +#define all_M28 _mm512_srli_epi64(all_MASK, 36) +#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) +#define all_FFFF _mm512_srli_epi64(all_MASK, 48) +#define all_FF _mm512_srli_epi64(all_MASK, 56) + + SIMDjob *inputEnd = input+n; + assert(n >= unroll*8 && n <= 512); // should be close to 512 + __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 + __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 
2b loaded new strings bitmask per unroll + u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll + + if (unroll >= 4) { + while (input+delta1+delta2+delta3+delta4 < inputEnd) { + #include "fsst_avx512_unroll4.inc" + } + } else if (unroll == 3) { + while (input+delta1+delta2+delta3 < inputEnd) { + #include "fsst_avx512_unroll3.inc" + } + } else if (unroll == 2) { + while (input+delta1+delta2 < inputEnd) { + #include "fsst_avx512_unroll2.inc" + } + } else { + while (input+delta1 < inputEnd) { + #include "fsst_avx512_unroll1.inc" + } + } + + // flush the job states of the unfinished strings at the end of output[] + processed = n - (inputEnd - input); + u32 unfinished = 0; + if (unroll > 1) { + if (unroll > 2) { + if (unroll > 3) { + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); + unfinished += _mm_popcnt_u32((int) loadmask4); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); + unfinished += _mm_popcnt_u32((int) loadmask3); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); + unfinished += _mm_popcnt_u32((int) loadmask2); + } + _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); +#else + (void) symbolTable; + (void) codeBase; + (void) symbolBase; + (void) input; + (void) output; + (void) n; + (void) unroll; +#endif + return processed; +} +} // namespace libfsst + diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc new file mode 100644 index 00000000000..f4b81c7970d --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll1.inc @@ -0,0 +1,57 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc new file mode 100644 index 00000000000..aa33cd7e69c --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll2.inc @@ -0,0 +1,114 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) 
loadmask2); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc new file mode 100644 index 00000000000..e2057032abd --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll3.inc @@ -0,0 +1,171 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the 
Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). 
+ __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. 
This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). 
+ __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. 
+ write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, 
i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = _mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + // write out the job state for the lanes that are done (we need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job 
state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; diff --git a/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc b/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc new file mode 100644 index 00000000000..15cca7c938b --- /dev/null +++ b/cpp/thirdparty/fsst/fsst_avx512_unroll4.inc @@ -0,0 +1,228 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// +// +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// +// +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// 
(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: +// +// +// +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+// +// +// +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E1PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E2PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E3PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, E4PRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// +// +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +// +// +// +// + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask1=11111111, delta1=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask2=11111111, delta2=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask3=11111111, delta3=8). + // load new jobs in the empty lanes (initially, all lanes are empty, so loadmask4=11111111, delta4=8). + job1 = _mm512_mask_expandloadu_epi64(job1, loadmask1, input); input += delta1; + job2 = _mm512_mask_expandloadu_epi64(job2, loadmask2, input); input += delta2; + job3 = _mm512_mask_expandloadu_epi64(job3, loadmask3, input); input += delta3; + job4 = _mm512_mask_expandloadu_epi64(job4, loadmask4, input); input += delta4; + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + // load the next 8 input string bytes (uncompressed data, aka 'symbols'). + __m512i word1 = _mm512_i64gather_epi64(_mm512_srli_epi64(job1, 46), symbolBase, 1); + __m512i word2 = _mm512_i64gather_epi64(_mm512_srli_epi64(job2, 46), symbolBase, 1); + __m512i word3 = _mm512_i64gather_epi64(_mm512_srli_epi64(job3, 46), symbolBase, 1); + __m512i word4 = _mm512_i64gather_epi64(_mm512_srli_epi64(job4, 46), symbolBase, 1); + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. 
It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // load 16-bits codes from the 2-byte-prefix keyed lookup table. It also store 1-byte codes in all free slots. + // code1: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code2: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code3: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + // code4: Lowest 8 bits contain the code. Eleventh bit is whether it is an escaped code. Next 4 bits is length (2 or 1). + __m512i code1 = _mm512_i64gather_epi64(_mm512_and_epi64(word1, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code2 = _mm512_i64gather_epi64(_mm512_and_epi64(word2, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code3 = _mm512_i64gather_epi64(_mm512_and_epi64(word3, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + __m512i code4 = _mm512_i64gather_epi64(_mm512_and_epi64(word4, all_FFFF), symbolTable.shortCodes, sizeof(u16)); + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. + // get the first three bytes of the string. 
+ __m512i pos1 = _mm512_mullo_epi64(_mm512_and_epi64(word1, all_FFFFFF), all_PRIME); + __m512i pos2 = _mm512_mullo_epi64(_mm512_and_epi64(word2, all_FFFFFF), all_PRIME); + __m512i pos3 = _mm512_mullo_epi64(_mm512_and_epi64(word3, all_FFFFFF), all_PRIME); + __m512i pos4 = _mm512_mullo_epi64(_mm512_and_epi64(word4, all_FFFFFF), all_PRIME); + // hash them into a random number: pos1 = pos1*PRIME; pos1 ^= pos1>>SHIFT + // hash them into a random number: pos2 = pos2*PRIME; pos2 ^= pos2>>SHIFT + // hash them into a random number: pos3 = pos3*PRIME; pos3 ^= pos3>>SHIFT + // hash them into a random number: pos4 = pos4*PRIME; pos4 ^= pos4>>SHIFT + pos1 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos1,_mm512_srli_epi64(pos1,FSST_SHIFT)), all_HASH), 4); + pos2 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos2,_mm512_srli_epi64(pos2,FSST_SHIFT)), all_HASH), 4); + pos3 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos3,_mm512_srli_epi64(pos3,FSST_SHIFT)), all_HASH), 4); + pos4 = _mm512_slli_epi64(_mm512_and_epi64(_mm512_xor_epi64(pos4,_mm512_srli_epi64(pos4,FSST_SHIFT)), all_HASH), 4); + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + // lookup in the 3-byte-prefix keyed hash table + __m512i icl1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 8), 1); + __m512i icl4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 8), 1); + // speculatively store the first input byte into the second position of the write1 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write2 register (in case it turns out to be an escaped byte). 
+ // speculatively store the first input byte into the second position of the write3 register (in case it turns out to be an escaped byte). + // speculatively store the first input byte into the second position of the write4 register (in case it turns out to be an escaped byte). + __m512i write1 = _mm512_slli_epi64(_mm512_and_epi64(word1, all_FF), 8); + __m512i write2 = _mm512_slli_epi64(_mm512_and_epi64(word2, all_FF), 8); + __m512i write3 = _mm512_slli_epi64(_mm512_and_epi64(word3, all_FF), 8); + __m512i write4 = _mm512_slli_epi64(_mm512_and_epi64(word4, all_FF), 8); + // lookup just like the icl1 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl2 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl3 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + // lookup just like the icl4 above, but loads the next 8 bytes. This fetches the actual string bytes in the hash table. + __m512i symb1 = _mm512_i64gather_epi64(pos1, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb2 = _mm512_i64gather_epi64(pos2, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb3 = _mm512_i64gather_epi64(pos3, (((char*) symbolTable.hashTab) + 0), 1); + __m512i symb4 = _mm512_i64gather_epi64(pos4, (((char*) symbolTable.hashTab) + 0), 1); + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). + // generate the FF..FF mask with an FF for each byte of the symbol (we need to AND the input with this to correctly check equality). 
+ pos1 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl1, all_FF)); + pos2 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl2, all_FF)); + pos3 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl3, all_FF)); + pos4 = _mm512_srlv_epi64(all_MASK, _mm512_and_epi64(icl4, all_FF)); + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + // check symbol < |str| as well as whether it is an occupied slot (cmplt checks both conditions at once) and check string equality (cmpeq). + __mmask8 match1 = _mm512_cmpeq_epi64_mask(symb1, _mm512_and_epi64(word1, pos1)) & _mm512_cmplt_epi64_mask(icl1, all_ICL_FREE); + __mmask8 match2 = _mm512_cmpeq_epi64_mask(symb2, _mm512_and_epi64(word2, pos2)) & _mm512_cmplt_epi64_mask(icl2, all_ICL_FREE); + __mmask8 match3 = _mm512_cmpeq_epi64_mask(symb3, _mm512_and_epi64(word3, pos3)) & _mm512_cmplt_epi64_mask(icl3, all_ICL_FREE); + __mmask8 match4 = _mm512_cmpeq_epi64_mask(symb4, _mm512_and_epi64(word4, pos4)) & _mm512_cmplt_epi64_mask(icl4, all_ICL_FREE); + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. + // for the hits, overwrite the codes with what comes from the hash table (codes for symbols of length >=3). The rest stays with what shortCodes gave. 
+ code1 = _mm512_mask_mov_epi64(code1, match1, _mm512_srli_epi64(icl1, 16)); + code2 = _mm512_mask_mov_epi64(code2, match2, _mm512_srli_epi64(icl2, 16)); + code3 = _mm512_mask_mov_epi64(code3, match3, _mm512_srli_epi64(icl3, 16)); + code4 = _mm512_mask_mov_epi64(code4, match4, _mm512_srli_epi64(icl4, 16)); + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + // write out the code byte as the first output byte. Notice that this byte may also be the escape code 255 (for escapes) coming from shortCodes. + write1 = _mm512_or_epi64(write1, _mm512_and_epi64(code1, all_FF)); + write2 = _mm512_or_epi64(write2, _mm512_and_epi64(code2, all_FF)); + write3 = _mm512_or_epi64(write3, _mm512_and_epi64(code3, all_FF)); + write4 = _mm512_or_epi64(write4, _mm512_and_epi64(code4, all_FF)); + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + // zip the irrelevant 6 bytes (just stay with the 2 relevant bytes containing the 16-bits code) + code1 = _mm512_and_epi64(code1, all_FFFF); + code2 = _mm512_and_epi64(code2, all_FFFF); + code3 = _mm512_and_epi64(code3, all_FFFF); + code4 = _mm512_and_epi64(code4, all_FFFF); + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. 
It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + // write out the compressed data. It writes 8 bytes, but only 1 byte is relevant :-(or 2 bytes are, in case of an escape code) + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job1, all_M19), write1, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job2, all_M19), write2, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job3, all_M19), write3, 1); + _mm512_i64scatter_epi64(codeBase, _mm512_and_epi64(job4, all_M19), write4, 1); + // increase the job1.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job2.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job3.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + // increase the job4.cur field in the job with the symbol length (for this, shift away 12 bits from the code) + job1 = _mm512_add_epi64(job1, _mm512_slli_epi64(_mm512_srli_epi64(code1, FSST_LEN_BITS), 46)); + job2 = _mm512_add_epi64(job2, _mm512_slli_epi64(_mm512_srli_epi64(code2, FSST_LEN_BITS), 46)); + job3 = _mm512_add_epi64(job3, _mm512_slli_epi64(_mm512_srli_epi64(code3, FSST_LEN_BITS), 46)); + job4 = _mm512_add_epi64(job4, _mm512_slli_epi64(_mm512_srli_epi64(code4, FSST_LEN_BITS), 46)); + // increase the job1.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job2.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job3.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + // increase the job4.out' field with one, or two in case of an escape code (add 1 plus the escape bit, i.e the 8th) + job1 = 
_mm512_add_epi64(job1, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code1, 8), all_ONE))); + job2 = _mm512_add_epi64(job2, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code2, 8), all_ONE))); + job3 = _mm512_add_epi64(job3, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code3, 8), all_ONE))); + job4 = _mm512_add_epi64(job4, _mm512_add_epi64(all_ONE, _mm512_and_epi64(_mm512_srli_epi64(code4, 8), all_ONE))); + // test which lanes are done now (job1.cur==job1.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job1 register) + // test which lanes are done now (job2.cur==job2.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job2 register) + // test which lanes are done now (job3.cur==job3.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job3 register) + // test which lanes are done now (job4.cur==job4.end), cur starts at bit 46, end starts at bit 28 (the highest 2x18 bits in the job4 register) + loadmask1 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job1, 46), _mm512_and_epi64(_mm512_srli_epi64(job1, 28), all_M18)); + loadmask2 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job2, 46), _mm512_and_epi64(_mm512_srli_epi64(job2, 28), all_M18)); + loadmask3 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job3, 46), _mm512_and_epi64(_mm512_srli_epi64(job3, 28), all_M18)); + loadmask4 = _mm512_cmpeq_epi64_mask(_mm512_srli_epi64(job4, 46), _mm512_and_epi64(_mm512_srli_epi64(job4, 28), all_M18)); + // calculate the amount of lanes in job1 that are done + // calculate the amount of lanes in job2 that are done + // calculate the amount of lanes in job3 that are done + // calculate the amount of lanes in job4 that are done + delta1 = _mm_popcnt_u32((int) loadmask1); + delta2 = _mm_popcnt_u32((int) loadmask2); + delta3 = _mm_popcnt_u32((int) loadmask3); + delta4 = _mm_popcnt_u32((int) loadmask4); + // write out the job state for the lanes that are done (we 
need the final 'job1.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job2.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job3.out' value to compute the compressed string length) + // write out the job state for the lanes that are done (we need the final 'job4.out' value to compute the compressed string length) + _mm512_mask_compressstoreu_epi64(output, loadmask1, job1); output += delta1; + _mm512_mask_compressstoreu_epi64(output, loadmask2, job2); output += delta2; + _mm512_mask_compressstoreu_epi64(output, loadmask3, job3); output += delta3; + _mm512_mask_compressstoreu_epi64(output, loadmask4, job4); output += delta4; diff --git a/cpp/thirdparty/fsst/libfsst.cpp b/cpp/thirdparty/fsst/libfsst.cpp new file mode 100644 index 00000000000..e3ba787b959 --- /dev/null +++ b/cpp/thirdparty/fsst/libfsst.cpp @@ -0,0 +1,651 @@ +// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include "libfsst.hpp" + +namespace libfsst { +Symbol concat(Symbol a, Symbol b) { + Symbol s; + u32 length = a.length()+b.length(); + if (length > Symbol::maxLength) length = Symbol::maxLength; + s.set_code_len(FSST_CODE_MASK, length); + s.store_num((b.load_num() << (8*a.length())) | a.load_num()); + return s; +} +} // namespace libfsst + +namespace std { +template <> +class hash { + public: + size_t operator()(const libfsst::QSymbol& q) const { + uint64_t k = q.symbol.load_num(); + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + uint64_t h = 0x8445d61a4e774912 ^ (8*m); + k *= m; + k ^= k >> r; + k *= m; + h ^= k; + h *= m; + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; + } +}; +} + +namespace libfsst { +bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } + +std::ostream& operator<<(std::ostream& out, const Symbol& s) { + for (u32 i=0; i line, const size_t len[], bool zeroTerminated=false) { + SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); + int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) + size_t sampleFrac = 128; + + // start by determining the terminator. 
We use the (lowest) most infrequent byte as terminator + st->zeroTerminated = zeroTerminated; + if (zeroTerminated) { + st->terminator = 0; // except in case of zeroTerminated mode, then byte 0 is terminator regardless frequency + } else { + u16 byteHisto[256]; + memset(byteHisto, 0, sizeof(byteHisto)); + for(size_t i=0; iterminator = 256; + while(i-- > 0) { + if (byteHisto[i] > minSize) continue; + st->terminator = i; + minSize = byteHisto[i]; + } + } + assert(st->terminator != 256); + + // a random number between 0 and 128 + auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); }; + + // compress sample, and compute (pair-)frequencies + auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain + int gain = 0; + + for(size_t i=0; i sampleFrac) continue; + } + if (cur < end) { + u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); + cur += st->symbols[code1].length(); + gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); + while (true) { + // count single symbol (i.e. an option is not extending it) + counters.count1Inc(code1); + + // as an alternative, consider just using the next byte.. + if (st->symbols[code1].length() != 1) // .. 
but do not count single byte symbols doubly + counters.count1Inc(*start); + + if (cur==end) { + break; + } + + // now match a new symbol + start = cur; + if (curhashTabSize-1); + Symbol s = st->hashTab[idx]; + code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; + word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) { + code2 = s.code(); + cur += s.length(); + } else if (code2 >= FSST_CODE_BASE) { + cur += 2; + } else { + code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK; + cur += 1; + } + } else { + code2 = st->findLongestSymbol(cur, end); + cur += st->symbols[code2].length(); + } + + // compute compressed output size + gain += ((int) (cur-start))-(1+isEscapeCode(code2)); + + if (sampleFrac < 128) { // no need to count pairs in final round + // consider the symbol that is the concatenation of the two last symbols + counters.count2Inc(code1, code2); + + // as an alternative, consider just extending with the next byte.. + if ((cur-start) > 1) // ..but do not count single byte extensions doubly + counters.count2Inc(code1, *start); + } + code1 = code2; + } + } + } + return gain; + }; + + auto makeTable = [&](SymbolTable *st, Counters &counters) { + // hashmap of c (needed because we can generate duplicate candidates) + unordered_set cands; + + // artificially make terminater the most frequent symbol so it gets included + u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator; + counters.count1Set(terminator,65535); + + auto addOrInc = [&](unordered_set &cands, Symbol s, u64 count) { + if (count < (5*sampleFrac)/128) return; // improves both compression speed (less candidates), but also quality!! 
+ QSymbol q; + q.symbol = s; + q.gain = count * s.length(); + auto it = cands.find(q); + if (it != cands.end()) { + q.gain += (*it).gain; + cands.erase(*it); + } + cands.insert(q); + }; + + // add candidate symbols based on counted frequency + for (u32 pos1=0; pos1nSymbols; pos1++) { + u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!! + if (!cnt1) continue; + + // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed + Symbol s1 = st->symbols[pos1]; + addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); + + if (sampleFrac >= 128 || // last round we do not create new (combined) symbols + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte + continue; + } + for (u32 pos2=0; pos2nSymbols; pos2++) { + u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! + if (!cnt2) continue; + + // create a new symbol + Symbol s2 = st->symbols[pos2]; + Symbol s3 = concat(s1, s2); + if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte + addOrInc(cands, s3, cnt2); + } + } + + // insert candidates into priority queue (by gain) + auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); }; + priority_queue,decltype(cmpGn)> pq(cmpGn); + for (auto& q : cands) + pq.push(q); + + // Create new symbol map using best candidates + st->clear(); + while (st->nSymbols < 255 && !pq.empty()) { + QSymbol q = pq.top(); + pq.pop(); + st->add(q.symbol); + } + }; + + u8 bestCounters[512*sizeof(u16)]; +#ifdef NONOPT_FSST + for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) { + sampleFrac = frac; +#else + for(sampleFrac=8; true; sampleFrac += 30) { +#endif + memset(&counters, 0, sizeof(Counters)); + long gain = compressCount(st, counters); + if (gain >= bestGain) 
{ // a new best solution! + counters.backup1(bestCounters); + *bestTable = *st; bestGain = gain; + } + if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128) + makeTable(st, counters); + } + delete st; + counters.restore1(bestCounters); + makeTable(bestTable, counters); + bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression + return bestTable; +} + +#ifndef NONOPT_FSST +static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, const size_t len[], const u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) { + size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size; + u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings + SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer + SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this) + size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs) + + while (curLine < nlines && outOff <= (1<<19)) { + size_t prevLine = curLine, chunk, curOff = 0; + + // bail out if the output buffer cannot hold the compressed next string fully + if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7 + else budget -= (len[curLine]-curOff)*2; + + strOut[curLine] = (u8*) 0; + lenOut[curLine] = 0; + + do { + do { + chunk = len[curLine] - curOff; + if (chunk > 511) { + chunk = 511; // large strings need to be chopped up into segments of 511 bytes + } + // create a job in this batch + SIMDjob job; + job.cur = inOff; + job.end = job.cur + chunk; + job.pos = batchPos; + job.out = outOff; + + // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros) + outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) 
bytes. + if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk + + // register job in this batch + input[batchPos] = job; + jobLine[batchPos] = curLine; + + if (chunk == 0) { + empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out + } else { + // copy string chunk into temp buffer + memcpy(symbolBase + inOff, line[curLine] + curOff, chunk); + inOff += chunk; + curOff += chunk; + symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded + } + if (++batchPos == 512) break; + } while(curOff < len[curLine]); + + if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines) || (((len[curLine])*2 + 7) > budget)) { // cannot accumulate more? + if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling) + // radix-sort jobs on length (longest string first) + // -- this provides best load balancing and allows to skip empty jobs at the end + u16 sortpos[513]; + memset(sortpos, 0, sizeof(sortpos)); + + // calculate length histo + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } + } + job.out = out - codeBase; + } + // postprocess job info + job.cur = 0; + job.end = job.out - input[job.pos].out; // misuse .end field as compressed size + job.out = input[job.pos].out; // reset offset to start of encoded string + input[job.pos] = job; + } + + // copy out the result data + for(size_t i=0; i> (u8) s.icl); + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { + *out++ = (u8) s.code(); cur += s.length(); + } else if (avoidBranch) { + // could be a 2-byte or 1-byte code, or miss + // handle everything with predication + 
*out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); + cur += (code>>FSST_LEN_BITS); + } else if ((u8) code < byteLim) { + // 2 byte code after checking there is no longer pattern + *out++ = (u8) code; cur += 2; + } else { + // 1 byte code or miss. + *out = (u8) code; + out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse + cur++; + } + } + } + }; + + for(curLine=0; curLine 511) { + chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST + } + if ((2*chunk+7) > (size_t) (lim-out)) { + return curLine; // out of memory + } + // copy the string to the 511-byte buffer + memcpy(buf, cur, chunk); + buf[chunk] = (u8) symbolTable.terminator; + cur = buf; + end = cur + chunk; + + // based on symboltable stats, choose a variant that is nice to the branch predictor + if (noSuffixOpt) { + compressVariant(true,false); + } else if (avoidBranch) { + compressVariant(false,true); + } else { + compressVariant(false, false); + } + } while((curOff += chunk) < lenIn[curLine]); + lenOut[curLine] = (size_t) (out - strOut[curLine]); + } + return curLine; +} + +#define FSST_SAMPLELINE ((size_t) 512) + +// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes +vector makeSample(u8* sampleBuf, const u8* strIn[], const size_t **lenRef, size_t nlines) { + size_t totSize = 0; + const size_t *lenIn = *lenRef; + vector sample; + + for(size_t i=0; i sample = makeSample(sampleBuf, strIn, &sampleLen, n?n:1); // careful handling of input to get a right-size and representative sample + Encoder *encoder = new Encoder(); + encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); + if (sampleLen != lenIn) delete[] sampleLen; + delete[] sampleBuf; + return (fsst_encoder_t*) encoder; +} + +/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol 
table */ +extern "C" fsst_encoder_t* fsst_duplicate(fsst_encoder_t *encoder) { + Encoder *e = new Encoder(); + e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr + return (fsst_encoder_t*) e; +} + +// export a symbol table in compact format. +extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) { + Encoder *e = (Encoder*) encoder; + // In ->version there is a versionnr, but we hide also suffixLim/terminator/nSymbols there. + // This is sufficient in principle to *reconstruct* a fsst_encoder_t from a fsst_decoder_t + // (such functionality could be useful to append compressed data to an existing block). + // + // However, the hash function in the encoder hash table is endian-sensitive, and given its + // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables. + // Doing a endian-conversion during hashing will be slow and self-defeating. + // + // Overall, we could support reconstructing an encoder for incremental compression, but + // should enforce equal-endianness. Bit of a bummer. Not going there now. 
+ // + // The version field is now there just for future-proofness, but not used yet + + // version allows keeping track of fsst versions, track endianness, and encoder reconstruction + u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0 + (((u64) e->symbolTable->suffixLim) << 24) | + (((u64) e->symbolTable->terminator) << 16) | + (((u64) e->symbolTable->nSymbols) << 8) | + FSST_ENDIAN_MARKER; // least significant byte is nonzero + + version = swap64_if_be(version); // ensure version is little-endian encoded + + /* do not assume unaligned reads here */ + memcpy(buf, &version, 8); + buf[8] = e->symbolTable->zeroTerminated; + for(u32 i=0; i<8; i++) + buf[9+i] = (u8) e->symbolTable->lenHisto[i]; + u32 pos = 17; + + // emit only the used bytes of the symbols + for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++) + for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++) + buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes + + return pos; // length of what was serialized +} + +#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */ + +extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 const *buf) { + u64 version = 0; + u32 code, pos = 17; + u8 lenHisto[8]; + + // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) + memcpy(&version, buf, 8); + version = swap64_if_be(version); // version is always little-endian encoded + + if ((version>>32) != FSST_VERSION) return 0; + decoder->zeroTerminated = buf[8]&1; + memcpy(lenHisto, buf+9, 8); + + // in case of zero-terminated, first symbol is "" (zero always, may be overwritten) + decoder->len[0] = 1; + decoder->symbol[0] = 0; + + // we use lenHisto[0] as 1-byte symbol run length (at the end) + code = decoder->zeroTerminated; + if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end 
+ + // now get all symbols from the buffer + for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */ + for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) { + decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */ + decoder->symbol[code] = 0; + for(u32 j=0; jlen[code]; j++) + ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols + } + } + if (decoder->zeroTerminated) lenHisto[0]++; + + // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols). + while(code<255) { + decoder->symbol[code] = FSST_CORRUPT; + decoder->len[code++] = 8; + } + return pos; +} + +// runtime check for simd +inline size_t _compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { +#ifndef NONOPT_FSST + if (simd && fsst_hasAVX512()) + return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +#endif + (void) simd; + return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch); +} +size_t compressImpl(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) { + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} + +// adaptive choosing of scalar compression method based on symbol length histogram +inline size_t _compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + bool avoidBranch = false, noSuffixOpt = false; + if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) { + noSuffixOpt = true; + } else if 
((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) && + (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) && + (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) { + avoidBranch = true; + } + return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd); +} +size_t compressAuto(Encoder *e, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { + return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); +} +} // namespace libfsst + +using namespace libfsst; +// the main compression function (everything automatic) +extern "C" size_t fsst_compress(fsst_encoder_t *encoder, size_t nlines, const size_t lenIn[], const u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { + // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) + size_t totLen = accumulate(lenIn, lenIn+nlines, 0); + int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15); + return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd); +} + +/* deallocate encoder */ +extern "C" void fsst_destroy(fsst_encoder_t* encoder) { + Encoder *e = (Encoder*) encoder; + delete e; +} + +/* very lazy implementation relying on export and import */ +extern "C" fsst_decoder_t fsst_decoder(fsst_encoder_t *encoder) { + u8 buf[sizeof(fsst_decoder_t)]; + u32 cnt1 = fsst_export(encoder, buf); + fsst_decoder_t decoder; + u32 cnt2 = fsst_import(&decoder, buf); + assert(cnt1 == cnt2); (void) cnt1; (void) cnt2; + return decoder; +} diff --git a/cpp/thirdparty/fsst/libfsst.hpp b/cpp/thirdparty/fsst/libfsst.hpp new file mode 100644 index 00000000000..f61bc0175b6 --- /dev/null +++ b/cpp/thirdparty/fsst/libfsst.hpp @@ -0,0 +1,471 @@ +// this software is distributed under the 
MIT License (http://www.opensource.org/licenses/MIT): +// +// Copyright 2018-2020, CWI, TU Munich, FSU Jena +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files +// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, +// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+// +// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "fsst.h" // the official FSST API -- also usable by C mortals + +/* unsigned integers */ +namespace libfsst { +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +} // namespace libfsst + +#if UINTPTR_MAX == 0xffffffffU +// We're on a 32-bit platform +#define NONOPT_FSST +#endif + +#define FSST_ENDIAN_MARKER ((u64) 1) +#define FSST_VERSION_20190218 20190218 +#define FSST_VERSION ((u64) FSST_VERSION_20190218) + +// "symbols" are character sequences (up to 8 bytes) +// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: +// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. + +// we represent codes in u16 (not u8). 
12 bits code (of which 10 are used), 4 bits length +#define FSST_LEN_BITS 12 +#define FSST_CODE_BITS 9 +#define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ +#define FSST_CODE_MAX (1UL<=8) { + len = 8; + memcpy(val.str, input, 8); + } else { + memcpy(val.str, input, len); + } + set_code_len(FSST_CODE_MAX, len); + } + void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + + u64 load_num() const { return swap64_if_be(val.num); } + void store_num(u64 v) { val.num = swap64_if_be(v); } + + u32 length() const { return (u32) (icl >> 28); } + u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } + u32 ignoredBits() const { return (u32) icl; } + + u8 first() const { assert( length() >= 1); return 0xFF & load_num(); } + u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } + +#define FSST_HASH_LOG2SIZE 10 +#define FSST_HASH_PRIME 2971215073LL +#define FSST_SHIFT 15 +#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) + size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes +}; + +// Symbol that can be put in a queue, ordered on gain +struct QSymbol{ + Symbol symbol; + mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols + bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } +}; + +// we construct FSST symbol tables using a random sample of about 16KB (1<<14) +#define FSST_SAMPLETARGET (1<<14) +#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) + +// two phases of compression, before and after optimize(): +// +// (1) to encode values we probe (and maintain) three datastructures: +// - u16 byteCodes[256] array at the position of the next byte (s.length==1) +// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) +// - Symbol 
hashtable[1024] (keyed by the next three bytes, ie for s.length>2), +// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are +// pseudo codes representing a single byte these will become escapes) +// +// (2) when we finished looking for the best symbol table we call optimize() to reshape it: +// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) +// length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes +// (allows shortcut during compression) +// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding +// to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one +// byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] +// and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction +// is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was +// hence added to make symbolTable construction faster. 
+// +// this final layout allows for the fastest compression code, only currently present in compressBulk + +// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 +#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket + +// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key +// ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster +// +// the gain field is only used in the symbol queue that sorts symbols on gain + +struct SymbolTable { + static const u32 hashTabSize = 1<> (u8) s.icl)); + return true; + } + bool add(Symbol s) { + assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); + u32 len = s.length(); + s.set_code_len(FSST_CODE_BASE + nSymbols, len); + if (len == 1) { + byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + } + if (s.length() >= 2) { + u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; + if (code >= FSST_CODE_BASE) return code; + } + return byteCodes[s.first()] & FSST_CODE_MASK; + } + u16 findLongestSymbol(const u8* cur, const u8* end) const { + return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol + } + + // rationale for finalize: + // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() + // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() + // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) + // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher 
compression perf. + // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). + // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] + // Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte + // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. + // + // In all, we change the layout and coding, as follows.. + // + // before finalize(): + // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 + // - The first 256 codes are pseudo symbols (all escaped bytes) + // + // after finalize(): + // - table layout is symbols[0..nSymbols>, with nSymbols < 256. + // - Real codes are [0,nSymbols>. 8-th bit not set. + // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 + // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) + // the two-byte codes are split in two sections: + // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression + // + // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). 
+ // + void finalize(u8 zeroTerminated) { + assert(nSymbols <= 255); + u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); + + // compute running sum of code lengths (starting offsets for each length) + rsum[0] = byteLim; // 1-byte codes are highest + rsum[1] = zeroTerminated; + for(u32 i=1; i<7; i++) + rsum[i+1] = rsum[i] + lenHisto[i]; + + // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim) + suffixLim = rsum[1]; + symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) + + for(u32 i=zeroTerminated, j=rsum[2]; i 1 && first2 == s2.first2()) // test if symbol k is a suffix of s + opt = 0; + } + newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim + } else + newCode[i] = rsum[len-1]++; + s1.set_code_len(newCode[i],len); + symbols[newCode[i]] = s1; + } + // renumber the codes in byteCodes[] + for(u32 i=0; i<256; i++) + if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); + else + byteCodes[i] = 511 + (1 << FSST_LEN_BITS); + + // renumber the codes in shortCodes[] + for(u32 i=0; i<65536; i++) + if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) + shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); + else + shortCodes[i] = byteCodes[i&0xFF]; + + // replace the symbols in the hash table + for(u32 i=0; i>8; + } + void count1Inc(u32 pos1) { + if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) + count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... + } + void count2Inc(u32 pos1, u32 pos2) { + if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). 
This means (high > 0) <=> (cnt > 0) + // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively + count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample) + } + u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range + // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros + u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7] + + u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes + high = (high >> (zero << 3)) & 255; // advance to nonzero counter + if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 + return 0; // all zero + + u32 low = count1Low[pos1]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range + // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros + u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15] + high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters + + u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters + high = (high >> (zero << 2)) & 15; // advance to nonzero counter + if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! 
advance pos2 + return 0UL; // all zero + + u32 low = count2Low[pos1][pos2]; + if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) + return (u32) ((high << 8) + low); + } + void backup1(u8 *buf) { + memcpy(buf, count1High, FSST_CODE_MAX); + memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX); + } + void restore1(u8 *buf) { + memcpy(count1High, buf, FSST_CODE_MAX); + memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX); + } +}; +#endif + + +#define FSST_BUFSZ (3<<19) // 768KB + +// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression +struct Encoder { + shared_ptr symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) + union { + Counters counters; // for counting symbol occurences during map construction + u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) + }; +}; + +// job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) +struct SIMDjob { + u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) +}; + +extern bool +fsst_hasAVX512(); // runtime check for avx512 capability + +extern size_t +fsst_compressAVX512( + SymbolTable &symbolTable, + u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) + u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) + SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. + SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer. 
+ size_t n, // IN: size of arrays input and output (should be max 512) + size_t unroll); // IN: degree of SIMD unrolling + +// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) +size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); +size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); +} // namespace libfsst diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 7ba1f4f876b..ad84ffc98ba 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -66,6 +66,8 @@ ARROW_CARES_BUILD_VERSION=1.17.2 ARROW_CARES_BUILD_SHA256_CHECKSUM=4803c844ce20ce510ef0eb83f8ea41fa24ecaae9d280c468c582d2bb25b3913d ARROW_CRC32C_BUILD_VERSION=1.1.2 ARROW_CRC32C_BUILD_SHA256_CHECKSUM=ac07840513072b7fcebda6e821068aa04889018f24e10e46181068fb214d7e56 +ARROW_FSST_BUILD_VERSION=89f49c580c6388acf3b6ed2a49e1bfde6c05e616 +ARROW_FSST_BUILD_SHA256_CHECKSUM=5921d2b837800e1c886903d18971994587328b3e5d08d32eb8e80c00890c304d ARROW_GBENCHMARK_BUILD_VERSION=v1.8.3 ARROW_GBENCHMARK_BUILD_SHA256_CHECKSUM=6bc180a57d23d4d9515519f92b0c83d61b05b5bab188961f36ac7b06b0d9e9ce ARROW_GFLAGS_BUILD_VERSION=v2.2.2 @@ -144,6 +146,7 @@ DEPENDENCIES=( "ARROW_BZIP2_URL bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" "ARROW_CARES_URL cares-${ARROW_CARES_BUILD_VERSION}.tar.gz https://github.com/c-ares/c-ares/releases/download/cares-${ARROW_CARES_BUILD_VERSION//./_}/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" "ARROW_CRC32C_URL crc32c-${ARROW_CRC32C_BUILD_VERSION}.tar.gz https://github.com/google/crc32c/archive/refs/tags/${ARROW_CRC32C_BUILD_VERSION}.tar.gz" + "ARROW_FSST_URL fsst-${ARROW_FSST_BUILD_VERSION}.tar.gz 
https://github.com/cwida/fsst/archive/${ARROW_FSST_BUILD_VERSION}.tar.gz" "ARROW_GBENCHMARK_URL gbenchmark-${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz https://github.com/google/benchmark/archive/${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz" "ARROW_GFLAGS_URL gflags-${ARROW_GFLAGS_BUILD_VERSION}.tar.gz https://github.com/gflags/gflags/archive/${ARROW_GFLAGS_BUILD_VERSION}.tar.gz" "ARROW_GLOG_URL glog-${ARROW_GLOG_BUILD_VERSION}.tar.gz https://github.com/google/glog/archive/${ARROW_GLOG_BUILD_VERSION}.tar.gz" diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 5212dc47e0b..0544ab3b1f0 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -31,6 +31,7 @@ cpp/src/generated/substrait/* cpp/thirdparty/flatbuffers/include/flatbuffers/base.h cpp/thirdparty/flatbuffers/include/flatbuffers/flatbuffers.h cpp/thirdparty/flatbuffers/include/flatbuffers/stl_emulation.h +cpp/thirdparty/fsst/* dev/requirements*.txt dev/archery/MANIFEST.in dev/archery/requirements*.txt diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 14cd3e363a4..86f9f27305a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1487,6 +1487,7 @@ cdef encoding_name_from_enum(ParquetEncoding encoding_): ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY', ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY', ParquetEncoding_BYTE_STREAM_SPLIT: 'BYTE_STREAM_SPLIT', + ParquetEncoding_FSST: 'FSST', }.get(encoding_, 'UNKNOWN') @@ -1499,6 +1500,7 @@ cdef encoding_enum_from_name(str encoding_name): 'DELTA_BINARY_PACKED': ParquetEncoding_DELTA_BINARY_PACKED, 'DELTA_LENGTH_BYTE_ARRAY': ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY, 'DELTA_BYTE_ARRAY': ParquetEncoding_DELTA_BYTE_ARRAY, + 'FSST': ParquetEncoding_FSST, 'RLE_DICTIONARY': 'dict', 'PLAIN_DICTIONARY': 'dict', }.get(encoding_name, None) diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd index 42d48ba050f..3663b8bcf62 
100644 --- a/python/pyarrow/includes/libparquet.pxd +++ b/python/pyarrow/includes/libparquet.pxd @@ -130,6 +130,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY" ParquetEncoding_BYTE_STREAM_SPLIT \ " parquet::Encoding::BYTE_STREAM_SPLIT" + ParquetEncoding_FSST" parquet::Encoding::FSST" enum ParquetCompression" parquet::Compression::type": ParquetCompression_UNCOMPRESSED" parquet::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 24cb586c82b..5fb3cc5b699 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -821,7 +821,7 @@ def _sanitize_table(table, new_schema, flavor): Can only be used when ``use_dictionary`` is set to False, and cannot be used in combination with ``use_byte_stream_split``. Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', - 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY', 'FSST'}. Certain encodings are only compatible with certain data types. Please refer to the encodings section of `Reading and writing Parquet files `_.