diff --git a/.circleci/config.yml b/.circleci/config.yml index 89f1e8a2..05f59074 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -18,7 +18,7 @@ jobs: CXX=/usr/bin/clang++-6.0 cmake -G 'Unix Makefiles' -DCMAKE_BUILD_TYPE=Debug -DUSE_SIMD=SSE .. && make - run: - name: Compile with Clang 6 (Debug mode/SSE) + name: Compile with Clang 6 (Debug mode/AVX) command: > rm -rf build && mkdir build && cd build && CXX=/usr/bin/clang++-6.0 cmake -G 'Unix Makefiles' -DCMAKE_BUILD_TYPE=Debug -DUSE_SIMD=AVX .. && diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh index 9658c255..b64fe42e 100755 --- a/scripts/benchmark.sh +++ b/scripts/benchmark.sh @@ -44,24 +44,22 @@ fi chunk_size=51200 # for rs-fnt with different packet sizes -word_size=2 for word_size in 1 2; do - for type_size in 2 4; do - max_len=$((256**word_size)) - if ((type_size>word_size)); then - for ec_type in rs-fnt rs-fnt-sys; do - for k in 16 64; do - for n in 32 256 1024; do - if ((nk)); then - m=$((n-k)) - for pkt_size in 512; do - ${bin} -e ${ec_type} -w ${word_size} -t ${type_size} -k ${k} -m ${m} -c ${chunk_size} -s ${sce_type} -g ${threads_nb} -f ${show_type} -p ${pkt_size} -n ${samples_nb} - show_type=0 - done - fi + type_size=$((word_size*2)) + max_len=$((256**word_size)) + if ((type_size>word_size)); then + for ec_type in rs-fnt rs-fnt-sys; do + for k in 16 64; do + for n in 32 256 1024; do + if ((nk)); then + m=$((n-k)) + for pkt_size in 512; do + ${bin} -e ${ec_type} -w ${word_size} -t ${type_size} -k ${k} -m ${m} -c ${chunk_size} -s ${sce_type} -g ${threads_nb} -f ${show_type} -p ${pkt_size} -n ${samples_nb} + show_type=0 done - done + fi done - fi - done + done + done + fi done diff --git a/scripts/test_ec.sh b/scripts/test_ec.sh index 4b133150..c1767187 100755 --- a/scripts/test_ec.sh +++ b/scripts/test_ec.sh @@ -174,7 +174,7 @@ do fec_type=$(echo $i|cut -d_ -f1) word_size=$(echo $i|cut -d_ -f2) - do_test enconly ${fec_type} ${word_size} 50 50 "" "" +# do_test enconly ${fec_type} 
${word_size} 50 50 "" "" do_test all ${fec_type} ${word_size} 3 3 "" "" do_test all ${fec_type} ${word_size} 3 3 "0 1" "0" do_test all ${fec_type} ${word_size} 3 5 "0 1" "0" diff --git a/src/fec_base.h b/src/fec_base.h index 2802c007..fcb4b7a3 100644 --- a/src/fec_base.h +++ b/src/fec_base.h @@ -118,7 +118,8 @@ class FecCode { unsigned word_size, unsigned n_data, unsigned n_parities, - size_t pkt_size = 8); + size_t pkt_size = 8, + bool use_meta_buf = false); virtual ~FecCode() = default; /** Return the number of output parts. @@ -173,8 +174,8 @@ class FecCode { bool readw(T* ptr, std::istream* stream); bool writew(T val, std::ostream* stream); - bool read_pkt(char* pkt, std::istream& stream); - bool write_pkt(char* pkt, std::ostream& stream, size_t bytes); + bool read_pkt(char* pkt, std::istream* stream); + bool write_pkt(char* pkt, std::ostream* stream, size_t bytes); void encode_streams_horizontal( std::vector input_data_bufs, @@ -250,6 +251,9 @@ class FecCode { std::unique_ptr> r_powers = nullptr; // buffers for intermediate symbols used for systematic FNT std::unique_ptr> dec_inter_codeword; + // vector for intermediate symbols used for systematic FNT + std::unique_ptr> vec_dec_inter_codeword; + bool use_meta_buf = false; // pure abstract methods that will be defined in derived class virtual void check_params() = 0; @@ -300,7 +304,8 @@ FecCode::FecCode( unsigned word_size, unsigned n_data, unsigned n_parities, - size_t pkt_size) + size_t pkt_size, + bool use_meta_buf) { assert(type == FecType::SYSTEMATIC || type == FecType::NON_SYSTEMATIC); @@ -312,7 +317,8 @@ FecCode::FecCode( this->n_outputs = (type == FecType::SYSTEMATIC) ? this->n_parities : this->code_len; this->pkt_size = pkt_size; - this->buf_size = pkt_size * word_size; + this->buf_size = use_meta_buf ? 
pkt_size * sizeof(T) : pkt_size * word_size; + this->use_meta_buf = use_meta_buf; } template @@ -384,15 +390,15 @@ inline bool FecCode::writew(T val, std::ostream* stream) } template -inline bool FecCode::read_pkt(char* pkt, std::istream& stream) +inline bool FecCode::read_pkt(char* pkt, std::istream* stream) { - return static_cast(stream.read(pkt, buf_size)); + return static_cast(stream->read(pkt, buf_size)); } template -inline bool FecCode::write_pkt(char* pkt, std::ostream& stream, size_t bytes) +inline bool FecCode::write_pkt(char* pkt, std::ostream* stream, size_t bytes) { - return static_cast(stream.write(pkt, bytes)); + return static_cast(stream->write(pkt, bytes)); } /** Encode streams @@ -477,21 +483,15 @@ void FecCode::encode_streams_vertical( bool cont = true; off_t offset = 0; - // vector of buffers storing data read from chunk - vec::Buffers words_char(n_data, buf_size); - const std::vector words_mem_char = words_char.get_mem(); // vector of buffers storing data that are performed in encoding, i.e. FFT - vec::Buffers words(n_data, pkt_size); - const std::vector words_mem_T = words.get_mem(); + vec::Buffers words(n_data, pkt_size, use_meta_buf); + const std::vector& words_mem = words.get_mem(); int output_len = get_n_outputs(); // vector of buffers storing data that are performed in encoding, i.e. 
FFT - vec::Buffers output(output_len, pkt_size); - const std::vector output_mem_T = output.get_mem(); - // vector of buffers storing data in output chunk - vec::Buffers output_char(output_len, buf_size); - const std::vector output_mem_char = output_char.get_mem(); + vec::Buffers output(output_len, pkt_size, use_meta_buf); + const std::vector& output_mem = output.get_mem(); reset_stats_enc(); @@ -501,11 +501,13 @@ void FecCode::encode_streams_vertical( while (cont) { for (unsigned i = 0; i < n_data; i++) { - if (!read_pkt(words_mem_char.at(i), *(input_data_bufs[i]))) { + if (!read_pkt( + reinterpret_cast(words_mem.at(i)), + input_data_bufs[i])) { read_bytes = input_data_bufs[i]->gcount(); // Zero-out trailing part std::fill_n( - words_mem_char.at(i) + read_bytes, + reinterpret_cast(words_mem.at(i)) + read_bytes, buf_size - read_bytes, 0); @@ -517,8 +519,9 @@ void FecCode::encode_streams_vertical( break; } - vec::pack( - words_mem_char, words_mem_T, n_data, pkt_size, word_size); + if (use_meta_buf) { + words.reset_meta(); + } timeval t1 = tick(); uint64_t start = hw_timer(); @@ -530,12 +533,11 @@ void FecCode::encode_streams_vertical( total_encode_cycles += (end - start) / buf_size; n_encode_ops++; - vec::unpack( - output_mem_T, output_mem_char, output_len, pkt_size, word_size); - for (unsigned i = 0; i < n_outputs; i++) { write_pkt( - output_mem_char.at(i), *(output_parities_bufs[i]), read_bytes); + reinterpret_cast(output_mem.at(i)), + output_parities_bufs[i], + read_bytes); } offset += pkt_size; } @@ -749,6 +751,11 @@ void FecCode::decode( // Lagrange interpolation decode_apply(context, output, words); + + if (type == FecType::SYSTEMATIC) { + this->fft->fft(*vec_dec_inter_codeword, output); + output.copy(vec_dec_inter_codeword.get(), n_data); + } } /* Initialize context for decoding @@ -804,7 +811,13 @@ void FecCode::decode_prepare( { const vec::Vector& fragments_ids = context.get_fragments_id(); for (unsigned i = 0; i < this->n_data; ++i) { - const int j = 
fragments_ids.get(i); + unsigned j = fragments_ids.get(i); + if (type == FecType::SYSTEMATIC && j < this->n_data) { + continue; + } + if (type == FecType::SYSTEMATIC) { + j -= this->n_data; + } if (props[j].is_marked(context.props_indices[j], offset)) { // Check if the symbol is a special case whick is marked by // `OOR_MARK`, i.e. true. Note: this check is necessary when @@ -957,21 +970,15 @@ bool FecCode::decode_streams_vertical( decode_build(); - // vector of buffers storing data read from chunk - vec::Buffers words_char(n_data, buf_size); - const std::vector words_mem_char = words_char.get_mem(); // vector of buffers storing data that are performed in encoding, i.e. FFT - vec::Buffers words(n_data, pkt_size); - const std::vector words_mem_T = words.get_mem(); + vec::Buffers words(n_data, pkt_size, use_meta_buf); + const std::vector& words_mem = words.get_mem(); int output_len = n_data; // vector of buffers storing data that are performed in decoding, i.e. FFT - vec::Buffers output(output_len, pkt_size); - const std::vector output_mem_T = output.get_mem(); - // vector of buffers storing data in output chunk - vec::Buffers output_char(output_len, buf_size); - const std::vector output_mem_char = output_char.get_mem(); + vec::Buffers output(output_len, pkt_size, use_meta_buf); + const std::vector& output_mem = output.get_mem(); std::unique_ptr> context = init_context_dec( fragments_ids, input_parities_props, pkt_size, &output); @@ -987,11 +994,12 @@ bool FecCode::decode_streams_vertical( for (unsigned i = 0; i < avail_data_nb; i++) { unsigned data_idx = fragments_ids.get(i); if (!read_pkt( - words_mem_char.at(i), *(input_data_bufs[data_idx]))) { + reinterpret_cast(words_mem.at(i)), + input_data_bufs[data_idx])) { read_bytes = input_data_bufs[data_idx]->gcount(); // Zero-out trailing part std::fill_n( - words_mem_char.at(i) + read_bytes, + reinterpret_cast(words_mem.at(i)) + read_bytes, buf_size - read_bytes, 0); @@ -1002,12 +1010,13 @@ bool 
FecCode::decode_streams_vertical( for (unsigned i = 0; i < n_data - avail_data_nb; ++i) { unsigned parity_idx = avail_parity_ids.get(i); if (!read_pkt( - words_mem_char.at(avail_data_nb + i), - *(input_parities_bufs[parity_idx]))) { + reinterpret_cast(words_mem.at(avail_data_nb + i)), + input_parities_bufs[parity_idx])) { read_bytes = input_parities_bufs[parity_idx]->gcount(); // Zero-out trailing part std::fill_n( - words_mem_char.at(avail_data_nb + i) + read_bytes, + reinterpret_cast(words_mem.at(avail_data_nb + i)) + + read_bytes, buf_size - read_bytes, 0); @@ -1019,8 +1028,9 @@ bool FecCode::decode_streams_vertical( break; } - vec::pack( - words_mem_char, words_mem_T, n_data, pkt_size, word_size); + if (use_meta_buf) { + words.reset_meta(); + } timeval t1 = tick(); uint64_t start = hw_timer(); @@ -1032,13 +1042,12 @@ bool FecCode::decode_streams_vertical( total_decode_cycles += (end - start) / word_size; n_decode_ops++; - vec::unpack( - output_mem_T, output_mem_char, output_len, pkt_size, word_size); - for (unsigned i = 0; i < n_data; i++) { if (output_data_bufs[i] != nullptr) { write_pkt( - output_mem_char.at(i), *(output_data_bufs[i]), read_bytes); + reinterpret_cast(output_mem.at(i)), + output_data_bufs[i], + read_bytes); } } offset += pkt_size; @@ -1080,23 +1089,18 @@ void FecCode::encode_blocks_vertical( } size_t offset = 0; - size_t block_size = block_size_bytes / word_size; + const size_t element_size = use_meta_buf ? sizeof(T) : word_size; + size_t block_size = block_size_bytes / element_size; - // vector of buffers storing data read from chunk - vec::Buffers words_char(n_data, buf_size); - const std::vector words_mem_char = words_char.get_mem(); // vector of buffers storing data that are performed in encoding, i.e. 
FFT - vec::Buffers words(n_data, pkt_size); - const std::vector words_mem_T = words.get_mem(); + vec::Buffers words(n_data, pkt_size, use_meta_buf); + const std::vector& words_mem = words.get_mem(); int output_len = get_n_outputs(); // vector of buffers storing data that are performed in encoding, i.e. FFT - vec::Buffers output(output_len, pkt_size); - const std::vector output_mem_T = output.get_mem(); - // vector of buffers storing data in output chunk - vec::Buffers output_char(output_len, buf_size); - const std::vector output_mem_char = output_char.get_mem(); + vec::Buffers output(output_len, pkt_size, use_meta_buf); + const std::vector& output_mem = output.get_mem(); reset_stats_enc(); @@ -1105,25 +1109,26 @@ void FecCode::encode_blocks_vertical( size_t copy_size = std::min(pkt_size, remain_size); for (unsigned i = 0; i < n_data; i++) { memcpy( - reinterpret_cast(words_mem_char.at(i)), - data_bufs[i] + offset * word_size, - copy_size * word_size); + reinterpret_cast(words_mem.at(i)), + data_bufs[i] + offset * element_size, + copy_size * element_size); } // Zero-out trailing part of data if (copy_size < pkt_size) { - const size_t copy_bytes = copy_size * word_size; + const size_t copy_bytes = copy_size * element_size; const size_t trailing_bytes = buf_size - copy_bytes; for (unsigned i = 0; i < n_data; i++) { memset( - reinterpret_cast(words_mem_char.at(i)) + copy_bytes, + reinterpret_cast(words_mem.at(i)) + copy_bytes, 0, trailing_bytes); } } - vec::pack( - words_mem_char, words_mem_T, n_data, pkt_size, word_size); + if (use_meta_buf) { + words.reset_meta(); + } timeval t1 = tick(); uint64_t start = hw_timer(); @@ -1132,18 +1137,15 @@ void FecCode::encode_blocks_vertical( uint64_t t2 = hrtime_usec(t1); total_enc_usec += t2; - total_encode_cycles += (end - start) / (copy_size * word_size); + total_encode_cycles += (end - start) / (copy_size * element_size); n_encode_ops++; - vec::unpack( - output_mem_T, output_mem_char, output_len, pkt_size, word_size); - for 
(unsigned i = 0; i < n_outputs; i++) { if (wanted_idxs[i]) { memcpy( - parities_bufs[i] + offset * word_size, - reinterpret_cast(output_mem_char.at(i)), - copy_size * word_size); + parities_bufs[i] + offset * element_size, + reinterpret_cast(output_mem.at(i)), + copy_size * element_size); } } offset += pkt_size; @@ -1183,7 +1185,8 @@ bool FecCode::decode_blocks_vertical( size_t block_size_bytes) { size_t offset = 0; - size_t block_size = block_size_bytes / word_size; + const size_t element_size = use_meta_buf ? sizeof(T) : word_size; + size_t block_size = block_size_bytes / element_size; unsigned fragment_index = 0; unsigned parity_index = 0; @@ -1237,21 +1240,15 @@ bool FecCode::decode_blocks_vertical( decode_build(); - // vector of buffers storing data read from chunk - vec::Buffers words_char(n_data, buf_size); - const std::vector words_mem_char = words_char.get_mem(); // vector of buffers storing data that are performed in encoding, i.e. FFT - vec::Buffers words(n_data, pkt_size); - const std::vector words_mem_T = words.get_mem(); + vec::Buffers words(n_data, pkt_size, use_meta_buf); + const std::vector& words_mem = words.get_mem(); int output_len = n_data; // vector of buffers storing data that are performed in decoding, i.e. 
FFT - vec::Buffers output(output_len, pkt_size); - const std::vector output_mem_T = output.get_mem(); - // vector of buffers storing data in output chunk - vec::Buffers output_char(output_len, buf_size); - const std::vector output_mem_char = output_char.get_mem(); + vec::Buffers output(output_len, pkt_size, use_meta_buf); + const std::vector& output_mem = output.get_mem(); std::unique_ptr> context = init_context_dec(fragments_ids, parities_props, pkt_size, &output); @@ -1265,33 +1262,34 @@ bool FecCode::decode_blocks_vertical( for (unsigned i = 0; i < avail_data_nb; i++) { unsigned data_idx = fragments_ids.get(i); memcpy( - reinterpret_cast(words_mem_char.at(i)), - data_bufs[data_idx] + offset * word_size, - copy_size * word_size); + reinterpret_cast(words_mem.at(i)), + data_bufs[data_idx] + offset * element_size, + copy_size * element_size); } } for (unsigned i = 0; i < n_data - avail_data_nb; ++i) { unsigned parity_idx = avail_parity_ids.get(i); memcpy( - reinterpret_cast(words_mem_char.at(avail_data_nb + i)), - parities_bufs[parity_idx] + offset * word_size, - copy_size * word_size); + reinterpret_cast(words_mem.at(avail_data_nb + i)), + parities_bufs[parity_idx] + offset * element_size, + copy_size * element_size); } // Zero-out trailing part of data if (copy_size < pkt_size) { - const size_t copy_bytes = copy_size * word_size; + const size_t copy_bytes = copy_size * element_size; const size_t trailing_bytes = buf_size - copy_bytes; for (unsigned i = 0; i < n_data; i++) { memset( - reinterpret_cast(words_mem_char.at(i)) + copy_bytes, + reinterpret_cast(words_mem.at(i)) + copy_bytes, 0, trailing_bytes); } } - vec::pack( - words_mem_char, words_mem_T, n_data, pkt_size, word_size); + if (use_meta_buf) { + words.reset_meta(); + } timeval t1 = tick(); uint64_t start = hw_timer(); @@ -1300,18 +1298,15 @@ bool FecCode::decode_blocks_vertical( uint64_t t2 = hrtime_usec(t1); total_dec_usec += t2; - total_decode_cycles += (end - start) / word_size; + total_decode_cycles 
+= (end - start) / element_size; n_decode_ops++; - vec::unpack( - output_mem_T, output_mem_char, output_len, pkt_size, word_size); - for (unsigned i = 0; i < n_data; i++) { if (wanted_idxs[i]) { memcpy( - data_bufs[i] + offset * word_size, - reinterpret_cast(output_mem_char.at(i)), - copy_size * word_size); + data_bufs[i] + offset * element_size, + reinterpret_cast(output_mem.at(i)), + copy_size * element_size); } } offset += pkt_size; @@ -1388,15 +1383,23 @@ void FecCode::decode_prepare( // As loc.offset := offset + j const size_t j = (loc_offset - offset); - // Check if the symbol is a special case whick is marked by - // `OOR_MARK`. - // Note: this check is necessary when word_size is not large - // enough to cover all symbols of the field. Following check is - // used for FFT over FNT where the single special case symbol - // equals card - 1 - if (props[frag_id].marker(context.props_indices.at(frag_id)) - == OOR_MARK) { - chunk[j] = thres; + if (use_meta_buf) { + T meta = + props[frag_id].marker(context.props_indices.at(frag_id)); + if (meta) { + words.set_meta(i, j, meta); + } + } else { + // Check if the symbol is a special case whick is marked by + // `OOR_MARK`. + // Note: this check is necessary when word_size is not large + // enough to cover all symbols of the field. 
Following check is + // used for FFT over FNT where the single special case symbol + // equals card - 1 + if (props[frag_id].marker(context.props_indices.at(frag_id)) + == OOR_MARK) { + chunk[j] = thres; + } } context.props_indices.at(frag_id)++; } diff --git a/src/fec_context.h b/src/fec_context.h index 0e153267..03c9560c 100644 --- a/src/fec_context.h +++ b/src/fec_context.h @@ -117,11 +117,14 @@ class DecodeContext { } else { assert(output != nullptr); + bool b_meta = output->has_meta(); + // Buffers each of which is fully allocated // Buffer of length `len_2k` - buf1_2k = std::make_unique>(len_2k, size); + buf1_2k = std::make_unique>(len_2k, size, b_meta); // Buffer of length `max_n_2k - k` - bNmK = std::make_unique>(max_n_2k - k, size); + bNmK = + std::make_unique>(max_n_2k - k, size, b_meta); // Buffers that are derived from the two above ones // Buffer sliced from `k` first elements of `buf1_2k` diff --git a/src/fec_rs_fnt.h b/src/fec_rs_fnt.h index 730d08df..0996b39c 100644 --- a/src/fec_rs_fnt.h +++ b/src/fec_rs_fnt.h @@ -60,6 +60,13 @@ class RsFnt : public FecCode { // decoding context used in encoding of systematic FNT std::unique_ptr> enc_context; + // vector for intermediate symbols used for systematic FNT + std::unique_ptr> vec_inter_words; + // vector of `n` symbols used for systematic FNT + std::unique_ptr> vec_inter_codeword; + // decoding context used in horizontal encoding of systematic FNT + std::unique_ptr> vec_enc_context; + // Indices used for accelerated functions size_t simd_vec_len; size_t simd_trailing_len; @@ -72,7 +79,7 @@ class RsFnt : public FecCode { unsigned n_data, unsigned n_parities, size_t pkt_size = 8) - : FecCode(type, word_size, n_data, n_parities, pkt_size) + : FecCode(type, word_size, n_data, n_parities, pkt_size, true) { this->fec_init(); @@ -143,18 +150,29 @@ class RsFnt : public FecCode { enc_frag_ids->set(i, i); } - inter_words = - std::make_unique>(this->n_data, this->pkt_size); + inter_words = std::make_unique>( + 
this->n_data, this->pkt_size, true); suffix_words = std::make_unique>( - this->n - this->n_data - this->n_outputs, this->pkt_size); + this->n - this->n_data - this->n_outputs, this->pkt_size, true); std::vector dummy_props; enc_context = this->init_context_dec( *enc_frag_ids, dummy_props, this->pkt_size, inter_words.get()); // for decoding - this->dec_inter_codeword = - std::make_unique>(this->n, this->pkt_size); + this->dec_inter_codeword = std::make_unique>( + this->n, this->pkt_size, true); + + vec_inter_words = + std::make_unique>(*(this->gf), this->n_data); + vec_inter_codeword = + std::make_unique>(*(this->gf), this->n); + vec_enc_context = + this->init_context_dec(*enc_frag_ids, dummy_props); + + // for decoding + this->vec_dec_inter_codeword = + std::make_unique>(*(this->gf), this->n); } } @@ -177,7 +195,14 @@ class RsFnt : public FecCode { off_t offset, vec::Vector& words) override { - this->fft->fft(output, words); + if (this->type == FecType::SYSTEMATIC) { + this->decode_apply(*vec_enc_context, *vec_inter_words, words); + this->fft->fft(*vec_inter_codeword, *vec_inter_words); + output.copy( + vec_inter_codeword.get(), this->n_parities, 0, this->n_data); + } else { + this->fft->fft(output, words); + } encode_post_process(output, props, offset); } @@ -237,6 +262,7 @@ class RsFnt : public FecCode { { if (this->type == FecType::SYSTEMATIC) { decode_data(*enc_context, *inter_words, words); + // this->decode_apply(*enc_context, *inter_words, words); vec::Buffers _tmp(words, output); vec::Buffers _output(_tmp, *suffix_words); this->fft->fft(_output, *inter_words); @@ -251,14 +277,26 @@ class RsFnt : public FecCode { std::vector& props, off_t offset) override { - // check for out of range value in output unsigned size = output.get_size(); - T thres = (this->gf->card() - 1); - for (unsigned i = 0; i < this->n_outputs; ++i) { - T* chunk = output.get(i); - for (unsigned j = 0; j < size; ++j) { - if (chunk[j] & thres) { - props[i].add(offset + j, OOR_MARK); + if 
(output.has_meta()) { + // check meta of elements + for (unsigned i = 0; i < this->n_outputs; ++i) { + for (unsigned j = 0; j < size; ++j) { + T meta = output.get_meta(i, j); + if (meta) { + props[i].add(offset + j, meta); + } + } + } + } else { + // check for out of range value in output + T thres = (this->gf->card() - 1); + for (unsigned i = 0; i < this->n_outputs; ++i) { + T* chunk = output.get(i); + for (unsigned j = 0; j < size; ++j) { + if (chunk[j] & thres) { + props[i].add(offset + j, OOR_MARK); + } } } } diff --git a/src/fec_rs_nf4.h b/src/fec_rs_nf4.h index 67893405..37217a35 100644 --- a/src/fec_rs_nf4.h +++ b/src/fec_rs_nf4.h @@ -110,6 +110,9 @@ class RsNf4 : public FecCode { for (unsigned i = 0; i < this->n; i++) { this->r_powers->set(i, ngff4->exp(this->r, i)); } + + work_buf = + std::make_unique>(this->n_data, this->pkt_size); } int get_n_outputs() override @@ -168,6 +171,7 @@ class RsNf4 : public FecCode { } private: + std::unique_ptr> work_buf; const gf::Field* sub_field; gf::NF4* ngff4; int gf_n; @@ -258,13 +262,28 @@ class RsNf4 : public FecCode { off_t offset, vec::Buffers& words) override { + // as Buffers has not meta, `words` contains only `buf_size = pkt_size * + // word_size` bytes from source. It is stored in first `buf_size / + // sizeof(T)` elements of `words` + + const unsigned nb_words_per_element = sizeof(T) / this->word_size; for (unsigned i = 0; i < this->n_data; ++i) { T* chunk = words.get(i); - for (size_t j = 0; j < this->pkt_size; ++j) { - chunk[j] = ngff4->pack(chunk[j]); + T* work = work_buf->get(i); + + size_t u = 0; + T element = chunk[u]; + for (size_t j = 0, u = 0; j < this->pkt_size; ++j) { + work[j] = ngff4->pack(element); + + (j + 1) % nb_words_per_element == 0 + ? 
element = chunk[++u] + : element = static_cast(element) + >> (CHAR_BIT * this->word_size); } } - this->fft->fft(output, words); + + this->fft->fft(output, *work_buf); encode_post_process(output, props, offset); } @@ -273,17 +292,42 @@ class RsNf4 : public FecCode { std::vector& props, off_t offset) override { - size_t size = output.get_size(); + // as Buffers has not meta, output write only `buf_size = pkt_size * + // word_size` bytes to destination + // This data should be stored in first `buf_size / sizeof(T)` elements. + + const unsigned nb_words_per_element = sizeof(T) / this->word_size; + const size_t size = output.get_size(); GroupedValues true_val; for (unsigned frag_id = 0; frag_id < this->code_len; ++frag_id) { T* chunk = output.get(frag_id); - for (size_t symb_id = 0; symb_id < size; symb_id++) { + + size_t out_symb_id = 0; + unsigned symb_offset = 0; + T element = 0; + for (size_t symb_id = 0; symb_id < size; ++symb_id) { ngff4->unpack(chunk[symb_id], true_val); if (true_val.flag > 0) { const off_t loc = offset + symb_id; props[frag_id].add(loc, true_val.flag); } - chunk[symb_id] = true_val.values; + + // for test + chunk[symb_id] = 0; + + element |= (static_cast(true_val.values) << symb_offset); + if ((symb_id + 1) % nb_words_per_element == 0) { + chunk[out_symb_id] = element; + out_symb_id++; + symb_offset = 0; + element = 0; + } else { + symb_offset += (CHAR_BIT * this->word_size); + } + } + // for the last no-full element + if (symb_offset > 0) { + chunk[out_symb_id] = element; } } } @@ -295,23 +339,35 @@ class RsNf4 : public FecCode { vec::Buffers& words) override { const vec::Vector& fragments_ids = context.get_fragments_id(); + // as Buffers has not meta, `words` contains only `buf_size = pkt_size * + // word_size` bytes from source. 
It is stored in first `buf_size / + // sizeof(T)` elements of `words` + + const unsigned nb_words_per_element = sizeof(T) / this->word_size; for (unsigned i = 0; i < this->n_data; ++i) { const int frag_id = fragments_ids.get(i); T* chunk = words.get(i); + T* work = work_buf->get(i); - // pack marked symbols - for (size_t index = 0; index < this->pkt_size; ++index) { + size_t u = 0; + T element = chunk[u]; + for (size_t j = 0, u = 0; j < this->pkt_size; ++j) { if (props[frag_id].is_marked( - context.props_indices[frag_id], offset + index)) { + context.props_indices[frag_id], offset + j)) { // pack marked symbol - chunk[index] = ngff4->pack( - chunk[index], + work[j] = ngff4->pack( + element, props[frag_id].marker(context.props_indices[frag_id])); context.props_indices.at(frag_id)++; } else { // pack un-marked symbol - chunk[index] = ngff4->pack(chunk[index]); + work[j] = ngff4->pack(element); } + + (j + 1) % nb_words_per_element == 0 + ? element = chunk[++u] + : element = static_cast(element) + >> (CHAR_BIT * this->word_size); } } } @@ -319,15 +375,42 @@ class RsNf4 : public FecCode { void decode_apply( DecodeContext& context, vec::Buffers& output, - vec::Buffers& words) override + vec::Buffers&) override { + // as Buffers has not meta, output write only `buf_size = pkt_size * + // word_size` bytes to destination + // This data should be stored in first `buf_size / sizeof(T)` elements. 
+ // decode_apply: do the same thing as in fec_base - FecCode::decode_apply(context, output, words); + FecCode::decode_apply(context, output, *work_buf); + + const unsigned nb_words_per_element = sizeof(T) / this->word_size; + GroupedValues true_val; // unpack decoded symbols - for (unsigned i = 0; i < this->n_data; ++i) { - T* chunk = output.get(i); - for (unsigned j = 0; j < this->pkt_size; ++j) { - chunk[j] = ngff4->unpack(chunk[j]).values; + for (unsigned frag_id = 0; frag_id < this->n_data; ++frag_id) { + T* chunk = output.get(frag_id); + + size_t out_symb_id = 0; + unsigned symb_offset = 0; + T element = 0; + for (size_t symb_id = 0; symb_id < this->pkt_size; ++symb_id) { + ngff4->unpack(chunk[symb_id], true_val); + // for test + chunk[symb_id] = 0; + + element |= (static_cast(true_val.values) << symb_offset); + if ((symb_id + 1) % nb_words_per_element == 0) { + chunk[out_symb_id] = element; + out_symb_id++; + symb_offset = 0; + element = 0; + } else { + symb_offset += (CHAR_BIT * this->word_size); + } + } + // for the last no-full element + if (symb_offset > 0) { + chunk[out_symb_id] = element; } } } diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index ed82fab8..bc412fd2 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -50,18 +50,16 @@ void RsFnt::encode_post_process( off_t offset) { size_t size = this->pkt_size; - uint16_t threshold = this->gf->card_minus_one(); unsigned code_len = this->n_outputs; - simd::encode_post_process( - output, props, offset, code_len, threshold, simd_vec_len); + simd::encode_post_process(output, props, offset, code_len, simd_vec_len); if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { - uint16_t* chunk = output.get(i); - for (size_t j = simd_offset; j < size; ++j) { - if (chunk[j] == threshold) { - props[i].add(offset + j, OOR_MARK); + for (unsigned j = simd_offset; j < size; ++j) { + uint16_t meta = output.get_meta(i, j); + if (meta) { + props[i].add(offset + j, 
meta); } } } @@ -75,18 +73,16 @@ void RsFnt::encode_post_process( off_t offset) { const size_t size = this->pkt_size; - const uint32_t threshold = this->gf->card_minus_one(); const unsigned code_len = this->n_outputs; - simd::encode_post_process( - output, props, offset, code_len, threshold, simd_vec_len); + simd::encode_post_process(output, props, offset, code_len, simd_vec_len); if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { - uint32_t* chunk = output.get(i); - for (size_t j = simd_offset; j < size; ++j) { - if (chunk[j] == threshold) { - props[i].add(offset + j, OOR_MARK); + for (unsigned j = simd_offset; j < size; ++j) { + uint32_t meta = output.get_meta(i, j); + if (meta) { + props[i].add(offset + j, meta); } } } diff --git a/src/fft_2n.h b/src/fft_2n.h index 1862362d..ceab08c6 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -465,14 +465,36 @@ void Radix2::butterfly_ct_step_slow( unsigned step, size_t offset) { - for (int i = start; i < this->n; i += step) { - T* a = buf.get(i); - T* b = buf.get(i + m); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = offset; j < this->pkt_size; ++j) { - T x = this->gf->mul(coef, b[j]); - b[j] = this->gf->sub(a[j], x); - a[j] = this->gf->add(a[j], x); + if (buf.has_meta()) { + for (int i = start; i < this->n; i += step) { + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T a_lo = 0, a_hi = 0, b_lo = 0, b_hi = 0; + buf.get(i, j, a_hi, a_lo); + buf.get(i + m, j, b_hi, b_lo); + + T x = this->gf->mul(coef, b_lo); + b_lo = this->gf->sub(a_lo, x); + a_lo = this->gf->add(a_lo, x); + + x = this->gf->mul(coef, b_hi); + b_hi = this->gf->sub(a_hi, x); + a_hi = this->gf->add(a_hi, x); + + buf.set(i, j, a_hi, a_lo); + buf.set(i + m, j, b_hi, b_lo); + } + } + } else { + for (int i = start; i < this->n; i += step) { + T* a = buf.get(i); + T* b = buf.get(i + m); + // perform butterfly operation for Cooley-Tukey FFT 
algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T x = this->gf->mul(coef, b[j]); + b[j] = this->gf->sub(a[j], x); + a[j] = this->gf->add(a[j], x); + } } } } @@ -564,14 +586,36 @@ void Radix2::butterfly_gs_step_slow( unsigned step, size_t offset) { - for (int i = start; i < this->n; i += step) { - T* a = buf.get(i); - T* b = buf.get(i + m); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = offset; j < this->pkt_size; ++j) { - T x = this->gf->sub(a[j], b[j]); - a[j] = this->gf->add(a[j], b[j]); - b[j] = this->gf->mul(coef, x); + if (buf.has_meta()) { + for (int i = start; i < this->n; i += step) { + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T a_lo = 0, a_hi = 0, b_lo = 0, b_hi = 0; + buf.get(i, j, a_hi, a_lo); + buf.get(i + m, j, b_hi, b_lo); + + T x = this->gf->sub(a_lo, b_lo); + a_lo = this->gf->add(a_lo, b_lo); + b_lo = this->gf->mul(coef, x); + + x = this->gf->sub(a_hi, b_hi); + a_hi = this->gf->add(a_hi, b_hi); + b_hi = this->gf->mul(coef, x); + + buf.set(i, j, a_hi, a_lo); + buf.set(i + m, j, b_hi, b_lo); + } + } + } else { + for (int i = start; i < this->n; i += step) { + T* a = buf.get(i); + T* b = buf.get(i + m); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T x = this->gf->sub(a[j], b[j]); + a[j] = this->gf->add(a[j], b[j]); + b[j] = this->gf->mul(coef, x); + } } } } @@ -585,12 +629,28 @@ void Radix2::butterfly_gs_step_simple_slow( unsigned step, size_t offset) { - for (int i = start; i < this->n; i += step) { - T* a = buf.get(i); - T* b = buf.get(i + m); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = offset; j < this->pkt_size; ++j) { - b[j] = this->gf->mul(coef, a[j]); + if (buf.has_meta()) { + for (int i = start; i < this->n; i += step) { + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; 
j < this->pkt_size; ++j) { + T a_lo = 0, a_hi = 0, b_lo = 0, b_hi = 0; + buf.get(i, j, a_hi, a_lo); + buf.get(i + m, j, b_hi, b_lo); + + b_lo = this->gf->mul(coef, a_lo); + b_hi = this->gf->mul(coef, a_hi); + + buf.set(i + m, j, b_hi, b_lo); + } + } + } else { + for (int i = start; i < this->n; i += step) { + T* a = buf.get(i); + T* b = buf.get(i + m); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + b[j] = this->gf->mul(coef, a[j]); + } } } } diff --git a/src/fft_naive.h b/src/fft_naive.h index cf312f21..16c12dc0 100644 --- a/src/fft_naive.h +++ b/src/fft_naive.h @@ -148,16 +148,35 @@ void Naive::_fft( vec::Buffers& input, vec::Matrix* _W) { + assert(output.has_meta() == input.has_meta()); const unsigned len = this->n; const unsigned size = this->pkt_size; - for (unsigned i = 0; i < len; ++i) { - output.fill(i, 0); - T* buf = output.get(i); - for (unsigned j = 0; j < len; ++j) { - T* ibuf = input.get(j); - T r = _W->get(i, j); - for (unsigned u = 0; u < size; ++u) { - buf[u] = this->gf->add(buf[u], this->gf->mul(r, ibuf[u])); + + if (output.has_meta()) { + for (unsigned i = 0; i < len; ++i) { + output.fill(i, 0); + for (unsigned j = 0; j < len; ++j) { + T r = _W->get(i, j); + for (unsigned u = 0; u < size; ++u) { + T o_hi = 0, o_lo = 0, i_hi = 0, i_lo = 0; + output.get(i, u, o_hi, o_lo); + input.get(j, u, i_hi, i_lo); + o_hi = this->gf->add(o_hi, this->gf->mul(r, i_hi)); + o_lo = this->gf->add(o_lo, this->gf->mul(r, i_lo)); + output.set(i, u, o_hi, o_lo); + } + } + } + } else { + for (unsigned i = 0; i < len; ++i) { + output.fill(i, 0); + T* buf = output.get(i); + for (unsigned j = 0; j < len; ++j) { + T* ibuf = input.get(j); + T r = _W->get(i, j); + for (unsigned u = 0; u < size; ++u) { + buf[u] = this->gf->add(buf[u], this->gf->mul(r, ibuf[u])); + } } } } diff --git a/src/gf_ring.cpp b/src/gf_ring.cpp index da1ed530..0a340a60 100644 --- a/src/gf_ring.cpp +++ b/src/gf_ring.cpp @@ -38,83 +38,36 
@@ namespace quadiron { namespace gf { template <> -void RingModN::neg(size_t n, uint16_t* x) const +void RingModN::neg(vec::Buffers& buf, size_t buf_id) const { - simd::neg(n, x, this->_card); + simd::neg(*this, buf, buf_id); } template <> -void RingModN::neg(size_t n, uint32_t* x) const +void RingModN::neg(vec::Buffers& buf, size_t buf_id) const { - simd::neg(n, x, this->_card); -} - -template <> -void RingModN::mul_coef_to_buf( - uint32_t a, - uint32_t* src, - uint32_t* dest, - size_t len) const -{ - simd::mul_coef_to_buf(a, src, dest, len, this->_card); -} - -template <> -void RingModN::add_two_bufs(uint32_t* src, uint32_t* dest, size_t len) - const -{ - simd::add_two_bufs(src, dest, len, this->_card); -} - -template <> -void RingModN::sub_two_bufs( - uint32_t* bufa, - uint32_t* bufb, - uint32_t* res, - size_t len) const -{ - simd::sub_two_bufs(bufa, bufb, res, len, this->_card); + simd::neg(*this, buf, buf_id); } +// @note We specialize the function for the case Buffers having meta template <> void RingModN::mul_coef_to_buf( - uint16_t a, - uint16_t* src, - uint16_t* dest, - size_t len) const + uint16_t coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const { - simd::mul_coef_to_buf(a, src, dest, len, this->_card); + simd::mul_coef_to_buf(*this, coef, src, dest, buf_id); } template <> -void RingModN::add_two_bufs(uint16_t* src, uint16_t* dest, size_t len) - const -{ - simd::add_two_bufs(src, dest, len, this->_card); -} - -template <> -void RingModN::sub_two_bufs( - uint16_t* bufa, - uint16_t* bufb, - uint16_t* res, - size_t len) const -{ - simd::sub_two_bufs(bufa, bufb, res, len, this->_card); -} - -template <> -void RingModN::hadamard_mul(int n, uint16_t* x_u16, uint16_t* y_u16) - const -{ - simd::mul_two_bufs(y_u16, x_u16, n, this->_card); -} - -template <> -void RingModN::hadamard_mul(int n, uint32_t* x_u32, uint32_t* y_u32) - const +void RingModN::mul_coef_to_buf( + uint32_t coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) 
const { - simd::mul_two_bufs(y_u32, x_u32, n, this->_card); + simd::mul_coef_to_buf(*this, coef, src, dest, buf_id); } } // namespace gf diff --git a/src/gf_ring.h b/src/gf_ring.h index 523d7dd6..31f9dad7 100644 --- a/src/gf_ring.h +++ b/src/gf_ring.h @@ -95,7 +95,11 @@ class RingModN { T exp_quick(T base, T exponent) const; T log_naive(T base, T exponent) const; virtual T replicate(T a) const; - virtual void mul_coef_to_buf(T a, T* src, T* dest, size_t len) const; + virtual void mul_coef_to_buf( + T coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const; virtual void mul_vec_to_vecp( vec::Vector& u, vec::Buffers& src, @@ -127,8 +131,8 @@ class RingModN { T get_code_len(T n) const; T get_code_len_high_compo(T n) const; virtual void hadamard_mul(int n, T* x, T* y) const; - virtual void neg(size_t n, T* x) const; - virtual void neg(vec::Buffers& buf) const; + void neg(vec::Buffers& buf, size_t buf_id) const; + void neg(vec::Buffers& buf) const; RingModN(RingModN&&) = default; @@ -359,15 +363,30 @@ inline T RingModN::replicate(T a) const return a; } -// For each i, dest[i] = a * src[i] template -inline void RingModN::mul_coef_to_buf(T a, T* src, T* dest, size_t len) const +inline void RingModN::mul_coef_to_buf( + T coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const { - size_t i; - DoubleSizeVal coef = DoubleSizeVal(a); - for (i = 0; i < len; i++) { - // perform multiplication - dest[i] = mul(coef, src[i]); + assert(buf_id >= 0 && buf_id < static_cast(src.get_n())); + + size_t len = src.get_size(); + if (src.has_meta()) { + for (size_t i = 0; i < len; i++) { + T lo = 0, hi = 0; + src.get(buf_id, i, hi, lo); + dest.set(buf_id, i, mul(coef, hi), mul(coef, lo)); + } + } else { + T* src_mem = src.get(buf_id); + T* dest_mem = dest.get(buf_id); + DoubleSizeVal d_coef = DoubleSizeVal(coef); + + for (size_t i = 0; i < len; i++) { + dest_mem[i] = mul(d_coef, src_mem[i]); + } } } @@ -378,24 +397,28 @@ inline void RingModN::mul_vec_to_vecp( 
vec::Buffers& dest) const { assert(u.get_n() == src.get_n()); - int i; - int n = u.get_n(); - size_t len = src.get_size(); - T h = this->card_minus_one(); - const std::vector& src_mem = src.get_mem(); - const std::vector& dest_mem = dest.get_mem(); + + const size_t n = src.get_n(); + const size_t h = this->card_minus_one(); + const bool same_obj = std::addressof(src) == std::addressof(dest); T* coef_vec = u.get_mem(); - for (i = 0; i < n; i++) { - T coef = coef_vec[i]; - if (coef > 1 && coef < h) { - this->mul_coef_to_buf(coef, src_mem[i], dest_mem[i], len); - } else if (coef == 1) { - dest.copy(src, i, i); - } else if (coef == 0) { + + for (size_t i = 0; i < n; ++i) { + const T coef = coef_vec[i]; + if (coef == 0) { dest.fill(i, 0); - } else if (coef == h) { + } else if (coef == 1) { + if (same_obj) { + continue; + } dest.copy(src, i, i); - this->neg(len, dest_mem[i]); + } else if (coef < h) { + this->mul_coef_to_buf(coef, src, dest, i); + } else { // coef == card - 1 + if (!same_obj) { + dest.copy(src, i, i); + } + this->neg(dest, i); } } } @@ -830,11 +853,21 @@ inline void RingModN::hadamard_mul(int n, T* x, T* y) const } template -inline void RingModN::neg(size_t n, T* x) const +inline void RingModN::neg(vec::Buffers& buf, size_t buf_id) const { - // add y to the first half of `x` - for (size_t i = 0; i < n; i++) { - x[i] = sub(0, x[i]); + size_t size = buf.get_size(); + if (buf.has_meta()) { + for (size_t i = 0; i < size; ++i) { + T hi = 0, lo = 0; + buf.get(buf_id, i, hi, lo); + buf.set(buf_id, i, neg(hi), neg(lo)); + } + } else { + T* x = buf.get(buf_id); + // add y to the first half of `x` + for (size_t i = 0; i < size; ++i) { + x[i] = sub(0, x[i]); + } } } @@ -842,8 +875,18 @@ template inline void RingModN::neg(vec::Buffers& buf) const { size_t size = buf.get_size(); - for (int i = 0; i < buf.get_n(); i++) { - neg(size, buf.get(i)); + if (buf.has_meta()) { + for (int i = 0; i < buf.get_n(); i++) { + for (size_t j = 0; j < size; ++j) { + T hi = 0, lo = 0; + 
buf.get(i, j, hi, lo); + buf.set(i, j, neg(hi), neg(lo)); + } + } + } else { + for (int i = 0; i < buf.get_n(); i++) { + neg(buf, i); + } } } @@ -851,56 +894,23 @@ inline void RingModN::neg(vec::Buffers& buf) const /* Operations are vectorized by SIMD */ template <> -void RingModN::neg(size_t n, uint16_t* x) const; - +void RingModN::neg(vec::Buffers& buf, size_t buf_id) const; template <> -void RingModN::neg(size_t n, uint32_t* x) const; +void RingModN::neg(vec::Buffers& buf, size_t buf_id) const; template <> void RingModN::mul_coef_to_buf( - uint16_t a, - uint16_t* src, - uint16_t* dest, - size_t len) const; + uint16_t coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const; template <> void RingModN::mul_coef_to_buf( - uint32_t a, - uint32_t* src, - uint32_t* dest, - size_t len) const; - -template <> -void RingModN::add_two_bufs(uint16_t* src, uint16_t* dest, size_t len) - const; - -template <> -void RingModN::add_two_bufs(uint32_t* src, uint32_t* dest, size_t len) - const; - -template <> -void RingModN::sub_two_bufs( - uint16_t* bufa, - uint16_t* bufb, - uint16_t* res, - size_t len) const; - -template <> -void RingModN::sub_two_bufs( - uint32_t* bufa, - uint32_t* bufb, - uint32_t* res, - size_t len) const; - -template <> -void RingModN::hadamard_mul(int n, uint16_t* x, uint16_t* y) const; -template <> -void RingModN::hadamard_mul(int n, uint32_t* x, uint32_t* y) const; -// template <> -// void RingModN::hadamard_mul(int n, uint64_t* x, uint64_t* y) const; -// template <> -// void RingModN<__uint128_t>::hadamard_mul(int n, __uint128_t* x, __uint128_t* -// y) const; + uint32_t coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const; #endif // #ifdef QUADIRON_USE_SIMD diff --git a/src/property.h b/src/property.h index 2fc6a564..c83f93b8 100644 --- a/src/property.h +++ b/src/property.h @@ -110,6 +110,7 @@ class Properties { unsigned i = 2; for (auto const& item : props) { dwords[i++] = htonl(narrow_cast(item.first)); + dwords[i++] = 
htonl(narrow_cast(item.second)); } dwords[1] = htonl(i - 2); std::fill(dwords + i, dwords + n_dwords - 1, htonl(0)); @@ -135,8 +136,9 @@ class Properties { if ((2 + _n_dwords) > n_dwords) { return -1; } - for (unsigned i = 0; i < _n_dwords; i++) { - add(static_cast(ntohl(dwords[i + 2])), OOR_MARK); + for (unsigned i = 0; i < _n_dwords; i += 2) { + add(static_cast(ntohl(dwords[i + 2])), + ntohl(dwords[i + 3])); } return 0; } diff --git a/src/simd/definitions.h b/src/simd/definitions.h index 50617a38..304bde9f 100644 --- a/src/simd/definitions.h +++ b/src/simd/definitions.h @@ -76,6 +76,7 @@ enum class InstructionSet { using RegisterType = __m256; using MaskType = __m256i; +using MetaType = uint32_t; static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::AVX; @@ -86,6 +87,7 @@ static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::AVX; using RegisterType = __m128; using MaskType = __m128i; +using MetaType = uint16_t; static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::SSE; @@ -98,9 +100,13 @@ static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::SSE; #if __LP64__ == 1 using RegisterType = uint64_t; using MaskType = uint64_t; +using MetaType = uint8_t; + #else using RegisterType = uint32_t; using MaskType = uint32_t; +using MetaType = uint8_t; + #endif static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::NONE; diff --git a/src/simd_128.h b/src/simd_128.h index 856588ec..399b1e6d 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -43,26 +43,23 @@ typedef __m128i VecType; // @note: using const leads to an lint error of initialization of 'variable' // with static storage duration may throw an exception that cannot be caught -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F4_U32 = _mm_set1_epi32(65537); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F4_MINUS_ONE_U32 = _mm_set1_epi32(65536); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_U32 = _mm_set1_epi32(257); -// NOLINTNEXTLINE(cert-err58-cpp) 
-const VecType F3_MINUS_ONE_U32 = _mm_set1_epi32(256); - -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_U16 = _mm_set1_epi16(257); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_MINUS_ONE_U16 = _mm_set1_epi16(256); +template +inline VecType one(); +template <> +inline VecType one() +{ + return _mm_set1_epi16(1); +} +template <> +inline VecType one() +{ + return _mm_set1_epi32(1); +} -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ZERO = _mm_setzero_si128(); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ONE_U16 = _mm_set1_epi16(1); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ONE_U32 = _mm_set1_epi32(1); +inline VecType zero() +{ + return _mm_setzero_si128(); +} // NOLINTNEXTLINE(cert-err58-cpp) const VecType MASK8_LO = _mm_set1_epi16(0x80); @@ -78,25 +75,25 @@ inline void store_to_mem(VecType* address, VecType reg) _mm_store_si128(address, reg); } -inline VecType bit_and(VecType x, VecType y) +inline VecType bit_and(const VecType& x, const VecType& y) { return _mm_and_si128(x, y); } -inline VecType bit_xor(VecType x, VecType y) +inline VecType bit_xor(const VecType& x, const VecType& y) { return _mm_xor_si128(x, y); } -inline uint16_t msb8_mask(VecType x) +inline uint16_t msb8_mask(const VecType& x) { return _mm_movemask_epi8(x); } -inline bool and_is_zero(VecType x, VecType y) +inline bool and_is_zero(const VecType& x, const VecType& y) { return _mm_testz_si128(x, y); } -inline bool is_zero(VecType x) +inline bool is_zero(const VecType& x) { - return _mm_testc_si128(ZERO, x); + return _mm_testc_si128(zero(), x); } #define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) @@ -119,70 +116,165 @@ inline VecType set_one(uint16_t val) } template -inline VecType add(VecType x, VecType y); +inline VecType add(const VecType& x, const VecType& y); template <> -inline VecType add(VecType x, VecType y) +inline VecType add(const VecType& x, const VecType& y) { return _mm_add_epi32(x, y); } template <> -inline VecType add(VecType x, VecType y) +inline VecType 
add(const VecType& x, const VecType& y) { return _mm_add_epi16(x, y); } template -inline VecType sub(VecType x, VecType y); +inline VecType sub(const VecType& x, const VecType& y); template <> -inline VecType sub(VecType x, VecType y) +inline VecType sub(const VecType& x, const VecType& y) { return _mm_sub_epi32(x, y); } template <> -inline VecType sub(VecType x, VecType y) +inline VecType sub(const VecType& x, const VecType& y) { return _mm_sub_epi16(x, y); } template -inline VecType mul(VecType x, VecType y); +inline VecType mul(const VecType& x, const VecType& y); template <> -inline VecType mul(VecType x, VecType y) +inline VecType mul(const VecType& x, const VecType& y) { return _mm_mullo_epi32(x, y); } template <> -inline VecType mul(VecType x, VecType y) +inline VecType mul(const VecType& x, const VecType& y) { return _mm_mullo_epi16(x, y); } template -inline VecType compare_eq(VecType x, VecType y); +inline VecType compare_eq(const VecType& x, const VecType& y); template <> -inline VecType compare_eq(VecType x, VecType y) +inline VecType compare_eq(const VecType& x, const VecType& y) { return _mm_cmpeq_epi32(x, y); } template <> -inline VecType compare_eq(VecType x, VecType y) +inline VecType compare_eq(const VecType& x, const VecType& y) { return _mm_cmpeq_epi16(x, y); } template -inline VecType min(VecType x, VecType y); +inline VecType min(const VecType& x, const VecType& y); template <> -inline VecType min(VecType x, VecType y) +inline VecType min(const VecType& x, const VecType& y) { return _mm_min_epu32(x, y); } template <> -inline VecType min(VecType x, VecType y) +inline VecType min(const VecType& x, const VecType& y) { return _mm_min_epu16(x, y); } +template +void show(simd::VecType val) +{ + const size_t n = simd::countof(); + T buffer[n]; + simd::store_to_mem(reinterpret_cast(buffer), val); + for (unsigned i = 0; i < n; i++) { + std::cout << unsigned(buffer[i]) << " "; + } + std::cout << "\n"; +} + +// NOLINTNEXTLINE(cert-err58-cpp) +const 
VecType mask1 = + _mm_set_epi16(0x0101, 0x0101, 0x0101, 0x0101, 0x0, 0x0, 0x0, 0x0); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask2 = _mm_set_epi16( + static_cast(0x8040), + 0x2010, + 0x0804, + 0x0201, + static_cast(0x8040), + 0x2010, + 0x0804, + 0x0201); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask3 = _mm_set1_epi8(1); + +inline VecType from_msb8_mask(MetaType m) +{ + // set `m` to all elements + VecType x = _mm_set1_epi16(m); + // re-arrange byte-by-byte + x = _mm_shuffle_epi8(x, mask1); + // get appropriated bit per byte + x = _mm_and_si128(x, mask2); + // generate mask + x = _mm_cmpeq_epi8(x, mask2); + // result + x = _mm_and_si128(x, mask3); + + return x; +} + +template +inline void unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo); +template <> +inline void +unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo) +{ + VecType m_mask = m ? from_msb8_mask(m) : zero(); + lo = _mm_unpacklo_epi16(x, m_mask); + hi = _mm_unpackhi_epi16(x, m_mask); +} +template <> +inline void +unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo) +{ + VecType m_mask = m ? 
from_msb8_mask(m) : zero(); + lo = _mm_unpacklo_epi8(x, m_mask); + hi = _mm_unpackhi_epi8(x, m_mask); +} + +template +inline void pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m); +template <> +inline void +pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m) +{ + VecType hi_data = BLEND16(zero(), hi, 0x55); + VecType hi_meta = BLEND16(zero(), SHIFTR(hi, 2), 0x55); + VecType lo_data = BLEND16(zero(), lo, 0x55); + VecType lo_meta = BLEND16(zero(), SHIFTR(lo, 2), 0x55); + + x = _mm_packus_epi32(lo_data, hi_data); + VecType meta_x = _mm_packus_epi32(lo_meta, hi_meta); + meta_x = _mm_cmpgt_epi8(meta_x, zero()); + m = _mm_movemask_epi8(meta_x); +} +template <> +inline void +pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m) +{ + VecType hi_data = BLEND8(zero(), hi, MASK8_LO); + VecType hi_meta = BLEND8(hi, zero(), MASK8_LO); + VecType lo_data = BLEND8(zero(), lo, MASK8_LO); + VecType lo_meta = BLEND8(lo, zero(), MASK8_LO); + + x = _mm_packus_epi16(lo_data, hi_data); + VecType meta_x = _mm_packus_epi16(lo_meta, hi_meta); + m = _mm_movemask_epi8(meta_x); +} + } // namespace simd } // namespace quadiron diff --git a/src/simd_256.h b/src/simd_256.h index 8084d95e..299519a2 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -51,31 +51,40 @@ namespace simd { typedef __m256i VecType; typedef __m128i HalfVecType; +template +void show(simd::VecType val) +{ + const size_t n = simd::countof(); + T buffer[n]; + _mm256_storeu_si256(reinterpret_cast(buffer), val); + for (unsigned i = 0; i < n; i++) { + std::cout << unsigned(buffer[i]) << " "; + } + std::cout << "\n"; +} + /* ============= Constant variable ============ */ // @note: using const leads to an lint error of initialization of 'variable' // with static storage duration may throw an exception that cannot be caught -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F4_U32 = _mm256_set1_epi32(65537); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F4_MINUS_ONE_U32 = 
_mm256_set1_epi32(65536); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_U32 = _mm256_set1_epi32(257); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_MINUS_ONE_U32 = _mm256_set1_epi32(256); - -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_U16 = _mm256_set1_epi16(257); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_MINUS_ONE_U16 = _mm256_set1_epi16(256); +template +inline VecType one(); +template <> +inline VecType one() +{ + return _mm256_set1_epi16(1); +} +template <> +inline VecType one() +{ + return _mm256_set1_epi32(1); +} -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ZERO = _mm256_setzero_si256(); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ONE_U16 = _mm256_set1_epi16(1); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ONE_U32 = _mm256_set1_epi32(1); +inline VecType zero() +{ + return _mm256_setzero_si256(); +} // NOLINTNEXTLINE(cert-err58-cpp) const VecType MASK8_LO = _mm256_set1_epi16(0x80); @@ -86,30 +95,30 @@ inline VecType load_to_reg(VecType* address) { return _mm256_load_si256(address); } -inline void store_to_mem(VecType* address, VecType reg) +inline void store_to_mem(VecType* address, const VecType& reg) { _mm256_store_si256(address, reg); } -inline VecType bit_and(VecType x, VecType y) +inline VecType bit_and(const VecType& x, const VecType& y) { return _mm256_and_si256(x, y); } -inline VecType bit_xor(VecType x, VecType y) +inline VecType bit_xor(const VecType& x, const VecType& y) { return _mm256_xor_si256(x, y); } -inline uint32_t msb8_mask(VecType x) +inline uint32_t msb8_mask(const VecType& x) { return _mm256_movemask_epi8(x); } -inline bool and_is_zero(VecType x, VecType y) +inline bool and_is_zero(const VecType& x, const VecType& y) { return _mm256_testz_si256(x, y); } -inline bool is_zero(VecType x) +inline bool is_zero(const VecType& x) { - return _mm256_testc_si256(ZERO, x); + return _mm256_testc_si256(zero(), x); } #define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8)) @@ -132,70 +141,160 @@ inline 
VecType set_one(uint16_t val) } template -inline VecType add(VecType x, VecType y); +inline VecType add(const VecType& x, const VecType& y); template <> -inline VecType add(VecType x, VecType y) +inline VecType add(const VecType& x, const VecType& y) { return _mm256_add_epi32(x, y); } template <> -inline VecType add(VecType x, VecType y) +inline VecType add(const VecType& x, const VecType& y) { return _mm256_add_epi16(x, y); } template -inline VecType sub(VecType x, VecType y); +inline VecType sub(const VecType& x, const VecType& y); template <> -inline VecType sub(VecType x, VecType y) +inline VecType sub(const VecType& x, const VecType& y) { return _mm256_sub_epi32(x, y); } template <> -inline VecType sub(VecType x, VecType y) +inline VecType sub(const VecType& x, const VecType& y) { return _mm256_sub_epi16(x, y); } template -inline VecType mul(VecType x, VecType y); +inline VecType mul(const VecType& x, const VecType& y); template <> -inline VecType mul(VecType x, VecType y) +inline VecType mul(const VecType& x, const VecType& y) { return _mm256_mullo_epi32(x, y); } template <> -inline VecType mul(VecType x, VecType y) +inline VecType mul(const VecType& x, const VecType& y) { return _mm256_mullo_epi16(x, y); } template -inline VecType compare_eq(VecType x, VecType y); +inline VecType compare_eq(const VecType& x, const VecType& y); template <> -inline VecType compare_eq(VecType x, VecType y) +inline VecType compare_eq(const VecType& x, const VecType& y) { return _mm256_cmpeq_epi32(x, y); } template <> -inline VecType compare_eq(VecType x, VecType y) +inline VecType compare_eq(const VecType& x, const VecType& y) { return _mm256_cmpeq_epi16(x, y); } template -inline VecType min(VecType x, VecType y); +inline VecType min(const VecType& x, const VecType& y); template <> -inline VecType min(VecType x, VecType y) +inline VecType min(const VecType& x, const VecType& y) { return _mm256_min_epu32(x, y); } template <> -inline VecType min(VecType x, VecType y) +inline 
VecType min(const VecType& x, const VecType& y) { return _mm256_min_epu16(x, y); } +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask1 = _mm256_set_epi32( + 0x03030303, + 0x03030303, + 0x02020202, + 0x02020202, + 0x01010101, + 0x01010101, + 0x0, + 0x0); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask2 = _mm256_set_epi32( + 0x80402010, + 0x08040201, + 0x80402010, + 0x08040201, + 0x80402010, + 0x08040201, + 0x80402010, + 0x08040201); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask3 = _mm256_set1_epi8(1); + +inline VecType from_msb8_mask(MetaType m) +{ + // set `m` to all elements + VecType x = _mm256_set1_epi32(m); + // re-arrange byte-by-byte + x = _mm256_shuffle_epi8(x, mask1); + // get appropriated bit per byte + x = _mm256_and_si256(x, mask2); + // generate mask + x = _mm256_cmpeq_epi8(x, mask2); + // result + x = _mm256_and_si256(x, mask3); + + return x; +} + +template +inline void unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo); +template <> +inline void +unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo) +{ + VecType m_mask = m ? from_msb8_mask(m) : zero(); + lo = _mm256_unpacklo_epi16(x, m_mask); + hi = _mm256_unpackhi_epi16(x, m_mask); +} +template <> +inline void +unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo) +{ + VecType m_mask = m ? 
from_msb8_mask(m) : zero(); + lo = _mm256_unpacklo_epi8(x, m_mask); + hi = _mm256_unpackhi_epi8(x, m_mask); +} + +template +inline void pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m); +template <> +inline void +pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m) +{ + VecType hi_data = BLEND16(zero(), hi, 0x55); + VecType hi_meta = BLEND16(zero(), SHIFTR(hi, 2), 0x55); + VecType lo_data = BLEND16(zero(), lo, 0x55); + VecType lo_meta = BLEND16(zero(), SHIFTR(lo, 2), 0x55); + + x = _mm256_packus_epi32(lo_data, hi_data); + VecType meta_x = _mm256_packus_epi32(lo_meta, hi_meta); + meta_x = _mm256_cmpgt_epi8(meta_x, zero()); + m = _mm256_movemask_epi8(meta_x); +} +template <> +inline void +pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m) +{ + VecType hi_data = BLEND8(zero(), hi, MASK8_LO); + VecType hi_meta = BLEND8(hi, zero(), MASK8_LO); + VecType lo_data = BLEND8(zero(), lo, MASK8_LO); + VecType lo_meta = BLEND8(lo, zero(), MASK8_LO); + + x = _mm256_packus_epi16(lo_data, hi_data); + VecType meta_x = _mm256_packus_epi16(lo_meta, hi_meta); + m = _mm256_movemask_epi8(meta_x); +} + } // namespace simd } // namespace quadiron diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 59e52410..64273223 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -33,46 +33,61 @@ #include +#include "vec_buffers.h" + namespace quadiron { namespace simd { template -inline VecType card(T q); +inline VecType card(); template <> -inline VecType card(uint16_t) +inline VecType card() { - return F3_U16; + return set_one(257); } template <> -inline VecType card(uint32_t q) +inline VecType card() { - return (q == F3) ? 
F3_U32 : F4_U32; + return set_one(65537); } template -inline VecType card_minus_one(T q); +inline VecType card_minus_one(); template <> -inline VecType card_minus_one(uint16_t) +inline VecType card_minus_one() { - return F3_MINUS_ONE_U16; + return set_one(256); } template <> -inline VecType card_minus_one(uint32_t q) +inline VecType card_minus_one() { - return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32; + return set_one(65536); } template -inline VecType get_low_half(VecType x, T q) +inline VecType get_low_half(const VecType& x); +template <> +inline VecType get_low_half(const VecType& x) +{ + return BLEND8(zero(), x, MASK8_LO); +} +template <> +inline VecType get_low_half(const VecType& x) { - return (q == F3) ? BLEND8(ZERO, x, MASK8_LO) : BLEND16(ZERO, x, 0x55); + return BLEND16(zero(), x, 0x55); } template -inline VecType get_high_half(VecType x, T q) +inline VecType get_high_half(const VecType& x); +template <> +inline VecType get_high_half(const VecType& x) +{ + return BLEND8(zero(), SHIFTR(x, 1), MASK8_LO); +} +template <> +inline VecType get_high_half(const VecType& x) { - return (q == F3) ? 
BLEND8(ZERO, SHIFTR(x, 1), MASK8_LO) - : BLEND16(ZERO, SHIFTR(x, 2), 0x55); + return BLEND16(zero(), SHIFTR(x, 2), 0x55); } /* ================= Basic Operations ================= */ @@ -82,14 +97,13 @@ inline VecType get_high_half(VecType x, T q) * * @param x input register * @param y input register - * @param q modulo * @return (x + y) mod q */ template -inline VecType mod_add(VecType x, VecType y, T q) +inline VecType mod_add(const VecType& x, const VecType& y) { const VecType res = add(x, y); - return min(res, sub(res, card(q))); + return min(res, sub(res, card())); } /** @@ -97,28 +111,26 @@ inline VecType mod_add(VecType x, VecType y, T q) * * @param x input register * @param y input register - * @param q modulo * @return (x - y) mod q */ template -inline VecType mod_sub(VecType x, VecType y, T q) +inline VecType mod_sub(const VecType& x, const VecType& y) { const VecType res = sub(x, y); - return min(res, add(res, card(q))); + return min(res, add(res, card())); } /** * Modular negation for packed unsigned 32-bit integers * * @param x input register - * @param q modulo * @return (-x) mod q */ template -inline VecType mod_neg(VecType x, T q) +inline VecType mod_neg(const VecType& x) { - const VecType res = sub(card(q), x); - return min(res, sub(res, card(q))); + const VecType res = sub(card(), x); + return min(res, sub(res, card())); } /** @@ -129,16 +141,15 @@ inline VecType mod_neg(VecType x, T q) * * @param x input register * @param y input register - * @param q modulo * @return (x * y) mod q */ template -inline VecType mod_mul(VecType x, VecType y, T q) +inline VecType mod_mul(const VecType& x, const VecType& y) { const VecType res = mul(x, y); - const VecType lo = get_low_half(res, q); - const VecType hi = get_high_half(res, q); - return mod_sub(lo, hi, q); + const VecType lo = get_low_half(res); + const VecType hi = get_high_half(res); + return mod_sub(lo, hi); } /** @@ -148,53 +159,68 @@ inline VecType mod_mul(VecType x, VecType y, T q) * * @param x 
input register * @param y input register - * @param q modulo * @return (x * y) mod q */ template -inline VecType mod_mul_safe(VecType x, VecType y, T q) +inline VecType mod_mul_safe(const VecType& x, const VecType& y) { - const VecType res = mod_mul(x, y, q); + const VecType res = mod_mul(x, y); // filter elements of both of a & b = card-1 const VecType cmp = bit_and( - compare_eq(x, card_minus_one(q)), - compare_eq(y, card_minus_one(q))); + compare_eq(x, card_minus_one()), + compare_eq(y, card_minus_one())); if (is_zero(cmp)) { return res; } - return (q == F3) ? bit_xor(res, bit_and(F4_U32, cmp)) - : add(res, bit_and(ONE_U32, cmp)); + return add(res, bit_and(one(), cmp)); } /** - * Update property for a given register for packed unsigned 32-bit integers + * Update property given an output Buffers * + * @param output output Buffers * @param props properties bound to fragments - * @param threshold register storing max value in its elements - * @param mask a specific mask - * @param symb input register * @param offset offset in the data fragments + * @param code_len erasure codes' length + * @param vecs_nb number of vectors corresponding to the data */ template -inline void add_props( - Properties& props, - VecType threshold, - VecType mask, - VecType symb, +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, off_t offset, - T) + unsigned code_len, + size_t vecs_nb) { - const VecType b = compare_eq(threshold, symb); - const VecType c = bit_and(mask, b); - auto d = msb8_mask(c); - const unsigned element_size = sizeof(T); - while (d > 0) { - const unsigned byte_idx = __builtin_ctz(d); - const size_t _offset = offset + byte_idx / element_size; - props.add(_offset, OOR_MARK); - d ^= 1 << byte_idx; + // nb of elements per vector + const unsigned vec_size = countof(); + // size of meta element in bits + const unsigned ele_size_in_bits = sizeof(MetaType) * CHAR_BIT / vec_size; + // mask to get meta + const T mask = ((static_cast(1) << 
ele_size_in_bits) - 1); + + const std::vector& meta = output.get_meta(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + const MetaType* meta_frag = reinterpret_cast(meta[frag_id]); + for (size_t vec_id = 0; vec_id < vecs_nb; ++vec_id) { + if (meta_frag[vec_id]) { + const off_t curr_offset = offset + vec_id * vec_size; + MetaType val = meta_frag[vec_id]; + unsigned idx = 0; + while (val) { + const T m_val = val & mask; + if (m_val) { + const size_t _offset = curr_offset + idx; + props[frag_id].add(_offset, m_val); + } + + val >>= ele_size_in_bits; + idx++; + } + } + } } } diff --git a/src/simd_nf4.h b/src/simd_nf4.h index f9ec1172..06a46ebf 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -180,7 +180,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) HalfVecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_low_half_to_mem(&res, mod_add(vec_a, vec_b, F4)); + store_low_half_to_mem(&res, mod_add(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -189,7 +189,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) HalfVecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_low_half_to_mem(&res, mod_sub(vec_a, vec_b, F4)); + store_low_half_to_mem(&res, mod_sub(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -198,7 +198,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) HalfVecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_low_half_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4)); + store_low_half_to_mem(&res, mod_mul_safe(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -217,8 +217,8 @@ inline void add_buf_to_two_bufs_rem( VecType _x_next_p = load_to_reg(_x_half[i]); VecType _y_p = load_to_reg(_y[i]); - store_low_half_to_mem(_x + i, mod_add(_x_p, _y_p, F4)); - store_low_half_to_mem(_x_half + i, mod_add(_x_next_p, _y_p, F4)); + store_low_half_to_mem(_x + i, mod_add(_x_p, _y_p)); + 
store_low_half_to_mem(_x_half + i, mod_add(_x_next_p, _y_p)); } } @@ -230,7 +230,7 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) VecType _x_p = load_to_reg(_x[i]); VecType _y_p = load_to_reg(_y[i]); - store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4)); + store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p)); } } @@ -248,8 +248,9 @@ inline void hadamard_mul_doubled_rem( VecType _x_next_p = load_to_reg(_x_half[i]); VecType _y_p = load_to_reg(_y[i]); - store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4)); - store_low_half_to_mem(_x_half + i, mod_mul_safe(_x_next_p, _y_p, F4)); + store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p)); + store_low_half_to_mem( + _x_half + i, mod_mul_safe(_x_next_p, _y_p)); } } @@ -266,7 +267,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) VecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_to_mem(&res, mod_add(vec_a, vec_b, F4)); + store_to_mem(&res, mod_add(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -275,7 +276,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) VecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_to_mem(&res, mod_sub(vec_a, vec_b, F4)); + store_to_mem(&res, mod_sub(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -284,7 +285,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) VecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4)); + store_to_mem(&res, mod_mul_safe(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -327,12 +328,12 @@ inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y) // add y to the first half of `x` for (i = 0; i < vec_len; ++i) { - x[i] = mod_add(x[i], y[i], F4); + x[i] = mod_add(x[i], y[i]); } // add y to the second half of `x` for (i = 0; i < vec_len; ++i) { - x_next[i] = mod_add(x_next[i], y[i], F4); + x_next[i] = 
mod_add(x_next[i], y[i]); } if (rem_len > 0) { @@ -354,7 +355,7 @@ inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y) // multiply y to the first half of `x` for (i = 0; i < vec_len; ++i) { - x[i] = mod_mul_safe(x[i], y[i], F4); + x[i] = mod_mul_safe(x[i], y[i]); } if (rem_len > 0) { diff --git a/src/simd_radix2_fft.h b/src/simd_radix2_fft.h index 8405f8f0..531f8b78 100644 --- a/src/simd_radix2_fft.h +++ b/src/simd_radix2_fft.h @@ -36,6 +36,24 @@ namespace quadiron { namespace simd { +enum class CtGsCase { + SIMPLE, + NORMAL, + EXTREME, +}; + +template +inline CtGsCase get_case(T r, T q) +{ + if (r == 1) { + return CtGsCase::SIMPLE; + } else if (r < q - 1) { + return CtGsCase::NORMAL; + } else { + return CtGsCase::EXTREME; + } +} + /* ================= Vectorized Operations ================= */ /** @@ -44,22 +62,31 @@ namespace simd { * x <- x + r * y * y <- x - r * y * - * @param rp1 coefficient `r` plus one + * @param ct_case coefficient case * @param c a register stores coefficient `r` * @param x working register * @param y working register - * @param q modular */ template -inline void butterfly_ct(T rp1, VecType c, VecType* x, VecType* y, T q) +inline void +butterfly_ct(CtGsCase ct_case, const VecType& c, VecType& x, VecType& y) { - VecType z = (rp1 == 2) ? *y : mod_mul(c, *y, q); - if (rp1 < q) { - *y = mod_sub(*x, z, q); - *x = mod_add(*x, z, q); - } else { // i.e. 
r == q - 1 - *y = mod_add(*x, z, q); - *x = mod_sub(*x, z, q); + VecType z = y; + switch (ct_case) { + case CtGsCase::SIMPLE: + y = mod_sub(x, z); + x = mod_add(x, z); + break; + case CtGsCase::EXTREME: + y = mod_add(x, z); + x = mod_sub(x, z); + break; + case CtGsCase::NORMAL: + default: + z = mod_mul(c, y); + y = mod_sub(x, z); + x = mod_add(x, z); + break; } } @@ -69,25 +96,30 @@ inline void butterfly_ct(T rp1, VecType c, VecType* x, VecType* y, T q) * x <- x + y * y <- r * (x - y) * - * @param rp1 coefficient `r` plus one + * @param gs_case coefficient case * @param c a register stores coefficient `r` * @param x working register * @param y working register - * @param q modular */ template -inline void butterfly_gs(T rp1, VecType c, VecType* x, VecType* y, T q) +inline void +butterfly_gs(CtGsCase gs_case, const VecType& c, VecType& x, VecType& y) { - VecType add = mod_add(*x, *y, q); - if (rp1 == 2) { - *y = mod_sub(*x, *y, q); - } else if (rp1 < q) { - VecType sub = mod_sub(*x, *y, q); - *y = mod_mul(c, sub, q); - } else { // i.e. r == q - 1 - *y = mod_sub(*y, *x, q); + VecType add = mod_add(x, y); + switch (gs_case) { + case CtGsCase::SIMPLE: + y = mod_sub(x, y); + break; + case CtGsCase::EXTREME: + y = mod_sub(y, x); + break; + case CtGsCase::NORMAL: + default: + VecType sub = mod_sub(x, y); + y = mod_mul(c, sub); + break; } - *x = add; + x = add; } /** @@ -96,24 +128,25 @@ inline void butterfly_gs(T rp1, VecType c, VecType* x, VecType* y, T q) * x <- x, i.e. 
no operation * y <- r * x * - * @param rp1 coefficient `r` plus one + * @param gs_case coefficient case * @param c a register stores coefficient `r` * @param x working register - * @param q modular - * @return r * x */ template -inline VecType butterfly_simple_gs(T rp1, VecType c, VecType x, T q) +inline void butterfly_simple_gs(CtGsCase gs_case, const VecType& c, VecType& x) { - if (rp1 == 2) { - return x; - } else if (rp1 < q) { - return mod_mul(c, x, q); - } else { - return mod_neg(x, q); + switch (gs_case) { + case CtGsCase::EXTREME: + x = mod_neg(x); + break; + case CtGsCase::NORMAL: + x = mod_mul(c, x); + break; + case CtGsCase::SIMPLE: + default: + break; } } - /** * Vectorized butterfly CT step * @@ -139,55 +172,45 @@ inline void butterfly_ct_step( size_t len, T card) { - if (len == 0) { - return; - } - const T rp1 = r + 1; - VecType c = set_one(r); + const CtGsCase ct_case = get_case(r, card); + const VecType c = set_one(r); - const size_t end = (len > 1) ? len - 1 : 0; const unsigned bufs_nb = buf.get_n(); const std::vector& mem = buf.get_mem(); + const std::vector& meta = buf.get_meta(); for (unsigned i = start; i < bufs_nb; i += step) { - VecType x1, y1; - VecType x2, y2; VecType* p = reinterpret_cast(mem[i]); VecType* q = reinterpret_cast(mem[i + m]); + MetaType* m_p = reinterpret_cast(meta[i]); + MetaType* m_q = reinterpret_cast(meta[i + m]); - size_t j = 0; - for (; j < end; j += 2) { - x1 = load_to_reg(p + j); - y1 = load_to_reg(q + j); - - butterfly_ct(rp1, c, &x1, &y1, card); + for (size_t j = 0; j < len; ++j) { + VecType x1 = load_to_reg(p); + VecType y1 = load_to_reg(q); - x2 = load_to_reg(p + j + 1); - y2 = load_to_reg(q + j + 1); + VecType x1_lo, x1_hi; + VecType y1_lo, y1_hi; - butterfly_ct(rp1, c, &x2, &y2, card); + unpack(m_p[j], x1, x1_hi, x1_lo); + unpack(m_q[j], y1, y1_hi, y1_lo); - // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(p + j + 1, x2); - store_to_mem(q + j, y1); - store_to_mem(q + j + 1, y2); - } - for (; j < 
len; ++j) { - x1 = load_to_reg(p + j); - y1 = load_to_reg(q + j); + butterfly_ct(ct_case, c, x1_lo, y1_lo); + butterfly_ct(ct_case, c, x1_hi, y1_hi); - butterfly_ct(rp1, c, &x1, &y1, card); + pack(x1_lo, x1_hi, x1, m_p[j]); + pack(y1_lo, y1_hi, y1, m_q[j]); // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(q + j, y1); + store_to_mem(p++, x1); + store_to_mem(q++, y1); } } } template -inline static void do_butterfly_ct_2_layers( +inline void do_butterfly_ct_2_layers( const std::vector& mem, + const std::vector& meta, T r1, T r2, T r3, @@ -196,9 +219,9 @@ inline static void do_butterfly_ct_2_layers( size_t len, T card) { - const T r1p1 = r1 + 1; - const T r2p1 = r2 + 1; - const T r3p1 = r3 + 1; + const CtGsCase case1 = get_case(r1, card); + const CtGsCase case2 = get_case(r2, card); + const CtGsCase case3 = get_case(r3, card); VecType c1 = set_one(r1); VecType c2 = set_one(r2); @@ -209,68 +232,47 @@ inline static void do_butterfly_ct_2_layers( VecType* r = reinterpret_cast(mem[start + 2 * m]); VecType* s = reinterpret_cast(mem[start + 3 * m]); - size_t j = 0; - const size_t end = (len > 1) ? 
len - 1 : 0; - while (j < end) { - // First layer (c1, x, y) & (c1, u, v) + MetaType* m_p = reinterpret_cast(meta[start]); + MetaType* m_q = reinterpret_cast(meta[start + m]); + MetaType* m_r = reinterpret_cast(meta[start + 2 * m]); + MetaType* m_s = reinterpret_cast(meta[start + 3 * m]); + + for (size_t j = 0; j < len; ++j) { VecType x1 = load_to_reg(p); - VecType x2 = load_to_reg(p + 1); VecType y1 = load_to_reg(q); - VecType y2 = load_to_reg(q + 1); - - butterfly_ct(r1p1, c1, &x1, &y1, card); - butterfly_ct(r1p1, c1, &x2, &y2, card); - VecType u1 = load_to_reg(r); - VecType u2 = load_to_reg(r + 1); VecType v1 = load_to_reg(s); - VecType v2 = load_to_reg(s + 1); - - butterfly_ct(r1p1, c1, &u1, &v1, card); - butterfly_ct(r1p1, c1, &u2, &v2, card); - - // Second layer (c2, x, u) & (c3, y, v) - butterfly_ct(r2p1, c2, &x1, &u1, card); - butterfly_ct(r2p1, c2, &x2, &u2, card); - - butterfly_ct(r3p1, c3, &y1, &v1, card); - butterfly_ct(r3p1, c3, &y2, &v2, card); - - // Store back to memory - store_to_mem(p, x1); - store_to_mem(p + 1, x2); - store_to_mem(q, y1); - store_to_mem(q + 1, y2); - - store_to_mem(r, u1); - store_to_mem(r + 1, u2); - store_to_mem(s, v1); - store_to_mem(s + 1, v2); - p = p + 2; - q = q + 2; - r = r + 2; - s = s + 2; - j = j + 2; - }; - - for (; j < len; ++j) { - // First layer (c1, x, y) & (c1, u, v) - VecType x1 = load_to_reg(p + j); - VecType y1 = load_to_reg(q + j); - VecType u1 = load_to_reg(r + j); - VecType v1 = load_to_reg(s + j); - - // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); - butterfly_ct(r1p1, c1, &x1, &y1, card); - butterfly_ct(r1p1, c1, &u1, &v1, card); - butterfly_ct(r2p1, c2, &x1, &u1, card); - butterfly_ct(r3p1, c3, &y1, &v1, card); - - // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(q + j, y1); - store_to_mem(r + j, u1); - store_to_mem(s + j, v1); + + VecType x1_lo, x1_hi; + VecType y1_lo, y1_hi; + VecType u1_lo, u1_hi; + VecType v1_lo, v1_hi; + + unpack(m_p[j], x1, x1_hi, x1_lo); + unpack(m_q[j], y1, 
y1_hi, y1_lo); + unpack(m_r[j], u1, u1_hi, u1_lo); + unpack(m_s[j], v1, v1_hi, v1_lo); + + butterfly_ct(case1, c1, x1_lo, y1_lo); + butterfly_ct(case1, c1, x1_hi, y1_hi); + butterfly_ct(case1, c1, u1_lo, v1_lo); + butterfly_ct(case1, c1, u1_hi, v1_hi); + + butterfly_ct(case2, c2, x1_lo, u1_lo); + butterfly_ct(case2, c2, x1_hi, u1_hi); + + butterfly_ct(case3, c3, y1_lo, v1_lo); + butterfly_ct(case3, c3, y1_hi, v1_hi); + + pack(x1_lo, x1_hi, x1, m_p[j]); + pack(y1_lo, y1_hi, y1, m_q[j]); + pack(u1_lo, u1_hi, u1, m_r[j]); + pack(v1_lo, v1_hi, v1, m_s[j]); + + store_to_mem(p++, x1); + store_to_mem(q++, y1); + store_to_mem(r++, u1); + store_to_mem(s++, v1); } } @@ -320,8 +322,9 @@ inline void butterfly_ct_two_layers_step( const unsigned bufs_nb = buf.get_n(); const std::vector& mem = buf.get_mem(); + const std::vector& meta = buf.get_meta(); for (unsigned i = start; i < bufs_nb; i += step) { - do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card); + do_butterfly_ct_2_layers(mem, meta, r1, r2, r3, i, m, len, card); } } @@ -352,53 +355,36 @@ inline void butterfly_gs_step( return; } const unsigned step = m << 1; - const T rp1 = r + 1; + const CtGsCase gs_case = get_case(r, card); VecType c = set_one(r); - const size_t end = (len > 3) ? 
len - 3 : 0; const unsigned bufs_nb = buf.get_n(); const std::vector& mem = buf.get_mem(); + const std::vector& meta = buf.get_meta(); for (unsigned i = start; i < bufs_nb; i += step) { - VecType x1, x2, x3, x4; - VecType y1, y2, y3, y4; VecType* p = reinterpret_cast(mem[i]); VecType* q = reinterpret_cast(mem[i + m]); + MetaType* m_p = reinterpret_cast(meta[i]); + MetaType* m_q = reinterpret_cast(meta[i + m]); - size_t j = 0; - for (; j < end; j += 4) { - x1 = load_to_reg(p + j); - x2 = load_to_reg(p + j + 1); - x3 = load_to_reg(p + j + 2); - x4 = load_to_reg(p + j + 3); - y1 = load_to_reg(q + j); - y2 = load_to_reg(q + j + 1); - y3 = load_to_reg(q + j + 2); - y4 = load_to_reg(q + j + 3); - - butterfly_gs(rp1, c, &x1, &y1, card); - butterfly_gs(rp1, c, &x2, &y2, card); - butterfly_gs(rp1, c, &x3, &y3, card); - butterfly_gs(rp1, c, &x4, &y4, card); + for (size_t j = 0; j < len; ++j) { + VecType x1 = load_to_reg(p); + VecType y1 = load_to_reg(q); - // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(p + j + 1, x2); - store_to_mem(p + j + 2, x3); - store_to_mem(p + j + 3, x4); - store_to_mem(q + j, y1); - store_to_mem(q + j + 1, y2); - store_to_mem(q + j + 2, y3); - store_to_mem(q + j + 3, y4); - } - for (; j < len; ++j) { - x1 = load_to_reg(p + j); - y1 = load_to_reg(q + j); + VecType x1_lo, x1_hi; + VecType y1_lo, y1_hi; - butterfly_gs(rp1, c, &x1, &y1, card); + unpack(m_p[j], x1, x1_hi, x1_lo); + unpack(m_q[j], y1, y1_hi, y1_lo); - // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(q + j, y1); + butterfly_gs(gs_case, c, x1_lo, y1_lo); + butterfly_gs(gs_case, c, x1_hi, y1_hi); + + pack(x1_lo, x1_hi, x1, m_p[j]); + pack(y1_lo, y1_hi, y1, m_q[j]); + + store_to_mem(p++, x1); + store_to_mem(q++, y1); } } } @@ -429,95 +415,30 @@ inline void butterfly_gs_step_simple( return; } const unsigned step = m << 1; - const T rp1 = r + 1; + const CtGsCase gs_case = get_case(r, card); VecType c = set_one(r); - const size_t end = (len > 1) ? 
len - 1 : 0; const unsigned bufs_nb = buf.get_n(); const std::vector& mem = buf.get_mem(); + const std::vector& meta = buf.get_meta(); for (unsigned i = start; i < bufs_nb; i += step) { - VecType x1, y1; - VecType x2, y2; VecType* p = reinterpret_cast(mem[i]); VecType* q = reinterpret_cast(mem[i + m]); + MetaType* m_p = reinterpret_cast(meta[i]); + MetaType* m_q = reinterpret_cast(meta[i + m]); - size_t j = 0; - for (; j < end; j += 2) { - x1 = load_to_reg(p + j); - x2 = load_to_reg(p + j + 1); + for (size_t j = 0; j < len; ++j) { + VecType x = load_to_reg(p++); + VecType x_lo, x_hi; - y1 = butterfly_simple_gs(rp1, c, x1, card); - y2 = butterfly_simple_gs(rp1, c, x2, card); + unpack(m_p[j], x, x_hi, x_lo); - // Store back to memory - store_to_mem(q + j, y1); - store_to_mem(q + j + 1, y2); - } - for (; j < len; ++j) { - x1 = load_to_reg(p + j); - - y1 = butterfly_simple_gs(rp1, c, x1, card); + butterfly_simple_gs(gs_case, c, x_lo); + butterfly_simple_gs(gs_case, c, x_hi); - // Store back to memory - store_to_mem(q + j, y1); - } - } -} + pack(x_lo, x_hi, x, m_q[j]); -template -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - T threshold, - size_t vecs_nb) -{ - const unsigned vec_size = countof(); - const T max = 1U << (sizeof(T) * CHAR_BIT - 1); - const VecType _threshold = set_one(threshold); - const VecType mask_hi = set_one(max); - - const std::vector& mem = output.get_mem(); - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - VecType* buf = reinterpret_cast(mem[frag_id]); - - size_t vec_id = 0; - size_t end = (vecs_nb > 3) ? 
vecs_nb - 3 : 0; - for (; vec_id < end; vec_id += 4) { - VecType a1 = load_to_reg(buf + vec_id); - VecType a2 = load_to_reg(buf + vec_id + 1); - VecType a3 = load_to_reg(buf + vec_id + 2); - VecType a4 = load_to_reg(buf + vec_id + 3); - - if (!and_is_zero(a1, _threshold)) { - const off_t curr_offset = offset + vec_id * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a1, curr_offset, max); - } - if (!and_is_zero(a2, _threshold)) { - const off_t curr_offset = offset + (vec_id + 1) * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a2, curr_offset, max); - } - if (!and_is_zero(a3, _threshold)) { - const off_t curr_offset = offset + (vec_id + 2) * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a3, curr_offset, max); - } - if (!and_is_zero(a4, _threshold)) { - const off_t curr_offset = offset + (vec_id + 3) * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a4, curr_offset, max); - } - } - for (; vec_id < vecs_nb; ++vec_id) { - VecType a = load_to_reg(buf + vec_id); - if (!and_is_zero(a, _threshold)) { - const off_t curr_offset = offset + vec_id * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a, curr_offset, max); - } + store_to_mem(q++, x); } } } diff --git a/src/simd_ring.h b/src/simd_ring.h index 47edce99..9b1a8797 100644 --- a/src/simd_ring.h +++ b/src/simd_ring.h @@ -33,134 +33,93 @@ #include +#include "gf_ring.h" + namespace quadiron { namespace simd { /* ==================== Operations for RingModN =================== */ -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` +/** Perform a multiplication of a coefficient `c` to each element of `buf_id`th + * buffers of `src` and store result to correspondent element of `dest` * + * @note: Buffers `src` and `dest` have meta * @note: 1 < `a` < card - 1 */ template -inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) +inline void mul_coef_to_buf( + 
const gf::RingModN& gf, + T c, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) { - const VecType coef = set_one(a); - - VecType* _src = reinterpret_cast(src); - VecType* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i = 0; - const size_t end = (_len > 3) ? _len - 3 : 0; - for (; i < end; i += 4) { - _dest[i] = mod_mul(coef, _src[i], card); - _dest[i + 1] = mod_mul(coef, _src[i + 1], card); - _dest[i + 2] = mod_mul(coef, _src[i + 2], card); - _dest[i + 3] = mod_mul(coef, _src[i + 3], card); - } - for (; i < _len; ++i) { - _dest[i] = mod_mul(coef, _src[i], card); - } + const size_t size = src.get_size(); + const unsigned ratio = simd::countof(); + const size_t simd_vec_len = size / ratio; + const size_t simd_trailing_len = size - simd_vec_len * ratio; - if (_last_len > 0) { - const DoubleSizeVal coef_double = DoubleSizeVal(a); - for (size_t i = _len * ratio; i < len; i++) { - dest[i] = static_cast((coef_double * src[i]) % card); - } - } -} + const VecType coef = set_one(c); -template -inline void add_two_bufs(T* src, T* dest, size_t len, T card) -{ - VecType* _src = reinterpret_cast(src); - VecType* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _dest[i] = mod_add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - const T tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} + VecType* s_data = reinterpret_cast(src.get(buf_id)); + VecType* d_data = reinterpret_cast(dest.get(buf_id)); -template -inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card) -{ - VecType* _bufa = reinterpret_cast(bufa); - VecType* _bufb = reinterpret_cast(bufb); - VecType* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = mod_sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) { - res[i] = bufa[i] - bufb[i]; - } else { - res[i] = card - (bufb[i] - bufa[i]); - } - } - } -} + MetaType* s_meta = reinterpret_cast(src.get_meta(buf_id)); + MetaType* d_meta = reinterpret_cast(dest.get_meta(buf_id)); -template -inline void mul_two_bufs(T* src, T* dest, size_t len, T card) -{ - VecType* _src = reinterpret_cast(src); - VecType* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mod_mul_safe(_src[i], _dest[i], card); + for (size_t i = 0; i < simd_vec_len; ++i) { + VecType lo, hi; + VecType x = load_to_reg(&s_data[i]); + + unpack(s_meta[i], x, hi, lo); + hi = mod_mul(coef, hi); + lo = mod_mul(coef, lo); + pack(lo, hi, x, d_meta[i]); + + store_to_mem(&d_data[i], x); } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = T((DoubleSizeVal(src[i]) * dest[i]) % card); + + if (simd_trailing_len) { + const size_t simd_offset = simd_vec_len * ratio; + for (size_t i = simd_offset; i < size; ++i) { + T hi, lo; + src.get(buf_id, i, hi, lo); + dest.set(buf_id, i, gf.mul(c, hi), gf.mul(c, lo)); } } 
} /** Apply an element-wise negation to a buffer + * @note: Buffers `src` and `dest` have meta */ template -inline void neg(size_t len, T* buf, T card) +inline void neg(const gf::RingModN& gf, vec::Buffers& buf, size_t buf_id) { - VecType* _buf = reinterpret_cast(buf); - const unsigned ratio = sizeof(*_buf) / sizeof(*buf); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = mod_neg(_buf[i], card); + const size_t size = buf.get_size(); + const unsigned ratio = simd::countof(); + const size_t simd_vec_len = size / ratio; + const size_t simd_trailing_len = size - simd_vec_len * ratio; + + VecType* vec_data = reinterpret_cast(buf.get(buf_id)); + MetaType* vec_meta = reinterpret_cast(buf.get_meta(buf_id)); + + for (size_t i = 0; i < simd_vec_len; ++i) { + VecType lo, hi; + VecType x = load_to_reg(&vec_data[i]); + + unpack(vec_meta[i], x, hi, lo); + hi = mod_neg(hi); + lo = mod_neg(lo); + pack(lo, hi, x, vec_meta[i]); + + store_to_mem(&vec_data[i], x); } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; + + if (simd_trailing_len) { + const size_t simd_offset = simd_vec_len * ratio; + for (size_t i = simd_offset; i < size; ++i) { + T hi, lo; + buf.get(buf_id, i, hi, lo); + buf.set(buf_id, i, gf.neg(hi), gf.neg(lo)); } } } diff --git a/src/vec_buffers.h b/src/vec_buffers.h index a721a25b..4f993d9e 100644 --- a/src/vec_buffers.h +++ b/src/vec_buffers.h @@ -407,7 +407,7 @@ Buffers::Buffers(const Buffers& vec, int begin, int end) this->n = end - begin; this->size = vec.get_size(); this->mem_len = this->n * this->size; - const std::vector vec_mem = vec.get_mem(); + const std::vector& vec_mem = vec.get_mem(); mem.reserve(this->n); // slice from input buffers @@ -427,7 +427,7 @@ Buffers::Buffers(const Buffers& vec, int begin, int end) this->m_meta = vec.has_meta(); if (this->m_meta) { - const std::vector vec_meta = vec.get_meta(); + const 
std::vector& vec_meta = vec.get_meta(); this->init_meta(); meta.reserve(this->n); // slice from input buffers @@ -531,7 +531,7 @@ Buffers::Buffers( if (this->m_meta) { this->init_meta(); - const std::vector vec_meta = vec.get_meta(); + const std::vector& vec_meta = vec.get_meta(); // output is sliced & shuffled from `vec` meta.reserve(this->n); if (vec_n < n) { // output is zero-extended & shuffled from `vec` diff --git a/src/vec_cast.h b/src/vec_cast.h index d4eef92d..13b52727 100644 --- a/src/vec_cast.h +++ b/src/vec_cast.h @@ -173,7 +173,7 @@ std::vector* cast_mem_of_vecp(vec::Buffers* s) // std::cout << "\ninput: "; s->dump(); - const std::vector mem_s = s->get_mem(); + const std::vector& mem_s = s->get_mem(); std::vector* mem_d = new std::vector(n, nullptr); for (i = 0; i < n; i++) { mem_d->at(i) = reinterpret_cast(mem_s.at(i)); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b97ac468..f57bf5e9 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -127,6 +127,7 @@ set(TEST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/simd/test_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/simd/test_definitions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/simd/test_simd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/simd/test_simd_fnt.cpp CACHE INTERNAL diff --git a/test/buffers_utest.cpp b/test/buffers_utest.cpp index 7a939ac4..b0d2538c 100644 --- a/test/buffers_utest.cpp +++ b/test/buffers_utest.cpp @@ -67,7 +67,7 @@ class BuffersTest : public ::testing::Test { std::uniform_int_distribution dis(0, max - 1); auto vec = std::make_unique>(n, size, has_meta); - const std::vector mem = vec->get_mem(); + const std::vector& mem = vec->get_mem(); for (int i = 0; i < n; i++) { for (int j = 0; j < size; j++) { mem[i][j] = dis(quadiron::prng()); @@ -75,7 +75,7 @@ class BuffersTest : public ::testing::Test { } if (has_meta) { - const std::vector meta = vec->get_meta(); + const std::vector& meta = vec->get_meta(); const size_t meta_size = vec->get_meta_size(); for (int i = 0; i < n; ++i) { for (size_t j = 0; j 
< meta_size; ++j) { @@ -175,8 +175,8 @@ TYPED_TEST(BuffersTest, TestConstructors) // NOLINT vec::Buffers vec2(*vec1, begin, end); - const std::vector mem1 = vec1->get_mem(); - const std::vector mem2 = vec2.get_mem(); + const std::vector& mem1 = vec1->get_mem(); + const std::vector& mem2 = vec2.get_mem(); // Check Slice constructor ASSERT_EQ(vec2.get_n(), end - begin); @@ -189,8 +189,8 @@ TYPED_TEST(BuffersTest, TestConstructors) // NOLINT ASSERT_EQ(vec2.has_meta(), vec1->has_meta()); ASSERT_EQ(vec2.get_meta_size(), meta_size); - const std::vector meta1 = vec1->get_meta(); - const std::vector meta2 = vec2.get_meta(); + const std::vector& meta1 = vec1->get_meta(); + const std::vector& meta2 = vec2.get_meta(); for (int i = begin, j = 0; i < end; ++i, ++j) { ASSERT_TRUE( @@ -206,7 +206,7 @@ TYPED_TEST(BuffersTest, TestConstructors) // NOLINT } if (has_meta) { - const std::vector meta1 = vec1->get_meta(); + const std::vector& meta1 = vec1->get_meta(); std::vector meta3(end - begin); for (int i = 0; i < end - begin; i++) { meta3[i] = this->allocator_meta.allocate(meta_size); @@ -283,11 +283,11 @@ TYPED_TEST(BuffersTest, TestPackUnpack) // NOLINT for (bool const& has_meta : tests) { auto words = this->gen_buffers_rand_data(n, size, has_meta, max); - const std::vector mem_T = words->get_mem(); + const std::vector& mem_T = words->get_mem(); // Pack manually from TypeParam to uint8_t. vec::Buffers vec_char(n, bytes_size); - const std::vector mem_char = vec_char.get_mem(); + const std::vector& mem_char = vec_char.get_mem(); for (int j = 0; j < n; j++) { int t = 0; TypeParam* buf_T = mem_T.at(j); @@ -308,14 +308,14 @@ TYPED_TEST(BuffersTest, TestPackUnpack) // NOLINT // Pack bufs of type uint8_t to bufs of type TypeParam. vec::Buffers vec_T_tmp(n, size); - const std::vector mem_T_tmp = vec_T_tmp.get_mem(); + const std::vector& mem_T_tmp = vec_T_tmp.get_mem(); vec::pack( mem_char, mem_T_tmp, n, size, word_size); // Unpack bufs of type TypeParam to bufs of type uint8_t. 
vec::Buffers vec_char_tmp(n, bytes_size); - const std::vector mem_char_tmp = vec_char_tmp.get_mem(); + const std::vector& mem_char_tmp = vec_char_tmp.get_mem(); vec::unpack( mem_T_tmp, mem_char_tmp, n, size, word_size); diff --git a/test/ec_driver.cpp b/test/ec_driver.cpp index 0166f848..6a03d76a 100644 --- a/test/ec_driver.cpp +++ b/test/ec_driver.cpp @@ -582,15 +582,15 @@ int main(int argc, char** argv) data_zpad = count_digits(n_data - 1); if (eflag == EC_TYPE_RS_FNT) { - if (word_size <= 4) { - run_fec_rs_fnt( + if (word_size == 1) { + run_fec_rs_fnt( word_size, n_data, n_parities, rflag, quadiron::fec::FecType::NON_SYSTEMATIC); - } else if (word_size <= 8) { - run_fec_rs_fnt( + } else if (word_size == 2) { + run_fec_rs_fnt( word_size, n_data, n_parities, @@ -598,15 +598,15 @@ int main(int argc, char** argv) quadiron::fec::FecType::NON_SYSTEMATIC); } } else if (eflag == EC_TYPE_RS_FNT_SYS) { - if (word_size <= 4) { - run_fec_rs_fnt( + if (word_size == 1) { + run_fec_rs_fnt( word_size, n_data, n_parities, rflag, quadiron::fec::FecType::SYSTEMATIC); - } else if (word_size <= 8) { - run_fec_rs_fnt( + } else if (word_size == 2) { + run_fec_rs_fnt( word_size, n_data, n_parities, diff --git a/test/fec_utest.cpp b/test/fec_utest.cpp index 72e17a14..304907c3 100644 --- a/test/fec_utest.cpp +++ b/test/fec_utest.cpp @@ -40,9 +40,12 @@ class FecTestCommon : public ::testing::Test { public: const unsigned n_data = 3; const unsigned n_parities = 3; + const size_t pkt_size = 9; - void - run_test(fec::FecCode& fec, bool props_flag = false, bool is_nf4 = false) + void run_test_horizontal( + fec::FecCode& fec, + bool props_flag = false, + bool is_nf4 = false) { const int code_len = n_data + n_parities; @@ -52,7 +55,7 @@ class FecTestCommon : public ::testing::Test { vec::Vector data_frags(gf, n_data); vec::Vector copied_data_frags(gf, n_data); - vec::Vector encoded_frags(gf, fec.n); + vec::Vector encoded_frags(gf, fec.get_n_outputs()); vec::Vector received_frags(gf, n_data); 
vec::Vector decoded_frags(gf, n_data); std::vector ids; @@ -62,10 +65,10 @@ class FecTestCommon : public ::testing::Test { ids.push_back(i); } - std::vector props(code_len); + std::vector props(fec.get_n_outputs()); for (int j = 0; j < 1000; j++) { if (props_flag) { - for (int i = 0; i < code_len; i++) { + for (int i = 0; i < fec.get_n_outputs(); i++) { props[i] = quadiron::Properties(); } } @@ -83,8 +86,25 @@ class FecTestCommon : public ::testing::Test { std::random_shuffle(ids.begin(), ids.end()); for (unsigned i = 0; i < n_data; i++) { fragments_ids.set(i, ids.at(i)); - received_frags.set(i, encoded_frags.get(ids.at(i))); } + if (fec.type == fec::FecType::SYSTEMATIC) { + for (unsigned i = 0; i < n_data; i++) { + if (fragments_ids.get(i) < n_data) { + received_frags.set( + i, data_frags.get(fragments_ids.get(i))); + } else { + received_frags.set( + i, + encoded_frags.get(fragments_ids.get(i) - n_data)); + } + } + } else { + for (unsigned i = 0; i < n_data; i++) { + received_frags.set( + i, encoded_frags.get(fragments_ids.get(i))); + } + } + std::unique_ptr> context = fec.init_context_dec(fragments_ids, props); @@ -93,22 +113,97 @@ class FecTestCommon : public ::testing::Test { ASSERT_EQ(copied_data_frags, decoded_frags); } } -}; -using AllTypes = ::testing::Types; -TYPED_TEST_CASE(FecTestCommon, AllTypes); + void run_test_vertical( + fec::FecCode& fec, + bool props_flag = false, + bool has_meta = false) + { + const int code_len = n_data + n_parities; -TYPED_TEST(FecTestCommon, TestNf4) // NOLINT -{ - const int iter_count = arith::log2(sizeof(TypeParam)); + const quadiron::gf::Field& gf = fec.get_gf(); - for (int i = 1; i < iter_count; i++) { - const unsigned word_size = 1 << i; - fec::RsNf4 fec(word_size, this->n_data, this->n_parities); + vec::Buffers data_frags(n_data, pkt_size, has_meta); + vec::Buffers encoded_frags(fec.get_n_outputs(), pkt_size, has_meta); + vec::Buffers received_frags(n_data, pkt_size, has_meta); + vec::Buffers decoded_frags(n_data, 
pkt_size, has_meta); + std::vector ids; + vec::Vector fragments_ids(gf, n_data); + + // It's necessary to set `data_frags` all zeros for `RsNf4` as + // `data_frags` has not meta + data_frags.zero_fill(); + + for (int i = 0; i < code_len; i++) { + ids.push_back(i); + } + + std::vector props(fec.get_n_outputs()); + for (int j = 0; j < 1; j++) { + if (props_flag) { + for (int i = 0; i < fec.get_n_outputs(); i++) { + props[i] = quadiron::Properties(); + } + } + const std::vector& mem = data_frags.get_mem(); + for (unsigned i = 0; i < n_data; i++) { + char* buf = reinterpret_cast(mem[i]); + for (size_t j = 0; j < fec.buf_size; ++j) { + buf[j] = static_cast(gf.rand()); + } + } + if (has_meta) { + data_frags.reset_meta(); + } + + fec.encode(encoded_frags, props, 0, data_frags); + + std::random_shuffle(ids.begin(), ids.end()); + for (unsigned i = 0; i < n_data; i++) { + fragments_ids.set(i, ids.at(i)); + } + if (fec.type == fec::FecType::SYSTEMATIC) { + for (unsigned i = 0; i < n_data; i++) { + if (fragments_ids.get(i) < n_data) { + received_frags.copy( + data_frags, fragments_ids.get(i), i); + } else { + received_frags.copy( + encoded_frags, fragments_ids.get(i) - n_data, i); + } + } + } else { + for (unsigned i = 0; i < n_data; i++) { + received_frags.copy(encoded_frags, fragments_ids.get(i), i); + } + } - this->run_test(fec, true, true); + std::unique_ptr> context = + fec.init_context_dec( + fragments_ids, props, pkt_size, &decoded_frags); + + fec.decode(*context, decoded_frags, props, 0, received_frags); + + ASSERT_EQ(data_frags, decoded_frags); + } } -} + + void run_test( + fec::FecCode& fec, + bool props_flag = false, + bool is_nf4 = false, + bool vertical_support = false, + bool has_meta = false) + { + run_test_horizontal(fec, props_flag, is_nf4); + if (vertical_support) { + run_test_vertical(fec, props_flag, has_meta); + } + } +}; + +using AllTypes = ::testing::Types; +TYPED_TEST_CASE(FecTestCommon, AllTypes); TYPED_TEST(FecTestCommon, TestGf2nFft) // NOLINT { @@ 
-130,36 +225,64 @@ TYPED_TEST(FecTestCommon, TestGf2nFftAdd) // NOLINT } template -class FecTestNo128 : public FecTestCommon { +class FecTestNf4 : public FecTestCommon { }; -using No128 = ::testing::Types; -TYPED_TEST_CASE(FecTestNo128, No128); +using Nf4Type = ::testing::Types; +TYPED_TEST_CASE(FecTestNf4, Nf4Type); -TYPED_TEST(FecTestNo128, TestFnt) // NOLINT +TYPED_TEST(FecTestNf4, TestNf4) // NOLINT { - for (unsigned word_size = 1; word_size <= 2; ++word_size) { - fec::RsFnt fec( - fec::FecType::NON_SYSTEMATIC, - word_size, - this->n_data, - this->n_parities); - this->run_test(fec, true); + const int iter_count = arith::log2(sizeof(TypeParam)); + + for (int i = 1; i < iter_count; i++) { + const unsigned word_size = 1 << i; + fec::RsNf4 fec( + word_size, this->n_data, this->n_parities, this->pkt_size); + + this->run_test(fec, true, true, true); } } -TYPED_TEST(FecTestNo128, TestFntSys) // NOLINT +template +class FecTestFnt : public FecTestCommon { +}; + +using FntType = ::testing::Types; +TYPED_TEST_CASE(FecTestFnt, FntType); + +TYPED_TEST(FecTestFnt, TestFnt) // NOLINT { - for (unsigned word_size = 1; word_size <= 2; ++word_size) { - fec::RsFnt fec( - fec::FecType::SYSTEMATIC, - word_size, - this->n_data, - this->n_parities); - this->run_test(fec, true); - } + const unsigned word_size = sizeof(TypeParam) / 2; + fec::RsFnt fec( + fec::FecType::NON_SYSTEMATIC, + word_size, + this->n_data, + this->n_parities, + this->pkt_size); + + this->run_test(fec, true, false, true, true); } +TYPED_TEST(FecTestFnt, TestFntSys) // NOLINT +{ + const unsigned word_size = sizeof(TypeParam) / 2; + fec::RsFnt fec( + fec::FecType::SYSTEMATIC, + word_size, + this->n_data, + this->n_parities, + this->pkt_size); + this->run_test(fec, true, false, true, true); +} + +template +class FecTestNo128 : public FecTestCommon { +}; + +using No128 = ::testing::Types; +TYPED_TEST_CASE(FecTestNo128, No128); + TYPED_TEST(FecTestNo128, TestGfpFft) // NOLINT { for (size_t word_size = 1; word_size <= 4 
&& word_size < sizeof(TypeParam); diff --git a/test/fft_utest.cpp b/test/fft_utest.cpp index e13695d2..d4a21c0c 100644 --- a/test/fft_utest.cpp +++ b/test/fft_utest.cpp @@ -335,7 +335,10 @@ TYPED_TEST(FftTest, TestFft2kVecp) // NOLINT { auto gf(gf::create>(this->q)); const unsigned R = gf.get_primitive_root(); - const size_t size = 4; + const size_t size = 128; + const std::vector tests = {true}; + + const unsigned half_elelemnt_in_bits = sizeof(TypeParam) * CHAR_BIT / 2; ASSERT_EQ(arith::jacobi(R, this->q), -1); @@ -349,22 +352,33 @@ TYPED_TEST(FftTest, TestFft2kVecp) // NOLINT fft::Radix2 fft(gf, n, data_len, size); const int vec_n = fft.get_n(); - vec::Buffers v2(vec_n, size); - vec::Buffers _v2(vec_n, size); - for (unsigned len = 2; len <= n; len *= 2) { - vec::Buffers v(len, size); - vec::Buffers _v(_v2, 0, len); - for (int j = 0; j < 100; j++) { - for (unsigned i = 0; i < len; i++) { - TypeParam* mem = v.get(i); - for (size_t u = 0; u < size; u++) { - mem[u] = gf.rand(); + + for (bool const& has_meta : tests) { + vec::Buffers v2(vec_n, size, has_meta); + vec::Buffers _v2(vec_n, size, has_meta); + for (unsigned len = 2; len <= n; len *= 2) { + for (int j = 0; j < 100; j++) { + vec::Buffers v(len, size, has_meta); + vec::Buffers _v(_v2, 0, len); + for (unsigned i = 0; i < len; i++) { + TypeParam* mem = v.get(i); + if (has_meta) { + for (size_t u = 0; u < size; u++) { + mem[u] = + (gf.rand() << half_elelemnt_in_bits) + | gf.rand(); + } + } else { + for (size_t u = 0; u < size; u++) { + mem[u] = gf.rand(); + } + } } - } - fft.fft(v2, v); - fft.ifft(_v2, v2); + fft.fft(v2, v); + fft.ifft(_v2, v2); - ASSERT_EQ(v, _v); + ASSERT_EQ(v, _v); + } } } } @@ -375,7 +389,8 @@ TYPED_TEST(FftTest, TestNaiveVsFft2kVecp) // NOLINT { auto gf(gf::create>(this->q)); const unsigned R = gf.get_primitive_root(); - const size_t size = 2; + const size_t size = 128; + const std::vector tests = {true}; ASSERT_EQ(arith::jacobi(R, this->q), -1); @@ -393,29 +408,31 @@ TYPED_TEST(FftTest, 
TestNaiveVsFft2kVecp) // NOLINT ASSERT_EQ(fft_naive.get_n(), fft_2n.get_n()); - vec::Buffers v(n, size); - vec::Buffers fft1(n, size); - vec::Buffers fft2(n, size); - vec::Buffers ifft1(n, size); - vec::Buffers ifft2(n, size); - for (int j = 0; j < 100; j++) { - for (unsigned i = 0; i < n; i++) { - TypeParam* mem = v.get(i); - for (size_t u = 0; u < size; u++) { - mem[u] = gf.rand(); + for (bool const& has_meta : tests) { + vec::Buffers v(n, size, has_meta); + vec::Buffers fft1(n, size, has_meta); + vec::Buffers fft2(n, size, has_meta); + vec::Buffers ifft1(n, size, has_meta); + vec::Buffers ifft2(n, size, has_meta); + for (int j = 0; j < 100; j++) { + for (unsigned i = 0; i < n; i++) { + TypeParam* mem = v.get(i); + for (size_t u = 0; u < size; u++) { + mem[u] = gf.rand(); + } } - } - fft_naive.fft(fft1, v); - fft_2n.fft(fft2, v); + fft_naive.fft(fft1, v); + fft_2n.fft(fft2, v); - ASSERT_EQ(fft1, fft2); + ASSERT_EQ(fft1, fft2); - fft_naive.ifft(ifft1, fft1); - fft_2n.ifft(ifft2, fft2); + fft_naive.ifft(ifft1, fft1); + fft_2n.ifft(ifft2, fft2); - ASSERT_EQ(ifft1, ifft2); - ASSERT_EQ(ifft1, v); + ASSERT_EQ(ifft1, ifft2); + ASSERT_EQ(ifft1, v); + } } } } diff --git a/test/perf/fnt.cpp b/test/perf/fnt.cpp new file mode 100644 index 00000000..44758485 --- /dev/null +++ b/test/perf/fnt.cpp @@ -0,0 +1,526 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include + +#include +#include + +#include "arith.h" +#include "core.h" +#include "fec_rs_fnt.h" +#include "fft_2n.h" +#include "gf_prime.h" +#include "misc.h" +#include "vec_buffers.h" + +namespace vec = quadiron::vec; +namespace gf = quadiron::gf; +namespace fft = quadiron::fft; +namespace fec = quadiron::fec; + +#ifdef QUADIRON_USE_SIMD + +#include "simd.h" +#include "simd/simd.h" +#include "simd_fnt.h" + +namespace simd = quadiron::simd; + +template +void show(simd::VecType val) +{ + const size_t n = simd::countof(); + T buffer[n]; + simd::store_to_mem(reinterpret_cast(buffer), val); + for (unsigned i = 0; i < n; i++) { + std::cout << unsigned(buffer[i]) << " "; + } + std::cout << "\n"; +} + +template +void dump(T* buf, size_t bytes) +{ + const size_t nb = bytes / sizeof(T); + for (size_t i = 0; i < nb; ++i) { + std::cout << unsigned(buf[i]) << " "; + } + std::cout << "\n"; +} + +template +class PerfFnt : public ::testing::Test { + public: + PerfFnt() + { + if (sizeof(T) == 
2) { + this->q = 257; + this->word_size = 1; + } else if (sizeof(T) == 4) { + this->q = static_cast(65537); + this->word_size = 2; + } else { + throw "Wrong TypeParam for PerfFnt tests"; + } + + this->distribution = + std::make_unique>(0, q - 1); + } + + void + buf_rand_data(vec::Buffers& vec, bool has_meta = false, int _max = 0) + { + const T max = (_max == 0) ? std::numeric_limits::max() : _max; + std::uniform_int_distribution dis(0, max - 1); + + const std::vector& mem = vec.get_mem(); + const size_t size = vec.get_size(); + const size_t n = vec.get_n(); + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < size; j++) { + mem[i][j] = dis(quadiron::prng()); + } + } + + if (has_meta) { + const std::vector& meta = vec.get_meta(); + const size_t meta_size = vec.get_meta_size(); + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < meta_size; ++j) { + meta[i][j] = static_cast(dis(quadiron::prng())); + } + } + } + } + + simd::VecType rand_vec(T lower = 0, T upper_bound = 0) + { + const size_t n = simd::countof(); + T buf[n]; + simd::VecType* vec = reinterpret_cast(buf); + + T bound = upper_bound ? 
upper_bound : q; + bound -= lower; + + for (unsigned i = 0; i < n; i++) { + buf[i] = lower + distribution->operator()(quadiron::prng()) % bound; + } + + return vec[0]; + } + + template + void gen_rand_data(Tx* vec, size_t len) + { + for (size_t i = 0; i < len; i++) { + vec[i] = distribution->operator()(quadiron::prng()); + } + } + + simd::VecType copy(simd::VecType x) + { + const size_t n = simd::countof(); + T buf[n]; + T val[n]; + simd::VecType* vec = reinterpret_cast(buf); + + simd::store_to_mem(reinterpret_cast(val), x); + std::copy_n(val, n, buf); + + return vec[0]; + } + + template + void core_op_perf_single(const std::string& text, const TFunc& f) + { + const size_t len = simd::countof(); + // Init a Buffers to obtain aligned memory + quadiron::vec::Buffers data_buf(2, len); + T* x = data_buf.get(0); + T* y = data_buf.get(1); + + for (unsigned i = 0; i < len; ++i) { + x[i] = + 1 + + (distribution->operator()(quadiron::prng()) % (this->q - 1)); + y[i] = + 1 + + (distribution->operator()(quadiron::prng()) % (this->q - 1)); + } + + simd::VecType* vec_x = reinterpret_cast(x); + simd::VecType* vec_y = reinterpret_cast(y); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + simd::VecType _x = simd::load_to_reg(vec_x); + simd::VecType _y = simd::load_to_reg(vec_y); + + f(_x, _y); + + simd::store_to_mem(vec_x, _x); + } + uint64_t end = quadiron::hw_timer(); + double avg_cycles_nb = + static_cast(end - start) / static_cast(iters_nb); + std::cout << "#CPU cycles of " << text << ": " << avg_cycles_nb << "\n"; + } + + template + void core_op_perf(const std::string& text, const TFunc& f) + { + std::cout << text << "\n"; + std::cout << "\tVectors nb\t#CPU cycles\n"; + for (auto vec_len : arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + // Init a Buffers to obtain aligned memory + quadiron::vec::Buffers data_buf(2, len); + T* buf_x = data_buf.get(0); + T* buf_y = data_buf.get(1); + gen_rand_data(buf_x, len); + 
gen_rand_data(buf_y, len); + + simd::VecType* data_x = reinterpret_cast(buf_x); + simd::VecType* data_y = reinterpret_cast(buf_y); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + for (size_t j = 0; j < vec_len; ++j) { + simd::VecType x = simd::load_to_reg(&data_x[j]); + simd::VecType y = simd::load_to_reg(&data_y[j]); + + f(x, y); + + simd::store_to_mem(&data_x[j], x); + } + } + uint64_t end = quadiron::hw_timer(); + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + std::cout << "\n"; + } + + template + void butterfly_perf(const std::string& text, const TFunc& f) + { + std::cout << text << "\n"; + std::cout << "\tVectors nb\t#CPU cycles\n"; + for (auto vec_len : arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + // Init a Buffers to obtain aligned memory + quadiron::vec::Buffers data_buf(2, len); + T* buf_x = data_buf.get(0); + T* buf_y = data_buf.get(1); + gen_rand_data(buf_x, len); + gen_rand_data(buf_y, len); + + simd::VecType* data_x = reinterpret_cast(buf_x); + simd::VecType* data_y = reinterpret_cast(buf_y); + + T coef = 1 + + this->distribution->operator()(quadiron::prng()) + % (this->q - 2); + const simd::CtGsCase ct_case = simd::get_case(coef, this->q); + const simd::VecType c = simd::set_one(coef); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + for (size_t j = 0; j < vec_len; ++j) { + simd::VecType x = simd::load_to_reg(&data_x[j]); + simd::VecType y = simd::load_to_reg(&data_y[j]); + + f(ct_case, c, x, y); + + simd::store_to_mem(&data_x[j], x); + simd::store_to_mem(&data_y[j], y); + } + } + uint64_t end = quadiron::hw_timer(); + + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + std::cout << "\n"; + } + + 
template + void fft_perf(const std::string& text, size_t fft_len, const TFunc& f) + { + std::cout << text << " of length " << fft_len << "\n"; + std::cout << "\tVectors nb\t#CPU cycles\n"; + + for (auto vec_len : arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + vec::Buffers input(fft_len, len, true); + vec::Buffers output(fft_len, len, true); + + buf_rand_data(input); + buf_rand_data(output); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + (i % 2) ? f(output, input) : f(input, output); + } + uint64_t end = quadiron::hw_timer(); + + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + } + + void fnt_perf( + const std::string& text, + fec::FecCode& fec, + size_t fft_len, + int n_data, + size_t vec_len) + { + const size_t len = vec_len * simd::countof(); + const bool has_meta = true; + + vec::Buffers data_frags(n_data, len, has_meta); + vec::Buffers encoded_frags(fft_len, len, has_meta); + + // It's necessary to set `data_frags` all zeros for `RsNf4` as + // `data_frags` has no meta + data_frags.zero_fill(); + + std::vector props(fec.get_n_outputs()); + + buf_rand_data(data_frags); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + for (int i = 0; i < fec.get_n_outputs(); i++) { + props[i] = quadiron::Properties(); + } + data_frags.reset_meta(); + + fec.encode(encoded_frags, props, 0, data_frags); + } + uint64_t end = quadiron::hw_timer(); + + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << text << "\t\t" << n_data << "\t\t" << fft_len - n_data << "\t\t" + << len << "\t\t" << avg_cycles_nb << "\n"; + } + + T q; + unsigned word_size; + std::unique_ptr> distribution; + std::vector arr_vec_len = + {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; + size_t 
iters_nb = 1e1; + std::vector arr_fft_len = {16, 32, 64, 128, 256}; + std::vector arr_k = {8, 16, 32, 64, 128}; +}; + +using AllTypes = ::testing::Types; +TYPED_TEST_CASE(PerfFnt, AllTypes); + +TYPED_TEST(PerfFnt, PerfSimdSingle) // NOLINT +{ + this->core_op_perf_single( + "Add", [](simd::VecType& x, const simd::VecType& y) { + x = simd::add(x, y); + }); + + this->core_op_perf_single( + "Sub", [](simd::VecType& x, const simd::VecType& y) { + x = simd::sub(x, y); + }); + + this->core_op_perf_single( + "Mul", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mul(x, y); + }); + + this->core_op_perf_single( + "Min", [](simd::VecType& x, const simd::VecType& y) { + x = simd::min(x, y); + }); +} + +TYPED_TEST(PerfFnt, PerfSimdBuf) // NOLINT +{ + this->core_op_perf("Add", [](simd::VecType& x, const simd::VecType& y) { + x = simd::add(x, y); + }); + + this->core_op_perf("Sub", [](simd::VecType& x, const simd::VecType& y) { + x = simd::sub(x, y); + }); + + this->core_op_perf("Mul", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mul(x, y); + }); + + this->core_op_perf("Min", [](simd::VecType& x, const simd::VecType& y) { + x = simd::min(x, y); + }); +} + +TYPED_TEST(PerfFnt, PerfModBuf) // NOLINT +{ + this->core_op_perf("ModAdd", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mod_add(x, y); + }); + + this->core_op_perf("ModSub", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mod_sub(x, y); + }); + + this->core_op_perf("ModMul", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mod_mul(x, y); + }); +} + +TYPED_TEST(PerfFnt, PerfPackUnpack) // NOLINT +{ + std::cout << "Pack & Unpack" + << "\n"; + std::cout << "\tVectors nb\t#CPU cycles\n"; + for (const auto vec_len : this->arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + // Init a Buffers to obtain aligned memory + quadiron::vec::Buffers data_buf(1, len); + TypeParam* buf_data = data_buf.get(0); + this->gen_rand_data(buf_data, len); + + 
quadiron::vec::Buffers meta_buf(1, vec_len); + simd::MetaType* buf_meta = meta_buf.get(0); + this->gen_rand_data(buf_meta, vec_len); + + simd::VecType* data = reinterpret_cast(buf_data); + simd::MetaType* meta = buf_meta; + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < this->iters_nb; ++i) { + for (size_t j = 0; j < vec_len; ++j) { + simd::VecType lo, hi; + + simd::VecType x = simd::load_to_reg(&data[j]); + + simd::unpack(meta[j], x, hi, lo); + simd::pack(lo, hi, x, meta[j]); + + simd::store_to_mem(&data[j], x); + } + } + uint64_t end = quadiron::hw_timer(); + double avg_cycles_nb = static_cast(end - start) + / static_cast(this->iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + std::cout << "\n"; +} + +TYPED_TEST(PerfFnt, PerfButterfly) // NOLINT +{ + this->butterfly_perf( + "Butterfly_CT", + [](simd::CtGsCase ct_case, + const simd::VecType& c, + simd::VecType& x, + simd::VecType& y) { + simd::butterfly_ct(ct_case, c, x, y); + }); + + this->butterfly_perf( + "Butterfly_GS", + [](simd::CtGsCase ct_case, + const simd::VecType& c, + simd::VecType& x, + simd::VecType& y) { + simd::butterfly_gs(ct_case, c, x, y); + }); +} + +TYPED_TEST(PerfFnt, PerfFftRadix2) // NOLINT +{ + auto gf(gf::create>(this->q)); + + for (auto fft_len : this->arr_fft_len) { + fft::Radix2 fft_2n(gf, fft_len); + + this->fft_perf( + "FFT", + fft_len, + [&fft_2n]( + vec::Buffers& output, + vec::Buffers& input) { fft_2n.fft(output, input); }); + } +} + +TYPED_TEST(PerfFnt, PerfFntEnc) // NOLINT +{ + std::cout << "FNT performance\n"; + std::cout << "\tType\t\tk\t\tm\t\tpkt_size\t\t#CPU cycles\n"; + + for (auto fft_len : this->arr_fft_len) { + for (auto n_data : this->arr_k) { + if (n_data >= fft_len) { + continue; + } + const int n_parities = fft_len - n_data; + + for (auto vec_len : this->arr_vec_len) { + fec::RsFnt fec( + fec::FecType::NON_SYSTEMATIC, + this->word_size, + n_data, + n_parities, + vec_len); + + 
this->fnt_perf("Enc", fec, fft_len, n_data, vec_len); + } + } + } +} +#endif diff --git a/test/quadiron_c_utest.cpp b/test/quadiron_c_utest.cpp index f87a0e38..9e290db5 100644 --- a/test/quadiron_c_utest.cpp +++ b/test/quadiron_c_utest.cpp @@ -169,22 +169,6 @@ class QuadironCTest : public ::testing::Test { } } - ASSERT_EQ( - quadiron_fnt32_decode( - inst, - _data.data(), - _parity.data(), - missing_idxs.data(), - block_size), - 0); - - for (int i = 0; i < n_data; i++) { - ASSERT_TRUE(std::equal( - _ref_data[i], - _ref_data[i] + block_size, - _data[i] + metadata_size)); - } - for (int i = 0; i < n_data; i++) { if (missing_idxs[i]) { ASSERT_EQ( @@ -213,6 +197,39 @@ class QuadironCTest : public ::testing::Test { } } + for (int i = 0; i < n_data; i++) { + if (missing_idxs[i]) { + std::fill_n(_data[i], block_size + metadata_size, 0); + } + } + + for (int i = 0; i < n_parities; i++) { + if (missing_idxs[n_data + i]) { + std::fill_n(_parity[i], block_size + metadata_size, 0); + } + } + + // FIXME: for non-systematic FNT, `quadiron_fnt32_decode` will + // overwrite `_data` (that stores actually encoded fragments) + // by decoded data. Hence, if we test `reconstruct` after, we could use + // wrong fragments. A quick fix is to move the test of + // `quadiron_fnt32_decode` at the end. + ASSERT_EQ( + quadiron_fnt32_decode( + inst, + _data.data(), + _parity.data(), + missing_idxs.data(), + block_size), + 0); + + for (int i = 0; i < n_data; i++) { + ASSERT_TRUE(std::equal( + _ref_data[i], + _ref_data[i] + block_size, + _data[i] + metadata_size)); + } + quadiron_fnt32_delete(inst); } diff --git a/test/simd/test_simd_fnt.cpp b/test/simd/test_simd_fnt.cpp new file mode 100644 index 00000000..81635f90 --- /dev/null +++ b/test/simd/test_simd_fnt.cpp @@ -0,0 +1,359 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include + +#include +#include + +#include "arith.h" +#include "core.h" +#include "misc.h" + +#ifdef QUADIRON_USE_SIMD + +#include "simd.h" +#include "simd/simd.h" +#include "simd_fnt.h" + +namespace simd = quadiron::simd; + +template +void show(simd::VecType val) +{ + const size_t n = simd::countof(); + T buffer[n]; + simd::store_to_mem(reinterpret_cast(buffer), val); + for (unsigned i = 0; i < n; i++) { + std::cout << unsigned(buffer[i]) << " "; + } + std::cout << "\n"; +} + +template +void dump(T* buf, size_t bytes) +{ + const size_t nb = bytes / sizeof(T); + for (size_t i = 0; i < nb; ++i) { + std::cout << unsigned(buf[i]) << " "; + } + std::cout << "\n"; +} + +template +class SimdTestFnt : public ::testing::Test { + public: + SimdTestFnt() + { + if (sizeof(T) == 2) { + this->q = 257; + } else if (sizeof(T) == 4) { + this->q = static_cast(65537); + } else { + throw "Wrong TypeParam for SimdTestFnt tests"; + } + + this->distribution = + std::make_unique>(0, q - 1); + } + + simd::VecType rand_vec(T lower = 0, T upper_bound = 0) + { + const size_t n = simd::countof(); + T buf[n]; + simd::VecType* vec = reinterpret_cast(buf); + + T bound = upper_bound ? 
upper_bound : q; + bound -= lower; + + for (unsigned i = 0; i < n; i++) { + buf[i] = lower + distribution->operator()(quadiron::prng()) % bound; + } + + return vec[0]; + } + + simd::VecType copy(simd::VecType x) + { + const size_t n = simd::countof(); + T buf[n]; + T val[n]; + simd::VecType* vec = reinterpret_cast(buf); + + simd::store_to_mem(reinterpret_cast(val), x); + std::copy_n(val, n, buf); + + return vec[0]; + } + + bool is_equal(simd::VecType x, simd::VecType y) + { + return simd::is_zero(simd::bit_xor(x, y)); + } + + simd::VecType from_msb8_mask(simd::MetaType meta) + { + const size_t n = simd::countof(); + uint8_t buf[n]; + simd::VecType* vec = reinterpret_cast(buf); + + for (unsigned i = 0; i < n; ++i) { + buf[i] = meta & 1; + meta >>= 1; + } + + return vec[0]; + } + + simd::VecType mod_mul(simd::VecType x, simd::VecType y) + { + const size_t n = simd::countof(); + T _x[n]; + T _y[n]; + T _z[n]; + simd::store_to_mem(reinterpret_cast(_x), x); + simd::store_to_mem(reinterpret_cast(_y), y); + for (unsigned i = 0; i < n; i++) { + _z[i] = (quadiron::DoubleSizeVal(_x[i]) * _y[i]) % q; + } + + simd::VecType* vec = reinterpret_cast(_z); + + return vec[0]; + } + + /* Butterfly Cooley-Tukey operation + * x <- x + c * y + * y <- x - c * y + */ + void butterfly_ct(simd::VecType c, simd::VecType& x, simd::VecType& y) + { + const size_t n = simd::countof(); + T c_buf[n]; + T x_buf[n]; + T y_buf[n]; + + simd::store_to_mem(reinterpret_cast(c_buf), c); + simd::store_to_mem(reinterpret_cast(x_buf), x); + simd::store_to_mem(reinterpret_cast(y_buf), y); + + for (unsigned i = 0; i < n; ++i) { + T mul = (quadiron::DoubleSizeVal(c_buf[i]) * y_buf[i]) % q; + T u = (x_buf[i] + mul) % q; + T v = x_buf[i] >= mul ? 
x_buf[i] - mul : q + x_buf[i] - mul; + + x_buf[i] = u; + y_buf[i] = v; + } + + x = simd::load_to_reg(reinterpret_cast(x_buf)); + y = simd::load_to_reg(reinterpret_cast(y_buf)); + } + + /* Butterfly Gentleman-Sande operation + * x <- x + y + * y <- c * (x - y) + */ + void butterfly_gs(simd::VecType c, simd::VecType& x, simd::VecType& y) + { + const size_t n = simd::countof(); + T c_buf[n]; + T x_buf[n]; + T y_buf[n]; + + simd::store_to_mem(reinterpret_cast(c_buf), c); + simd::store_to_mem(reinterpret_cast(x_buf), x); + simd::store_to_mem(reinterpret_cast(y_buf), y); + + for (unsigned i = 0; i < n; ++i) { + T sub = x_buf[i] >= y_buf[i] ? x_buf[i] - y_buf[i] + : q + x_buf[i] - y_buf[i]; + T u = (x_buf[i] + y_buf[i]) % q; + T v = (quadiron::DoubleSizeVal(c_buf[i]) * sub) % q; + x_buf[i] = u; + y_buf[i] = v; + } + + x = simd::load_to_reg(reinterpret_cast(x_buf)); + y = simd::load_to_reg(reinterpret_cast(y_buf)); + } + + /* Butterfly Gentleman-Sande simple operation where y = 0 + * y <- c * x + */ + void butterfly_simple_gs(simd::VecType c, simd::VecType& x) + { + const size_t n = simd::countof(); + T c_buf[n]; + T x_buf[n]; + + simd::store_to_mem(reinterpret_cast(c_buf), c); + simd::store_to_mem(reinterpret_cast(x_buf), x); + + for (unsigned i = 0; i < n; ++i) { + x_buf[i] = (quadiron::DoubleSizeVal(c_buf[i]) * x_buf[i]) % q; + } + + x = simd::load_to_reg(reinterpret_cast(x_buf)); + } + + T q; + std::unique_ptr> distribution; + std::vector arr_vec_len = + {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; + size_t iters_nb = 1e3; + std::vector arr_fft_len = {128}; +}; + +using AllTypes = ::testing::Types; +TYPED_TEST_CASE(SimdTestFnt, AllTypes); + +TYPED_TEST(SimdTestFnt, TestMetaToMask) // NOLINT +{ + simd::MetaType meta = static_cast(0x89abcdefUL); + simd::VecType got = simd::from_msb8_mask(meta); + simd::VecType expected = this->from_msb8_mask(meta); + + ASSERT_TRUE(this->is_equal(expected, got)); +} + +TYPED_TEST(SimdTestFnt, TestModAddSub) // 
NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + simd::VecType x = this->rand_vec(this->q / 2); + simd::VecType y = this->rand_vec(this->q / 2); + + simd::VecType u = simd::mod_add(x, y); + simd::VecType v = simd::mod_sub(u, x); + simd::VecType z = simd::mod_add(v, x); + + ASSERT_TRUE(this->is_equal(y, v)); + ASSERT_TRUE(this->is_equal(u, z)); + } +} + +TYPED_TEST(SimdTestFnt, TestModNeg) // NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + simd::VecType x = this->rand_vec(); + + simd::VecType y = simd::mod_neg(x); + simd::VecType u = simd::mod_sub(simd::zero(), x); + + ASSERT_TRUE(this->is_equal(u, y)); + } +} + +TYPED_TEST(SimdTestFnt, TestModMul) // NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + simd::VecType x = this->rand_vec(0, this->q - 1); + simd::VecType y = this->rand_vec(); + + // check mod_mul + simd::VecType u = simd::mod_mul(x, y); + simd::VecType v = this->mod_mul(x, y); + ASSERT_TRUE(this->is_equal(v, u)); + + // check mod_mul_safe + simd::VecType a = simd::card_minus_one(); + simd::VecType b = simd::card_minus_one(); + simd::VecType c = this->mod_mul(a, b); + simd::VecType d = simd::mod_mul(a, b); + simd::VecType e = simd::mod_mul_safe(a, b); + + ASSERT_FALSE(this->is_equal(c, d)); + ASSERT_TRUE(this->is_equal(c, e)); + } +} + +TYPED_TEST(SimdTestFnt, TestButterflyCt) // NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + std::vector r_values = { + 1, + static_cast( + this->distribution->operator()(quadiron::prng())), + static_cast(this->q - 1)}; + + for (const TypeParam r : r_values) { + const simd::CtGsCase ct_case = + simd::get_case(r, this->q); + + simd::VecType c = simd::set_one(r); + + simd::VecType x = this->rand_vec(); + simd::VecType y = this->rand_vec(); + simd::VecType x_expected = this->copy(x); + simd::VecType y_expected = this->copy(y); + + this->butterfly_ct(c, x_expected, y_expected); + simd::butterfly_ct(ct_case, c, x, y); + + ASSERT_TRUE(this->is_equal(x_expected, x)); + ASSERT_TRUE(this->is_equal(y_expected, y)); + } + } +} 
+ +TYPED_TEST(SimdTestFnt, TestButterflyGs) // NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + std::vector r_values = { + 1, + static_cast( + this->distribution->operator()(quadiron::prng())), + static_cast(this->q - 1)}; + + for (const TypeParam r : r_values) { + const simd::CtGsCase ct_case = + simd::get_case(r, this->q); + + simd::VecType c = simd::set_one(r); + + simd::VecType x = this->rand_vec(); + simd::VecType y = this->rand_vec(); + simd::VecType x_expected = this->copy(x); + simd::VecType y_expected = this->copy(y); + + this->butterfly_gs(c, x_expected, y_expected); + simd::butterfly_gs(ct_case, c, x, y); + + ASSERT_TRUE(this->is_equal(x_expected, x)); + ASSERT_TRUE(this->is_equal(y_expected, y)); + + this->butterfly_simple_gs(c, x_expected); + simd::butterfly_simple_gs(ct_case, c, x); + + ASSERT_TRUE(this->is_equal(x_expected, x)); + } + } +} + +#endif