diff --git a/.circleci/config.yml b/.circleci/config.yml index 89f1e8a2..05f59074 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -18,7 +18,7 @@ jobs: CXX=/usr/bin/clang++-6.0 cmake -G 'Unix Makefiles' -DCMAKE_BUILD_TYPE=Debug -DUSE_SIMD=SSE .. && make - run: - name: Compile with Clang 6 (Debug mode/SSE) + name: Compile with Clang 6 (Debug mode/AVX) command: > rm -rf build && mkdir build && cd build && CXX=/usr/bin/clang++-6.0 cmake -G 'Unix Makefiles' -DCMAKE_BUILD_TYPE=Debug -DUSE_SIMD=AVX .. && diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh index 9658c255..b64fe42e 100755 --- a/scripts/benchmark.sh +++ b/scripts/benchmark.sh @@ -44,24 +44,22 @@ fi chunk_size=51200 # for rs-fnt with different packet sizes -word_size=2 for word_size in 1 2; do - for type_size in 2 4; do - max_len=$((256**word_size)) - if ((type_size>word_size)); then - for ec_type in rs-fnt rs-fnt-sys; do - for k in 16 64; do - for n in 32 256 1024; do - if ((nk)); then - m=$((n-k)) - for pkt_size in 512; do - ${bin} -e ${ec_type} -w ${word_size} -t ${type_size} -k ${k} -m ${m} -c ${chunk_size} -s ${sce_type} -g ${threads_nb} -f ${show_type} -p ${pkt_size} -n ${samples_nb} - show_type=0 - done - fi + type_size=$((word_size*2)) + max_len=$((256**word_size)) + if ((type_size>word_size)); then + for ec_type in rs-fnt rs-fnt-sys; do + for k in 16 64; do + for n in 32 256 1024; do + if ((nk)); then + m=$((n-k)) + for pkt_size in 512; do + ${bin} -e ${ec_type} -w ${word_size} -t ${type_size} -k ${k} -m ${m} -c ${chunk_size} -s ${sce_type} -g ${threads_nb} -f ${show_type} -p ${pkt_size} -n ${samples_nb} + show_type=0 done - done + fi done - fi - done + done + done + fi done diff --git a/scripts/test_ec.sh b/scripts/test_ec.sh index 4b133150..c1767187 100755 --- a/scripts/test_ec.sh +++ b/scripts/test_ec.sh @@ -174,7 +174,7 @@ do fec_type=$(echo $i|cut -d_ -f1) word_size=$(echo $i|cut -d_ -f2) - do_test enconly ${fec_type} ${word_size} 50 50 "" "" +# do_test enconly ${fec_type} 
${word_size} 50 50 "" "" do_test all ${fec_type} ${word_size} 3 3 "" "" do_test all ${fec_type} ${word_size} 3 3 "0 1" "0" do_test all ${fec_type} ${word_size} 3 5 "0 1" "0" diff --git a/src/fec_base.h b/src/fec_base.h index 2802c007..fcb4b7a3 100644 --- a/src/fec_base.h +++ b/src/fec_base.h @@ -118,7 +118,8 @@ class FecCode { unsigned word_size, unsigned n_data, unsigned n_parities, - size_t pkt_size = 8); + size_t pkt_size = 8, + bool use_meta_buf = false); virtual ~FecCode() = default; /** Return the number of output parts. @@ -173,8 +174,8 @@ class FecCode { bool readw(T* ptr, std::istream* stream); bool writew(T val, std::ostream* stream); - bool read_pkt(char* pkt, std::istream& stream); - bool write_pkt(char* pkt, std::ostream& stream, size_t bytes); + bool read_pkt(char* pkt, std::istream* stream); + bool write_pkt(char* pkt, std::ostream* stream, size_t bytes); void encode_streams_horizontal( std::vector input_data_bufs, @@ -250,6 +251,9 @@ class FecCode { std::unique_ptr> r_powers = nullptr; // buffers for intermediate symbols used for systematic FNT std::unique_ptr> dec_inter_codeword; + // vector for intermediate symbols used for systematic FNT + std::unique_ptr> vec_dec_inter_codeword; + bool use_meta_buf = false; // pure abstract methods that will be defined in derived class virtual void check_params() = 0; @@ -300,7 +304,8 @@ FecCode::FecCode( unsigned word_size, unsigned n_data, unsigned n_parities, - size_t pkt_size) + size_t pkt_size, + bool use_meta_buf) { assert(type == FecType::SYSTEMATIC || type == FecType::NON_SYSTEMATIC); @@ -312,7 +317,8 @@ FecCode::FecCode( this->n_outputs = (type == FecType::SYSTEMATIC) ? this->n_parities : this->code_len; this->pkt_size = pkt_size; - this->buf_size = pkt_size * word_size; + this->buf_size = use_meta_buf ? 
pkt_size * sizeof(T) : pkt_size * word_size; + this->use_meta_buf = use_meta_buf; } template @@ -384,15 +390,15 @@ inline bool FecCode::writew(T val, std::ostream* stream) } template -inline bool FecCode::read_pkt(char* pkt, std::istream& stream) +inline bool FecCode::read_pkt(char* pkt, std::istream* stream) { - return static_cast(stream.read(pkt, buf_size)); + return static_cast(stream->read(pkt, buf_size)); } template -inline bool FecCode::write_pkt(char* pkt, std::ostream& stream, size_t bytes) +inline bool FecCode::write_pkt(char* pkt, std::ostream* stream, size_t bytes) { - return static_cast(stream.write(pkt, bytes)); + return static_cast(stream->write(pkt, bytes)); } /** Encode streams @@ -477,21 +483,15 @@ void FecCode::encode_streams_vertical( bool cont = true; off_t offset = 0; - // vector of buffers storing data read from chunk - vec::Buffers words_char(n_data, buf_size); - const std::vector words_mem_char = words_char.get_mem(); // vector of buffers storing data that are performed in encoding, i.e. FFT - vec::Buffers words(n_data, pkt_size); - const std::vector words_mem_T = words.get_mem(); + vec::Buffers words(n_data, pkt_size, use_meta_buf); + const std::vector& words_mem = words.get_mem(); int output_len = get_n_outputs(); // vector of buffers storing data that are performed in encoding, i.e. 
FFT - vec::Buffers output(output_len, pkt_size); - const std::vector output_mem_T = output.get_mem(); - // vector of buffers storing data in output chunk - vec::Buffers output_char(output_len, buf_size); - const std::vector output_mem_char = output_char.get_mem(); + vec::Buffers output(output_len, pkt_size, use_meta_buf); + const std::vector& output_mem = output.get_mem(); reset_stats_enc(); @@ -501,11 +501,13 @@ void FecCode::encode_streams_vertical( while (cont) { for (unsigned i = 0; i < n_data; i++) { - if (!read_pkt(words_mem_char.at(i), *(input_data_bufs[i]))) { + if (!read_pkt( + reinterpret_cast(words_mem.at(i)), + input_data_bufs[i])) { read_bytes = input_data_bufs[i]->gcount(); // Zero-out trailing part std::fill_n( - words_mem_char.at(i) + read_bytes, + reinterpret_cast(words_mem.at(i)) + read_bytes, buf_size - read_bytes, 0); @@ -517,8 +519,9 @@ void FecCode::encode_streams_vertical( break; } - vec::pack( - words_mem_char, words_mem_T, n_data, pkt_size, word_size); + if (use_meta_buf) { + words.reset_meta(); + } timeval t1 = tick(); uint64_t start = hw_timer(); @@ -530,12 +533,11 @@ void FecCode::encode_streams_vertical( total_encode_cycles += (end - start) / buf_size; n_encode_ops++; - vec::unpack( - output_mem_T, output_mem_char, output_len, pkt_size, word_size); - for (unsigned i = 0; i < n_outputs; i++) { write_pkt( - output_mem_char.at(i), *(output_parities_bufs[i]), read_bytes); + reinterpret_cast(output_mem.at(i)), + output_parities_bufs[i], + read_bytes); } offset += pkt_size; } @@ -749,6 +751,11 @@ void FecCode::decode( // Lagrange interpolation decode_apply(context, output, words); + + if (type == FecType::SYSTEMATIC) { + this->fft->fft(*vec_dec_inter_codeword, output); + output.copy(vec_dec_inter_codeword.get(), n_data); + } } /* Initialize context for decoding @@ -804,7 +811,13 @@ void FecCode::decode_prepare( { const vec::Vector& fragments_ids = context.get_fragments_id(); for (unsigned i = 0; i < this->n_data; ++i) { - const int j = 
fragments_ids.get(i); + unsigned j = fragments_ids.get(i); + if (type == FecType::SYSTEMATIC && j < this->n_data) { + continue; + } + if (type == FecType::SYSTEMATIC) { + j -= this->n_data; + } if (props[j].is_marked(context.props_indices[j], offset)) { // Check if the symbol is a special case whick is marked by // `OOR_MARK`, i.e. true. Note: this check is necessary when @@ -957,21 +970,15 @@ bool FecCode::decode_streams_vertical( decode_build(); - // vector of buffers storing data read from chunk - vec::Buffers words_char(n_data, buf_size); - const std::vector words_mem_char = words_char.get_mem(); // vector of buffers storing data that are performed in encoding, i.e. FFT - vec::Buffers words(n_data, pkt_size); - const std::vector words_mem_T = words.get_mem(); + vec::Buffers words(n_data, pkt_size, use_meta_buf); + const std::vector& words_mem = words.get_mem(); int output_len = n_data; // vector of buffers storing data that are performed in decoding, i.e. FFT - vec::Buffers output(output_len, pkt_size); - const std::vector output_mem_T = output.get_mem(); - // vector of buffers storing data in output chunk - vec::Buffers output_char(output_len, buf_size); - const std::vector output_mem_char = output_char.get_mem(); + vec::Buffers output(output_len, pkt_size, use_meta_buf); + const std::vector& output_mem = output.get_mem(); std::unique_ptr> context = init_context_dec( fragments_ids, input_parities_props, pkt_size, &output); @@ -987,11 +994,12 @@ bool FecCode::decode_streams_vertical( for (unsigned i = 0; i < avail_data_nb; i++) { unsigned data_idx = fragments_ids.get(i); if (!read_pkt( - words_mem_char.at(i), *(input_data_bufs[data_idx]))) { + reinterpret_cast(words_mem.at(i)), + input_data_bufs[data_idx])) { read_bytes = input_data_bufs[data_idx]->gcount(); // Zero-out trailing part std::fill_n( - words_mem_char.at(i) + read_bytes, + reinterpret_cast(words_mem.at(i)) + read_bytes, buf_size - read_bytes, 0); @@ -1002,12 +1010,13 @@ bool 
FecCode::decode_streams_vertical( for (unsigned i = 0; i < n_data - avail_data_nb; ++i) { unsigned parity_idx = avail_parity_ids.get(i); if (!read_pkt( - words_mem_char.at(avail_data_nb + i), - *(input_parities_bufs[parity_idx]))) { + reinterpret_cast(words_mem.at(avail_data_nb + i)), + input_parities_bufs[parity_idx])) { read_bytes = input_parities_bufs[parity_idx]->gcount(); // Zero-out trailing part std::fill_n( - words_mem_char.at(avail_data_nb + i) + read_bytes, + reinterpret_cast(words_mem.at(avail_data_nb + i)) + + read_bytes, buf_size - read_bytes, 0); @@ -1019,8 +1028,9 @@ bool FecCode::decode_streams_vertical( break; } - vec::pack( - words_mem_char, words_mem_T, n_data, pkt_size, word_size); + if (use_meta_buf) { + words.reset_meta(); + } timeval t1 = tick(); uint64_t start = hw_timer(); @@ -1032,13 +1042,12 @@ bool FecCode::decode_streams_vertical( total_decode_cycles += (end - start) / word_size; n_decode_ops++; - vec::unpack( - output_mem_T, output_mem_char, output_len, pkt_size, word_size); - for (unsigned i = 0; i < n_data; i++) { if (output_data_bufs[i] != nullptr) { write_pkt( - output_mem_char.at(i), *(output_data_bufs[i]), read_bytes); + reinterpret_cast(output_mem.at(i)), + output_data_bufs[i], + read_bytes); } } offset += pkt_size; @@ -1080,23 +1089,18 @@ void FecCode::encode_blocks_vertical( } size_t offset = 0; - size_t block_size = block_size_bytes / word_size; + const size_t element_size = use_meta_buf ? sizeof(T) : word_size; + size_t block_size = block_size_bytes / element_size; - // vector of buffers storing data read from chunk - vec::Buffers words_char(n_data, buf_size); - const std::vector words_mem_char = words_char.get_mem(); // vector of buffers storing data that are performed in encoding, i.e. 
FFT - vec::Buffers words(n_data, pkt_size); - const std::vector words_mem_T = words.get_mem(); + vec::Buffers words(n_data, pkt_size, use_meta_buf); + const std::vector& words_mem = words.get_mem(); int output_len = get_n_outputs(); // vector of buffers storing data that are performed in encoding, i.e. FFT - vec::Buffers output(output_len, pkt_size); - const std::vector output_mem_T = output.get_mem(); - // vector of buffers storing data in output chunk - vec::Buffers output_char(output_len, buf_size); - const std::vector output_mem_char = output_char.get_mem(); + vec::Buffers output(output_len, pkt_size, use_meta_buf); + const std::vector& output_mem = output.get_mem(); reset_stats_enc(); @@ -1105,25 +1109,26 @@ void FecCode::encode_blocks_vertical( size_t copy_size = std::min(pkt_size, remain_size); for (unsigned i = 0; i < n_data; i++) { memcpy( - reinterpret_cast(words_mem_char.at(i)), - data_bufs[i] + offset * word_size, - copy_size * word_size); + reinterpret_cast(words_mem.at(i)), + data_bufs[i] + offset * element_size, + copy_size * element_size); } // Zero-out trailing part of data if (copy_size < pkt_size) { - const size_t copy_bytes = copy_size * word_size; + const size_t copy_bytes = copy_size * element_size; const size_t trailing_bytes = buf_size - copy_bytes; for (unsigned i = 0; i < n_data; i++) { memset( - reinterpret_cast(words_mem_char.at(i)) + copy_bytes, + reinterpret_cast(words_mem.at(i)) + copy_bytes, 0, trailing_bytes); } } - vec::pack( - words_mem_char, words_mem_T, n_data, pkt_size, word_size); + if (use_meta_buf) { + words.reset_meta(); + } timeval t1 = tick(); uint64_t start = hw_timer(); @@ -1132,18 +1137,15 @@ void FecCode::encode_blocks_vertical( uint64_t t2 = hrtime_usec(t1); total_enc_usec += t2; - total_encode_cycles += (end - start) / (copy_size * word_size); + total_encode_cycles += (end - start) / (copy_size * element_size); n_encode_ops++; - vec::unpack( - output_mem_T, output_mem_char, output_len, pkt_size, word_size); - for 
(unsigned i = 0; i < n_outputs; i++) { if (wanted_idxs[i]) { memcpy( - parities_bufs[i] + offset * word_size, - reinterpret_cast(output_mem_char.at(i)), - copy_size * word_size); + parities_bufs[i] + offset * element_size, + reinterpret_cast(output_mem.at(i)), + copy_size * element_size); } } offset += pkt_size; @@ -1183,7 +1185,8 @@ bool FecCode::decode_blocks_vertical( size_t block_size_bytes) { size_t offset = 0; - size_t block_size = block_size_bytes / word_size; + const size_t element_size = use_meta_buf ? sizeof(T) : word_size; + size_t block_size = block_size_bytes / element_size; unsigned fragment_index = 0; unsigned parity_index = 0; @@ -1237,21 +1240,15 @@ bool FecCode::decode_blocks_vertical( decode_build(); - // vector of buffers storing data read from chunk - vec::Buffers words_char(n_data, buf_size); - const std::vector words_mem_char = words_char.get_mem(); // vector of buffers storing data that are performed in encoding, i.e. FFT - vec::Buffers words(n_data, pkt_size); - const std::vector words_mem_T = words.get_mem(); + vec::Buffers words(n_data, pkt_size, use_meta_buf); + const std::vector& words_mem = words.get_mem(); int output_len = n_data; // vector of buffers storing data that are performed in decoding, i.e. 
FFT - vec::Buffers output(output_len, pkt_size); - const std::vector output_mem_T = output.get_mem(); - // vector of buffers storing data in output chunk - vec::Buffers output_char(output_len, buf_size); - const std::vector output_mem_char = output_char.get_mem(); + vec::Buffers output(output_len, pkt_size, use_meta_buf); + const std::vector& output_mem = output.get_mem(); std::unique_ptr> context = init_context_dec(fragments_ids, parities_props, pkt_size, &output); @@ -1265,33 +1262,34 @@ bool FecCode::decode_blocks_vertical( for (unsigned i = 0; i < avail_data_nb; i++) { unsigned data_idx = fragments_ids.get(i); memcpy( - reinterpret_cast(words_mem_char.at(i)), - data_bufs[data_idx] + offset * word_size, - copy_size * word_size); + reinterpret_cast(words_mem.at(i)), + data_bufs[data_idx] + offset * element_size, + copy_size * element_size); } } for (unsigned i = 0; i < n_data - avail_data_nb; ++i) { unsigned parity_idx = avail_parity_ids.get(i); memcpy( - reinterpret_cast(words_mem_char.at(avail_data_nb + i)), - parities_bufs[parity_idx] + offset * word_size, - copy_size * word_size); + reinterpret_cast(words_mem.at(avail_data_nb + i)), + parities_bufs[parity_idx] + offset * element_size, + copy_size * element_size); } // Zero-out trailing part of data if (copy_size < pkt_size) { - const size_t copy_bytes = copy_size * word_size; + const size_t copy_bytes = copy_size * element_size; const size_t trailing_bytes = buf_size - copy_bytes; for (unsigned i = 0; i < n_data; i++) { memset( - reinterpret_cast(words_mem_char.at(i)) + copy_bytes, + reinterpret_cast(words_mem.at(i)) + copy_bytes, 0, trailing_bytes); } } - vec::pack( - words_mem_char, words_mem_T, n_data, pkt_size, word_size); + if (use_meta_buf) { + words.reset_meta(); + } timeval t1 = tick(); uint64_t start = hw_timer(); @@ -1300,18 +1298,15 @@ bool FecCode::decode_blocks_vertical( uint64_t t2 = hrtime_usec(t1); total_dec_usec += t2; - total_decode_cycles += (end - start) / word_size; + total_decode_cycles 
+= (end - start) / element_size; n_decode_ops++; - vec::unpack( - output_mem_T, output_mem_char, output_len, pkt_size, word_size); - for (unsigned i = 0; i < n_data; i++) { if (wanted_idxs[i]) { memcpy( - data_bufs[i] + offset * word_size, - reinterpret_cast(output_mem_char.at(i)), - copy_size * word_size); + data_bufs[i] + offset * element_size, + reinterpret_cast(output_mem.at(i)), + copy_size * element_size); } } offset += pkt_size; @@ -1388,15 +1383,23 @@ void FecCode::decode_prepare( // As loc.offset := offset + j const size_t j = (loc_offset - offset); - // Check if the symbol is a special case whick is marked by - // `OOR_MARK`. - // Note: this check is necessary when word_size is not large - // enough to cover all symbols of the field. Following check is - // used for FFT over FNT where the single special case symbol - // equals card - 1 - if (props[frag_id].marker(context.props_indices.at(frag_id)) - == OOR_MARK) { - chunk[j] = thres; + if (use_meta_buf) { + T meta = + props[frag_id].marker(context.props_indices.at(frag_id)); + if (meta) { + words.set_meta(i, j, meta); + } + } else { + // Check if the symbol is a special case whick is marked by + // `OOR_MARK`. + // Note: this check is necessary when word_size is not large + // enough to cover all symbols of the field. 
Following check is + // used for FFT over FNT where the single special case symbol + // equals card - 1 + if (props[frag_id].marker(context.props_indices.at(frag_id)) + == OOR_MARK) { + chunk[j] = thres; + } } context.props_indices.at(frag_id)++; } diff --git a/src/fec_context.h b/src/fec_context.h index 0e153267..03c9560c 100644 --- a/src/fec_context.h +++ b/src/fec_context.h @@ -117,11 +117,14 @@ class DecodeContext { } else { assert(output != nullptr); + bool b_meta = output->has_meta(); + // Buffers each of which is fully allocated // Buffer of length `len_2k` - buf1_2k = std::make_unique>(len_2k, size); + buf1_2k = std::make_unique>(len_2k, size, b_meta); // Buffer of length `max_n_2k - k` - bNmK = std::make_unique>(max_n_2k - k, size); + bNmK = + std::make_unique>(max_n_2k - k, size, b_meta); // Buffers that are derived from the two above ones // Buffer sliced from `k` first elements of `buf1_2k` diff --git a/src/fec_rs_fnt.h b/src/fec_rs_fnt.h index 730d08df..0996b39c 100644 --- a/src/fec_rs_fnt.h +++ b/src/fec_rs_fnt.h @@ -60,6 +60,13 @@ class RsFnt : public FecCode { // decoding context used in encoding of systematic FNT std::unique_ptr> enc_context; + // vector for intermediate symbols used for systematic FNT + std::unique_ptr> vec_inter_words; + // vector of `n` symbols used for systematic FNT + std::unique_ptr> vec_inter_codeword; + // decoding context used in horizontal encoding of systematic FNT + std::unique_ptr> vec_enc_context; + // Indices used for accelerated functions size_t simd_vec_len; size_t simd_trailing_len; @@ -72,7 +79,7 @@ class RsFnt : public FecCode { unsigned n_data, unsigned n_parities, size_t pkt_size = 8) - : FecCode(type, word_size, n_data, n_parities, pkt_size) + : FecCode(type, word_size, n_data, n_parities, pkt_size, true) { this->fec_init(); @@ -143,18 +150,29 @@ class RsFnt : public FecCode { enc_frag_ids->set(i, i); } - inter_words = - std::make_unique>(this->n_data, this->pkt_size); + inter_words = std::make_unique>( + 
this->n_data, this->pkt_size, true); suffix_words = std::make_unique>( - this->n - this->n_data - this->n_outputs, this->pkt_size); + this->n - this->n_data - this->n_outputs, this->pkt_size, true); std::vector dummy_props; enc_context = this->init_context_dec( *enc_frag_ids, dummy_props, this->pkt_size, inter_words.get()); // for decoding - this->dec_inter_codeword = - std::make_unique>(this->n, this->pkt_size); + this->dec_inter_codeword = std::make_unique>( + this->n, this->pkt_size, true); + + vec_inter_words = + std::make_unique>(*(this->gf), this->n_data); + vec_inter_codeword = + std::make_unique>(*(this->gf), this->n); + vec_enc_context = + this->init_context_dec(*enc_frag_ids, dummy_props); + + // for decoding + this->vec_dec_inter_codeword = + std::make_unique>(*(this->gf), this->n); } } @@ -177,7 +195,14 @@ class RsFnt : public FecCode { off_t offset, vec::Vector& words) override { - this->fft->fft(output, words); + if (this->type == FecType::SYSTEMATIC) { + this->decode_apply(*vec_enc_context, *vec_inter_words, words); + this->fft->fft(*vec_inter_codeword, *vec_inter_words); + output.copy( + vec_inter_codeword.get(), this->n_parities, 0, this->n_data); + } else { + this->fft->fft(output, words); + } encode_post_process(output, props, offset); } @@ -237,6 +262,7 @@ class RsFnt : public FecCode { { if (this->type == FecType::SYSTEMATIC) { decode_data(*enc_context, *inter_words, words); + // this->decode_apply(*enc_context, *inter_words, words); vec::Buffers _tmp(words, output); vec::Buffers _output(_tmp, *suffix_words); this->fft->fft(_output, *inter_words); @@ -251,14 +277,26 @@ class RsFnt : public FecCode { std::vector& props, off_t offset) override { - // check for out of range value in output unsigned size = output.get_size(); - T thres = (this->gf->card() - 1); - for (unsigned i = 0; i < this->n_outputs; ++i) { - T* chunk = output.get(i); - for (unsigned j = 0; j < size; ++j) { - if (chunk[j] & thres) { - props[i].add(offset + j, OOR_MARK); + if 
(output.has_meta()) { + // check meta of elements + for (unsigned i = 0; i < this->n_outputs; ++i) { + for (unsigned j = 0; j < size; ++j) { + T meta = output.get_meta(i, j); + if (meta) { + props[i].add(offset + j, meta); + } + } + } + } else { + // check for out of range value in output + T thres = (this->gf->card() - 1); + for (unsigned i = 0; i < this->n_outputs; ++i) { + T* chunk = output.get(i); + for (unsigned j = 0; j < size; ++j) { + if (chunk[j] & thres) { + props[i].add(offset + j, OOR_MARK); + } } } } diff --git a/src/fec_rs_nf4.h b/src/fec_rs_nf4.h index 67893405..37217a35 100644 --- a/src/fec_rs_nf4.h +++ b/src/fec_rs_nf4.h @@ -110,6 +110,9 @@ class RsNf4 : public FecCode { for (unsigned i = 0; i < this->n; i++) { this->r_powers->set(i, ngff4->exp(this->r, i)); } + + work_buf = + std::make_unique>(this->n_data, this->pkt_size); } int get_n_outputs() override @@ -168,6 +171,7 @@ class RsNf4 : public FecCode { } private: + std::unique_ptr> work_buf; const gf::Field* sub_field; gf::NF4* ngff4; int gf_n; @@ -258,13 +262,28 @@ class RsNf4 : public FecCode { off_t offset, vec::Buffers& words) override { + // as Buffers has not meta, `words` contains only `buf_size = pkt_size * + // word_size` bytes from source. It is stored in first `buf_size / + // sizeof(T)` elements of `words` + + const unsigned nb_words_per_element = sizeof(T) / this->word_size; for (unsigned i = 0; i < this->n_data; ++i) { T* chunk = words.get(i); - for (size_t j = 0; j < this->pkt_size; ++j) { - chunk[j] = ngff4->pack(chunk[j]); + T* work = work_buf->get(i); + + size_t u = 0; + T element = chunk[u]; + for (size_t j = 0, u = 0; j < this->pkt_size; ++j) { + work[j] = ngff4->pack(element); + + (j + 1) % nb_words_per_element == 0 + ? 
element = chunk[++u] + : element = static_cast(element) + >> (CHAR_BIT * this->word_size); } } - this->fft->fft(output, words); + + this->fft->fft(output, *work_buf); encode_post_process(output, props, offset); } @@ -273,17 +292,42 @@ class RsNf4 : public FecCode { std::vector& props, off_t offset) override { - size_t size = output.get_size(); + // as Buffers has not meta, output write only `buf_size = pkt_size * + // word_size` bytes to destination + // This data should be stored in first `buf_size / sizeof(T)` elements. + + const unsigned nb_words_per_element = sizeof(T) / this->word_size; + const size_t size = output.get_size(); GroupedValues true_val; for (unsigned frag_id = 0; frag_id < this->code_len; ++frag_id) { T* chunk = output.get(frag_id); - for (size_t symb_id = 0; symb_id < size; symb_id++) { + + size_t out_symb_id = 0; + unsigned symb_offset = 0; + T element = 0; + for (size_t symb_id = 0; symb_id < size; ++symb_id) { ngff4->unpack(chunk[symb_id], true_val); if (true_val.flag > 0) { const off_t loc = offset + symb_id; props[frag_id].add(loc, true_val.flag); } - chunk[symb_id] = true_val.values; + + // for test + chunk[symb_id] = 0; + + element |= (static_cast(true_val.values) << symb_offset); + if ((symb_id + 1) % nb_words_per_element == 0) { + chunk[out_symb_id] = element; + out_symb_id++; + symb_offset = 0; + element = 0; + } else { + symb_offset += (CHAR_BIT * this->word_size); + } + } + // for the last no-full element + if (symb_offset > 0) { + chunk[out_symb_id] = element; } } } @@ -295,23 +339,35 @@ class RsNf4 : public FecCode { vec::Buffers& words) override { const vec::Vector& fragments_ids = context.get_fragments_id(); + // as Buffers has not meta, `words` contains only `buf_size = pkt_size * + // word_size` bytes from source. 
It is stored in first `buf_size / + // sizeof(T)` elements of `words` + + const unsigned nb_words_per_element = sizeof(T) / this->word_size; for (unsigned i = 0; i < this->n_data; ++i) { const int frag_id = fragments_ids.get(i); T* chunk = words.get(i); + T* work = work_buf->get(i); - // pack marked symbols - for (size_t index = 0; index < this->pkt_size; ++index) { + size_t u = 0; + T element = chunk[u]; + for (size_t j = 0, u = 0; j < this->pkt_size; ++j) { if (props[frag_id].is_marked( - context.props_indices[frag_id], offset + index)) { + context.props_indices[frag_id], offset + j)) { // pack marked symbol - chunk[index] = ngff4->pack( - chunk[index], + work[j] = ngff4->pack( + element, props[frag_id].marker(context.props_indices[frag_id])); context.props_indices.at(frag_id)++; } else { // pack un-marked symbol - chunk[index] = ngff4->pack(chunk[index]); + work[j] = ngff4->pack(element); } + + (j + 1) % nb_words_per_element == 0 + ? element = chunk[++u] + : element = static_cast(element) + >> (CHAR_BIT * this->word_size); } } } @@ -319,15 +375,42 @@ class RsNf4 : public FecCode { void decode_apply( DecodeContext& context, vec::Buffers& output, - vec::Buffers& words) override + vec::Buffers&) override { + // as Buffers has not meta, output write only `buf_size = pkt_size * + // word_size` bytes to destination + // This data should be stored in first `buf_size / sizeof(T)` elements. 
+ // decode_apply: do the same thing as in fec_base - FecCode::decode_apply(context, output, words); + FecCode::decode_apply(context, output, *work_buf); + + const unsigned nb_words_per_element = sizeof(T) / this->word_size; + GroupedValues true_val; // unpack decoded symbols - for (unsigned i = 0; i < this->n_data; ++i) { - T* chunk = output.get(i); - for (unsigned j = 0; j < this->pkt_size; ++j) { - chunk[j] = ngff4->unpack(chunk[j]).values; + for (unsigned frag_id = 0; frag_id < this->n_data; ++frag_id) { + T* chunk = output.get(frag_id); + + size_t out_symb_id = 0; + unsigned symb_offset = 0; + T element = 0; + for (size_t symb_id = 0; symb_id < this->pkt_size; ++symb_id) { + ngff4->unpack(chunk[symb_id], true_val); + // for test + chunk[symb_id] = 0; + + element |= (static_cast(true_val.values) << symb_offset); + if ((symb_id + 1) % nb_words_per_element == 0) { + chunk[out_symb_id] = element; + out_symb_id++; + symb_offset = 0; + element = 0; + } else { + symb_offset += (CHAR_BIT * this->word_size); + } + } + // for the last no-full element + if (symb_offset > 0) { + chunk[out_symb_id] = element; } } } diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index ed82fab8..bc412fd2 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -50,18 +50,16 @@ void RsFnt::encode_post_process( off_t offset) { size_t size = this->pkt_size; - uint16_t threshold = this->gf->card_minus_one(); unsigned code_len = this->n_outputs; - simd::encode_post_process( - output, props, offset, code_len, threshold, simd_vec_len); + simd::encode_post_process(output, props, offset, code_len, simd_vec_len); if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { - uint16_t* chunk = output.get(i); - for (size_t j = simd_offset; j < size; ++j) { - if (chunk[j] == threshold) { - props[i].add(offset + j, OOR_MARK); + for (unsigned j = simd_offset; j < size; ++j) { + uint16_t meta = output.get_meta(i, j); + if (meta) { + props[i].add(offset + j, 
meta); } } } @@ -75,18 +73,16 @@ void RsFnt::encode_post_process( off_t offset) { const size_t size = this->pkt_size; - const uint32_t threshold = this->gf->card_minus_one(); const unsigned code_len = this->n_outputs; - simd::encode_post_process( - output, props, offset, code_len, threshold, simd_vec_len); + simd::encode_post_process(output, props, offset, code_len, simd_vec_len); if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { - uint32_t* chunk = output.get(i); - for (size_t j = simd_offset; j < size; ++j) { - if (chunk[j] == threshold) { - props[i].add(offset + j, OOR_MARK); + for (unsigned j = simd_offset; j < size; ++j) { + uint32_t meta = output.get_meta(i, j); + if (meta) { + props[i].add(offset + j, meta); } } } diff --git a/src/fft_2n.h b/src/fft_2n.h index 1862362d..ceab08c6 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -465,14 +465,36 @@ void Radix2::butterfly_ct_step_slow( unsigned step, size_t offset) { - for (int i = start; i < this->n; i += step) { - T* a = buf.get(i); - T* b = buf.get(i + m); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = offset; j < this->pkt_size; ++j) { - T x = this->gf->mul(coef, b[j]); - b[j] = this->gf->sub(a[j], x); - a[j] = this->gf->add(a[j], x); + if (buf.has_meta()) { + for (int i = start; i < this->n; i += step) { + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T a_lo = 0, a_hi = 0, b_lo = 0, b_hi = 0; + buf.get(i, j, a_hi, a_lo); + buf.get(i + m, j, b_hi, b_lo); + + T x = this->gf->mul(coef, b_lo); + b_lo = this->gf->sub(a_lo, x); + a_lo = this->gf->add(a_lo, x); + + x = this->gf->mul(coef, b_hi); + b_hi = this->gf->sub(a_hi, x); + a_hi = this->gf->add(a_hi, x); + + buf.set(i, j, a_hi, a_lo); + buf.set(i + m, j, b_hi, b_lo); + } + } + } else { + for (int i = start; i < this->n; i += step) { + T* a = buf.get(i); + T* b = buf.get(i + m); + // perform butterfly operation for Cooley-Tukey FFT 
algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T x = this->gf->mul(coef, b[j]); + b[j] = this->gf->sub(a[j], x); + a[j] = this->gf->add(a[j], x); + } } } } @@ -564,14 +586,36 @@ void Radix2::butterfly_gs_step_slow( unsigned step, size_t offset) { - for (int i = start; i < this->n; i += step) { - T* a = buf.get(i); - T* b = buf.get(i + m); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = offset; j < this->pkt_size; ++j) { - T x = this->gf->sub(a[j], b[j]); - a[j] = this->gf->add(a[j], b[j]); - b[j] = this->gf->mul(coef, x); + if (buf.has_meta()) { + for (int i = start; i < this->n; i += step) { + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T a_lo = 0, a_hi = 0, b_lo = 0, b_hi = 0; + buf.get(i, j, a_hi, a_lo); + buf.get(i + m, j, b_hi, b_lo); + + T x = this->gf->sub(a_lo, b_lo); + a_lo = this->gf->add(a_lo, b_lo); + b_lo = this->gf->mul(coef, x); + + x = this->gf->sub(a_hi, b_hi); + a_hi = this->gf->add(a_hi, b_hi); + b_hi = this->gf->mul(coef, x); + + buf.set(i, j, a_hi, a_lo); + buf.set(i + m, j, b_hi, b_lo); + } + } + } else { + for (int i = start; i < this->n; i += step) { + T* a = buf.get(i); + T* b = buf.get(i + m); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T x = this->gf->sub(a[j], b[j]); + a[j] = this->gf->add(a[j], b[j]); + b[j] = this->gf->mul(coef, x); + } } } } @@ -585,12 +629,28 @@ void Radix2::butterfly_gs_step_simple_slow( unsigned step, size_t offset) { - for (int i = start; i < this->n; i += step) { - T* a = buf.get(i); - T* b = buf.get(i + m); - // perform butterfly operation for Cooley-Tukey FFT algorithm - for (size_t j = offset; j < this->pkt_size; ++j) { - b[j] = this->gf->mul(coef, a[j]); + if (buf.has_meta()) { + for (int i = start; i < this->n; i += step) { + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; 
j < this->pkt_size; ++j) { + T a_lo = 0, a_hi = 0, b_lo = 0, b_hi = 0; + buf.get(i, j, a_hi, a_lo); + buf.get(i + m, j, b_hi, b_lo); + + b_lo = this->gf->mul(coef, a_lo); + b_hi = this->gf->mul(coef, a_hi); + + buf.set(i + m, j, b_hi, b_lo); + } + } + } else { + for (int i = start; i < this->n; i += step) { + T* a = buf.get(i); + T* b = buf.get(i + m); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + b[j] = this->gf->mul(coef, a[j]); + } } } } diff --git a/src/fft_naive.h b/src/fft_naive.h index cf312f21..16c12dc0 100644 --- a/src/fft_naive.h +++ b/src/fft_naive.h @@ -148,16 +148,35 @@ void Naive::_fft( vec::Buffers& input, vec::Matrix* _W) { + assert(output.has_meta() == input.has_meta()); const unsigned len = this->n; const unsigned size = this->pkt_size; - for (unsigned i = 0; i < len; ++i) { - output.fill(i, 0); - T* buf = output.get(i); - for (unsigned j = 0; j < len; ++j) { - T* ibuf = input.get(j); - T r = _W->get(i, j); - for (unsigned u = 0; u < size; ++u) { - buf[u] = this->gf->add(buf[u], this->gf->mul(r, ibuf[u])); + + if (output.has_meta()) { + for (unsigned i = 0; i < len; ++i) { + output.fill(i, 0); + for (unsigned j = 0; j < len; ++j) { + T r = _W->get(i, j); + for (unsigned u = 0; u < size; ++u) { + T o_hi = 0, o_lo = 0, i_hi = 0, i_lo = 0; + output.get(i, u, o_hi, o_lo); + input.get(j, u, i_hi, i_lo); + o_hi = this->gf->add(o_hi, this->gf->mul(r, i_hi)); + o_lo = this->gf->add(o_lo, this->gf->mul(r, i_lo)); + output.set(i, u, o_hi, o_lo); + } + } + } + } else { + for (unsigned i = 0; i < len; ++i) { + output.fill(i, 0); + T* buf = output.get(i); + for (unsigned j = 0; j < len; ++j) { + T* ibuf = input.get(j); + T r = _W->get(i, j); + for (unsigned u = 0; u < size; ++u) { + buf[u] = this->gf->add(buf[u], this->gf->mul(r, ibuf[u])); + } } } } diff --git a/src/gf_ring.cpp b/src/gf_ring.cpp index da1ed530..0a340a60 100644 --- a/src/gf_ring.cpp +++ b/src/gf_ring.cpp @@ -38,83 +38,36 
@@ namespace quadiron { namespace gf { template <> -void RingModN::neg(size_t n, uint16_t* x) const +void RingModN::neg(vec::Buffers& buf, size_t buf_id) const { - simd::neg(n, x, this->_card); + simd::neg(*this, buf, buf_id); } template <> -void RingModN::neg(size_t n, uint32_t* x) const +void RingModN::neg(vec::Buffers& buf, size_t buf_id) const { - simd::neg(n, x, this->_card); -} - -template <> -void RingModN::mul_coef_to_buf( - uint32_t a, - uint32_t* src, - uint32_t* dest, - size_t len) const -{ - simd::mul_coef_to_buf(a, src, dest, len, this->_card); -} - -template <> -void RingModN::add_two_bufs(uint32_t* src, uint32_t* dest, size_t len) - const -{ - simd::add_two_bufs(src, dest, len, this->_card); -} - -template <> -void RingModN::sub_two_bufs( - uint32_t* bufa, - uint32_t* bufb, - uint32_t* res, - size_t len) const -{ - simd::sub_two_bufs(bufa, bufb, res, len, this->_card); + simd::neg(*this, buf, buf_id); } +// @note We specialize the function for the case Buffers having meta template <> void RingModN::mul_coef_to_buf( - uint16_t a, - uint16_t* src, - uint16_t* dest, - size_t len) const + uint16_t coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const { - simd::mul_coef_to_buf(a, src, dest, len, this->_card); + simd::mul_coef_to_buf(*this, coef, src, dest, buf_id); } template <> -void RingModN::add_two_bufs(uint16_t* src, uint16_t* dest, size_t len) - const -{ - simd::add_two_bufs(src, dest, len, this->_card); -} - -template <> -void RingModN::sub_two_bufs( - uint16_t* bufa, - uint16_t* bufb, - uint16_t* res, - size_t len) const -{ - simd::sub_two_bufs(bufa, bufb, res, len, this->_card); -} - -template <> -void RingModN::hadamard_mul(int n, uint16_t* x_u16, uint16_t* y_u16) - const -{ - simd::mul_two_bufs(y_u16, x_u16, n, this->_card); -} - -template <> -void RingModN::hadamard_mul(int n, uint32_t* x_u32, uint32_t* y_u32) - const +void RingModN::mul_coef_to_buf( + uint32_t coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) 
const { - simd::mul_two_bufs(y_u32, x_u32, n, this->_card); + simd::mul_coef_to_buf(*this, coef, src, dest, buf_id); } } // namespace gf diff --git a/src/gf_ring.h b/src/gf_ring.h index 523d7dd6..31f9dad7 100644 --- a/src/gf_ring.h +++ b/src/gf_ring.h @@ -95,7 +95,11 @@ class RingModN { T exp_quick(T base, T exponent) const; T log_naive(T base, T exponent) const; virtual T replicate(T a) const; - virtual void mul_coef_to_buf(T a, T* src, T* dest, size_t len) const; + virtual void mul_coef_to_buf( + T coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const; virtual void mul_vec_to_vecp( vec::Vector& u, vec::Buffers& src, @@ -127,8 +131,8 @@ class RingModN { T get_code_len(T n) const; T get_code_len_high_compo(T n) const; virtual void hadamard_mul(int n, T* x, T* y) const; - virtual void neg(size_t n, T* x) const; - virtual void neg(vec::Buffers& buf) const; + void neg(vec::Buffers& buf, size_t buf_id) const; + void neg(vec::Buffers& buf) const; RingModN(RingModN&&) = default; @@ -359,15 +363,30 @@ inline T RingModN::replicate(T a) const return a; } -// For each i, dest[i] = a * src[i] template -inline void RingModN::mul_coef_to_buf(T a, T* src, T* dest, size_t len) const +inline void RingModN::mul_coef_to_buf( + T coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const { - size_t i; - DoubleSizeVal coef = DoubleSizeVal(a); - for (i = 0; i < len; i++) { - // perform multiplication - dest[i] = mul(coef, src[i]); + assert(buf_id >= 0 && buf_id < static_cast(src.get_n())); + + size_t len = src.get_size(); + if (src.has_meta()) { + for (size_t i = 0; i < len; i++) { + T lo = 0, hi = 0; + src.get(buf_id, i, hi, lo); + dest.set(buf_id, i, mul(coef, hi), mul(coef, lo)); + } + } else { + T* src_mem = src.get(buf_id); + T* dest_mem = dest.get(buf_id); + DoubleSizeVal d_coef = DoubleSizeVal(coef); + + for (size_t i = 0; i < len; i++) { + dest_mem[i] = mul(d_coef, src_mem[i]); + } } } @@ -378,24 +397,28 @@ inline void RingModN::mul_vec_to_vecp( 
vec::Buffers& dest) const { assert(u.get_n() == src.get_n()); - int i; - int n = u.get_n(); - size_t len = src.get_size(); - T h = this->card_minus_one(); - const std::vector& src_mem = src.get_mem(); - const std::vector& dest_mem = dest.get_mem(); + + const size_t n = src.get_n(); + const size_t h = this->card_minus_one(); + const bool same_obj = std::addressof(src) == std::addressof(dest); T* coef_vec = u.get_mem(); - for (i = 0; i < n; i++) { - T coef = coef_vec[i]; - if (coef > 1 && coef < h) { - this->mul_coef_to_buf(coef, src_mem[i], dest_mem[i], len); - } else if (coef == 1) { - dest.copy(src, i, i); - } else if (coef == 0) { + + for (size_t i = 0; i < n; ++i) { + const T coef = coef_vec[i]; + if (coef == 0) { dest.fill(i, 0); - } else if (coef == h) { + } else if (coef == 1) { + if (same_obj) { + continue; + } dest.copy(src, i, i); - this->neg(len, dest_mem[i]); + } else if (coef < h) { + this->mul_coef_to_buf(coef, src, dest, i); + } else { // coef == card - 1 + if (!same_obj) { + dest.copy(src, i, i); + } + this->neg(dest, i); } } } @@ -830,11 +853,21 @@ inline void RingModN::hadamard_mul(int n, T* x, T* y) const } template -inline void RingModN::neg(size_t n, T* x) const +inline void RingModN::neg(vec::Buffers& buf, size_t buf_id) const { - // add y to the first half of `x` - for (size_t i = 0; i < n; i++) { - x[i] = sub(0, x[i]); + size_t size = buf.get_size(); + if (buf.has_meta()) { + for (size_t i = 0; i < size; ++i) { + T hi = 0, lo = 0; + buf.get(buf_id, i, hi, lo); + buf.set(buf_id, i, neg(hi), neg(lo)); + } + } else { + T* x = buf.get(buf_id); + // add y to the first half of `x` + for (size_t i = 0; i < size; ++i) { + x[i] = sub(0, x[i]); + } } } @@ -842,8 +875,18 @@ template inline void RingModN::neg(vec::Buffers& buf) const { size_t size = buf.get_size(); - for (int i = 0; i < buf.get_n(); i++) { - neg(size, buf.get(i)); + if (buf.has_meta()) { + for (int i = 0; i < buf.get_n(); i++) { + for (size_t j = 0; j < size; ++j) { + T hi = 0, lo = 0; + 
buf.get(i, j, hi, lo); + buf.set(i, j, neg(hi), neg(lo)); + } + } + } else { + for (int i = 0; i < buf.get_n(); i++) { + neg(buf, i); + } } } @@ -851,56 +894,23 @@ inline void RingModN::neg(vec::Buffers& buf) const /* Operations are vectorized by SIMD */ template <> -void RingModN::neg(size_t n, uint16_t* x) const; - +void RingModN::neg(vec::Buffers& buf, size_t buf_id) const; template <> -void RingModN::neg(size_t n, uint32_t* x) const; +void RingModN::neg(vec::Buffers& buf, size_t buf_id) const; template <> void RingModN::mul_coef_to_buf( - uint16_t a, - uint16_t* src, - uint16_t* dest, - size_t len) const; + uint16_t coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const; template <> void RingModN::mul_coef_to_buf( - uint32_t a, - uint32_t* src, - uint32_t* dest, - size_t len) const; - -template <> -void RingModN::add_two_bufs(uint16_t* src, uint16_t* dest, size_t len) - const; - -template <> -void RingModN::add_two_bufs(uint32_t* src, uint32_t* dest, size_t len) - const; - -template <> -void RingModN::sub_two_bufs( - uint16_t* bufa, - uint16_t* bufb, - uint16_t* res, - size_t len) const; - -template <> -void RingModN::sub_two_bufs( - uint32_t* bufa, - uint32_t* bufb, - uint32_t* res, - size_t len) const; - -template <> -void RingModN::hadamard_mul(int n, uint16_t* x, uint16_t* y) const; -template <> -void RingModN::hadamard_mul(int n, uint32_t* x, uint32_t* y) const; -// template <> -// void RingModN::hadamard_mul(int n, uint64_t* x, uint64_t* y) const; -// template <> -// void RingModN<__uint128_t>::hadamard_mul(int n, __uint128_t* x, __uint128_t* -// y) const; + uint32_t coef, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) const; #endif // #ifdef QUADIRON_USE_SIMD diff --git a/src/property.h b/src/property.h index 2fc6a564..c83f93b8 100644 --- a/src/property.h +++ b/src/property.h @@ -110,6 +110,7 @@ class Properties { unsigned i = 2; for (auto const& item : props) { dwords[i++] = htonl(narrow_cast(item.first)); + dwords[i++] = 
htonl(narrow_cast(item.second)); } dwords[1] = htonl(i - 2); std::fill(dwords + i, dwords + n_dwords - 1, htonl(0)); @@ -135,8 +136,9 @@ class Properties { if ((2 + _n_dwords) > n_dwords) { return -1; } - for (unsigned i = 0; i < _n_dwords; i++) { - add(static_cast(ntohl(dwords[i + 2])), OOR_MARK); + for (unsigned i = 0; i < _n_dwords; i += 2) { + add(static_cast(ntohl(dwords[i + 2])), + ntohl(dwords[i + 3])); } return 0; } diff --git a/src/simd/definitions.h b/src/simd/definitions.h index 50617a38..304bde9f 100644 --- a/src/simd/definitions.h +++ b/src/simd/definitions.h @@ -76,6 +76,7 @@ enum class InstructionSet { using RegisterType = __m256; using MaskType = __m256i; +using MetaType = uint32_t; static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::AVX; @@ -86,6 +87,7 @@ static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::AVX; using RegisterType = __m128; using MaskType = __m128i; +using MetaType = uint16_t; static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::SSE; @@ -98,9 +100,13 @@ static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::SSE; #if __LP64__ == 1 using RegisterType = uint64_t; using MaskType = uint64_t; +using MetaType = uint8_t; + #else using RegisterType = uint32_t; using MaskType = uint32_t; +using MetaType = uint8_t; + #endif static constexpr InstructionSet INSTRUCTION_SET = InstructionSet::NONE; diff --git a/src/simd_128.h b/src/simd_128.h index 856588ec..399b1e6d 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -43,26 +43,23 @@ typedef __m128i VecType; // @note: using const leads to an lint error of initialization of 'variable' // with static storage duration may throw an exception that cannot be caught -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F4_U32 = _mm_set1_epi32(65537); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F4_MINUS_ONE_U32 = _mm_set1_epi32(65536); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_U32 = _mm_set1_epi32(257); -// NOLINTNEXTLINE(cert-err58-cpp) 
-const VecType F3_MINUS_ONE_U32 = _mm_set1_epi32(256); - -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_U16 = _mm_set1_epi16(257); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_MINUS_ONE_U16 = _mm_set1_epi16(256); +template +inline VecType one(); +template <> +inline VecType one() +{ + return _mm_set1_epi16(1); +} +template <> +inline VecType one() +{ + return _mm_set1_epi32(1); +} -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ZERO = _mm_setzero_si128(); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ONE_U16 = _mm_set1_epi16(1); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ONE_U32 = _mm_set1_epi32(1); +inline VecType zero() +{ + return _mm_setzero_si128(); +} // NOLINTNEXTLINE(cert-err58-cpp) const VecType MASK8_LO = _mm_set1_epi16(0x80); @@ -78,25 +75,25 @@ inline void store_to_mem(VecType* address, VecType reg) _mm_store_si128(address, reg); } -inline VecType bit_and(VecType x, VecType y) +inline VecType bit_and(const VecType& x, const VecType& y) { return _mm_and_si128(x, y); } -inline VecType bit_xor(VecType x, VecType y) +inline VecType bit_xor(const VecType& x, const VecType& y) { return _mm_xor_si128(x, y); } -inline uint16_t msb8_mask(VecType x) +inline uint16_t msb8_mask(const VecType& x) { return _mm_movemask_epi8(x); } -inline bool and_is_zero(VecType x, VecType y) +inline bool and_is_zero(const VecType& x, const VecType& y) { return _mm_testz_si128(x, y); } -inline bool is_zero(VecType x) +inline bool is_zero(const VecType& x) { - return _mm_testc_si128(ZERO, x); + return _mm_testc_si128(zero(), x); } #define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) @@ -119,70 +116,165 @@ inline VecType set_one(uint16_t val) } template -inline VecType add(VecType x, VecType y); +inline VecType add(const VecType& x, const VecType& y); template <> -inline VecType add(VecType x, VecType y) +inline VecType add(const VecType& x, const VecType& y) { return _mm_add_epi32(x, y); } template <> -inline VecType add(VecType x, VecType y) +inline VecType 
add(const VecType& x, const VecType& y) { return _mm_add_epi16(x, y); } template -inline VecType sub(VecType x, VecType y); +inline VecType sub(const VecType& x, const VecType& y); template <> -inline VecType sub(VecType x, VecType y) +inline VecType sub(const VecType& x, const VecType& y) { return _mm_sub_epi32(x, y); } template <> -inline VecType sub(VecType x, VecType y) +inline VecType sub(const VecType& x, const VecType& y) { return _mm_sub_epi16(x, y); } template -inline VecType mul(VecType x, VecType y); +inline VecType mul(const VecType& x, const VecType& y); template <> -inline VecType mul(VecType x, VecType y) +inline VecType mul(const VecType& x, const VecType& y) { return _mm_mullo_epi32(x, y); } template <> -inline VecType mul(VecType x, VecType y) +inline VecType mul(const VecType& x, const VecType& y) { return _mm_mullo_epi16(x, y); } template -inline VecType compare_eq(VecType x, VecType y); +inline VecType compare_eq(const VecType& x, const VecType& y); template <> -inline VecType compare_eq(VecType x, VecType y) +inline VecType compare_eq(const VecType& x, const VecType& y) { return _mm_cmpeq_epi32(x, y); } template <> -inline VecType compare_eq(VecType x, VecType y) +inline VecType compare_eq(const VecType& x, const VecType& y) { return _mm_cmpeq_epi16(x, y); } template -inline VecType min(VecType x, VecType y); +inline VecType min(const VecType& x, const VecType& y); template <> -inline VecType min(VecType x, VecType y) +inline VecType min(const VecType& x, const VecType& y) { return _mm_min_epu32(x, y); } template <> -inline VecType min(VecType x, VecType y) +inline VecType min(const VecType& x, const VecType& y) { return _mm_min_epu16(x, y); } +template +void show(simd::VecType val) +{ + const size_t n = simd::countof(); + T buffer[n]; + simd::store_to_mem(reinterpret_cast(buffer), val); + for (unsigned i = 0; i < n; i++) { + std::cout << unsigned(buffer[i]) << " "; + } + std::cout << "\n"; +} + +// NOLINTNEXTLINE(cert-err58-cpp) +const 
VecType mask1 = + _mm_set_epi16(0x0101, 0x0101, 0x0101, 0x0101, 0x0, 0x0, 0x0, 0x0); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask2 = _mm_set_epi16( + static_cast(0x8040), + 0x2010, + 0x0804, + 0x0201, + static_cast(0x8040), + 0x2010, + 0x0804, + 0x0201); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask3 = _mm_set1_epi8(1); + +inline VecType from_msb8_mask(MetaType m) +{ + // set `m` to all elements + VecType x = _mm_set1_epi16(m); + // re-arrange byte-by-byte + x = _mm_shuffle_epi8(x, mask1); + // get appropriated bit per byte + x = _mm_and_si128(x, mask2); + // generate mask + x = _mm_cmpeq_epi8(x, mask2); + // result + x = _mm_and_si128(x, mask3); + + return x; +} + +template +inline void unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo); +template <> +inline void +unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo) +{ + VecType m_mask = m ? from_msb8_mask(m) : zero(); + lo = _mm_unpacklo_epi16(x, m_mask); + hi = _mm_unpackhi_epi16(x, m_mask); +} +template <> +inline void +unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo) +{ + VecType m_mask = m ? 
from_msb8_mask(m) : zero(); + lo = _mm_unpacklo_epi8(x, m_mask); + hi = _mm_unpackhi_epi8(x, m_mask); +} + +template +inline void pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m); +template <> +inline void +pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m) +{ + VecType hi_data = BLEND16(zero(), hi, 0x55); + VecType hi_meta = BLEND16(zero(), SHIFTR(hi, 2), 0x55); + VecType lo_data = BLEND16(zero(), lo, 0x55); + VecType lo_meta = BLEND16(zero(), SHIFTR(lo, 2), 0x55); + + x = _mm_packus_epi32(lo_data, hi_data); + VecType meta_x = _mm_packus_epi32(lo_meta, hi_meta); + meta_x = _mm_cmpgt_epi8(meta_x, zero()); + m = _mm_movemask_epi8(meta_x); +} +template <> +inline void +pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m) +{ + VecType hi_data = BLEND8(zero(), hi, MASK8_LO); + VecType hi_meta = BLEND8(hi, zero(), MASK8_LO); + VecType lo_data = BLEND8(zero(), lo, MASK8_LO); + VecType lo_meta = BLEND8(lo, zero(), MASK8_LO); + + x = _mm_packus_epi16(lo_data, hi_data); + VecType meta_x = _mm_packus_epi16(lo_meta, hi_meta); + m = _mm_movemask_epi8(meta_x); +} + } // namespace simd } // namespace quadiron diff --git a/src/simd_256.h b/src/simd_256.h index 8084d95e..299519a2 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -51,31 +51,40 @@ namespace simd { typedef __m256i VecType; typedef __m128i HalfVecType; +template +void show(simd::VecType val) +{ + const size_t n = simd::countof(); + T buffer[n]; + _mm256_storeu_si256(reinterpret_cast(buffer), val); + for (unsigned i = 0; i < n; i++) { + std::cout << unsigned(buffer[i]) << " "; + } + std::cout << "\n"; +} + /* ============= Constant variable ============ */ // @note: using const leads to an lint error of initialization of 'variable' // with static storage duration may throw an exception that cannot be caught -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F4_U32 = _mm256_set1_epi32(65537); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F4_MINUS_ONE_U32 = 
_mm256_set1_epi32(65536); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_U32 = _mm256_set1_epi32(257); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_MINUS_ONE_U32 = _mm256_set1_epi32(256); - -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_U16 = _mm256_set1_epi16(257); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType F3_MINUS_ONE_U16 = _mm256_set1_epi16(256); +template +inline VecType one(); +template <> +inline VecType one() +{ + return _mm256_set1_epi16(1); +} +template <> +inline VecType one() +{ + return _mm256_set1_epi32(1); +} -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ZERO = _mm256_setzero_si256(); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ONE_U16 = _mm256_set1_epi16(1); -// NOLINTNEXTLINE(cert-err58-cpp) -const VecType ONE_U32 = _mm256_set1_epi32(1); +inline VecType zero() +{ + return _mm256_setzero_si256(); +} // NOLINTNEXTLINE(cert-err58-cpp) const VecType MASK8_LO = _mm256_set1_epi16(0x80); @@ -86,30 +95,30 @@ inline VecType load_to_reg(VecType* address) { return _mm256_load_si256(address); } -inline void store_to_mem(VecType* address, VecType reg) +inline void store_to_mem(VecType* address, const VecType& reg) { _mm256_store_si256(address, reg); } -inline VecType bit_and(VecType x, VecType y) +inline VecType bit_and(const VecType& x, const VecType& y) { return _mm256_and_si256(x, y); } -inline VecType bit_xor(VecType x, VecType y) +inline VecType bit_xor(const VecType& x, const VecType& y) { return _mm256_xor_si256(x, y); } -inline uint32_t msb8_mask(VecType x) +inline uint32_t msb8_mask(const VecType& x) { return _mm256_movemask_epi8(x); } -inline bool and_is_zero(VecType x, VecType y) +inline bool and_is_zero(const VecType& x, const VecType& y) { return _mm256_testz_si256(x, y); } -inline bool is_zero(VecType x) +inline bool is_zero(const VecType& x) { - return _mm256_testc_si256(ZERO, x); + return _mm256_testc_si256(zero(), x); } #define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8)) @@ -132,70 +141,160 @@ inline 
VecType set_one(uint16_t val) } template -inline VecType add(VecType x, VecType y); +inline VecType add(const VecType& x, const VecType& y); template <> -inline VecType add(VecType x, VecType y) +inline VecType add(const VecType& x, const VecType& y) { return _mm256_add_epi32(x, y); } template <> -inline VecType add(VecType x, VecType y) +inline VecType add(const VecType& x, const VecType& y) { return _mm256_add_epi16(x, y); } template -inline VecType sub(VecType x, VecType y); +inline VecType sub(const VecType& x, const VecType& y); template <> -inline VecType sub(VecType x, VecType y) +inline VecType sub(const VecType& x, const VecType& y) { return _mm256_sub_epi32(x, y); } template <> -inline VecType sub(VecType x, VecType y) +inline VecType sub(const VecType& x, const VecType& y) { return _mm256_sub_epi16(x, y); } template -inline VecType mul(VecType x, VecType y); +inline VecType mul(const VecType& x, const VecType& y); template <> -inline VecType mul(VecType x, VecType y) +inline VecType mul(const VecType& x, const VecType& y) { return _mm256_mullo_epi32(x, y); } template <> -inline VecType mul(VecType x, VecType y) +inline VecType mul(const VecType& x, const VecType& y) { return _mm256_mullo_epi16(x, y); } template -inline VecType compare_eq(VecType x, VecType y); +inline VecType compare_eq(const VecType& x, const VecType& y); template <> -inline VecType compare_eq(VecType x, VecType y) +inline VecType compare_eq(const VecType& x, const VecType& y) { return _mm256_cmpeq_epi32(x, y); } template <> -inline VecType compare_eq(VecType x, VecType y) +inline VecType compare_eq(const VecType& x, const VecType& y) { return _mm256_cmpeq_epi16(x, y); } template -inline VecType min(VecType x, VecType y); +inline VecType min(const VecType& x, const VecType& y); template <> -inline VecType min(VecType x, VecType y) +inline VecType min(const VecType& x, const VecType& y) { return _mm256_min_epu32(x, y); } template <> -inline VecType min(VecType x, VecType y) +inline 
VecType min(const VecType& x, const VecType& y) { return _mm256_min_epu16(x, y); } +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask1 = _mm256_set_epi32( + 0x03030303, + 0x03030303, + 0x02020202, + 0x02020202, + 0x01010101, + 0x01010101, + 0x0, + 0x0); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask2 = _mm256_set_epi32( + 0x80402010, + 0x08040201, + 0x80402010, + 0x08040201, + 0x80402010, + 0x08040201, + 0x80402010, + 0x08040201); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType mask3 = _mm256_set1_epi8(1); + +inline VecType from_msb8_mask(MetaType m) +{ + // set `m` to all elements + VecType x = _mm256_set1_epi32(m); + // re-arrange byte-by-byte + x = _mm256_shuffle_epi8(x, mask1); + // get appropriated bit per byte + x = _mm256_and_si256(x, mask2); + // generate mask + x = _mm256_cmpeq_epi8(x, mask2); + // result + x = _mm256_and_si256(x, mask3); + + return x; +} + +template +inline void unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo); +template <> +inline void +unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo) +{ + VecType m_mask = m ? from_msb8_mask(m) : zero(); + lo = _mm256_unpacklo_epi16(x, m_mask); + hi = _mm256_unpackhi_epi16(x, m_mask); +} +template <> +inline void +unpack(MetaType m, const VecType& x, VecType& hi, VecType& lo) +{ + VecType m_mask = m ? 
from_msb8_mask(m) : zero(); + lo = _mm256_unpacklo_epi8(x, m_mask); + hi = _mm256_unpackhi_epi8(x, m_mask); +} + +template +inline void pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m); +template <> +inline void +pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m) +{ + VecType hi_data = BLEND16(zero(), hi, 0x55); + VecType hi_meta = BLEND16(zero(), SHIFTR(hi, 2), 0x55); + VecType lo_data = BLEND16(zero(), lo, 0x55); + VecType lo_meta = BLEND16(zero(), SHIFTR(lo, 2), 0x55); + + x = _mm256_packus_epi32(lo_data, hi_data); + VecType meta_x = _mm256_packus_epi32(lo_meta, hi_meta); + meta_x = _mm256_cmpgt_epi8(meta_x, zero()); + m = _mm256_movemask_epi8(meta_x); +} +template <> +inline void +pack(const VecType& lo, const VecType& hi, VecType& x, MetaType& m) +{ + VecType hi_data = BLEND8(zero(), hi, MASK8_LO); + VecType hi_meta = BLEND8(hi, zero(), MASK8_LO); + VecType lo_data = BLEND8(zero(), lo, MASK8_LO); + VecType lo_meta = BLEND8(lo, zero(), MASK8_LO); + + x = _mm256_packus_epi16(lo_data, hi_data); + VecType meta_x = _mm256_packus_epi16(lo_meta, hi_meta); + m = _mm256_movemask_epi8(meta_x); +} + } // namespace simd } // namespace quadiron diff --git a/src/simd_fnt.h b/src/simd_fnt.h index 59e52410..64273223 100644 --- a/src/simd_fnt.h +++ b/src/simd_fnt.h @@ -33,46 +33,61 @@ #include +#include "vec_buffers.h" + namespace quadiron { namespace simd { template -inline VecType card(T q); +inline VecType card(); template <> -inline VecType card(uint16_t) +inline VecType card() { - return F3_U16; + return set_one(257); } template <> -inline VecType card(uint32_t q) +inline VecType card() { - return (q == F3) ? 
F3_U32 : F4_U32; + return set_one(65537); } template -inline VecType card_minus_one(T q); +inline VecType card_minus_one(); template <> -inline VecType card_minus_one(uint16_t) +inline VecType card_minus_one() { - return F3_MINUS_ONE_U16; + return set_one(256); } template <> -inline VecType card_minus_one(uint32_t q) +inline VecType card_minus_one() { - return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32; + return set_one(65536); } template -inline VecType get_low_half(VecType x, T q) +inline VecType get_low_half(const VecType& x); +template <> +inline VecType get_low_half(const VecType& x) +{ + return BLEND8(zero(), x, MASK8_LO); +} +template <> +inline VecType get_low_half(const VecType& x) { - return (q == F3) ? BLEND8(ZERO, x, MASK8_LO) : BLEND16(ZERO, x, 0x55); + return BLEND16(zero(), x, 0x55); } template -inline VecType get_high_half(VecType x, T q) +inline VecType get_high_half(const VecType& x); +template <> +inline VecType get_high_half(const VecType& x) +{ + return BLEND8(zero(), SHIFTR(x, 1), MASK8_LO); +} +template <> +inline VecType get_high_half(const VecType& x) { - return (q == F3) ? 
BLEND8(ZERO, SHIFTR(x, 1), MASK8_LO) - : BLEND16(ZERO, SHIFTR(x, 2), 0x55); + return BLEND16(zero(), SHIFTR(x, 2), 0x55); } /* ================= Basic Operations ================= */ @@ -82,14 +97,13 @@ inline VecType get_high_half(VecType x, T q) * * @param x input register * @param y input register - * @param q modulo * @return (x + y) mod q */ template -inline VecType mod_add(VecType x, VecType y, T q) +inline VecType mod_add(const VecType& x, const VecType& y) { const VecType res = add(x, y); - return min(res, sub(res, card(q))); + return min(res, sub(res, card())); } /** @@ -97,28 +111,26 @@ inline VecType mod_add(VecType x, VecType y, T q) * * @param x input register * @param y input register - * @param q modulo * @return (x - y) mod q */ template -inline VecType mod_sub(VecType x, VecType y, T q) +inline VecType mod_sub(const VecType& x, const VecType& y) { const VecType res = sub(x, y); - return min(res, add(res, card(q))); + return min(res, add(res, card())); } /** * Modular negation for packed unsigned 32-bit integers * * @param x input register - * @param q modulo * @return (-x) mod q */ template -inline VecType mod_neg(VecType x, T q) +inline VecType mod_neg(const VecType& x) { - const VecType res = sub(card(q), x); - return min(res, sub(res, card(q))); + const VecType res = sub(card(), x); + return min(res, sub(res, card())); } /** @@ -129,16 +141,15 @@ inline VecType mod_neg(VecType x, T q) * * @param x input register * @param y input register - * @param q modulo * @return (x * y) mod q */ template -inline VecType mod_mul(VecType x, VecType y, T q) +inline VecType mod_mul(const VecType& x, const VecType& y) { const VecType res = mul(x, y); - const VecType lo = get_low_half(res, q); - const VecType hi = get_high_half(res, q); - return mod_sub(lo, hi, q); + const VecType lo = get_low_half(res); + const VecType hi = get_high_half(res); + return mod_sub(lo, hi); } /** @@ -148,53 +159,68 @@ inline VecType mod_mul(VecType x, VecType y, T q) * * @param x 
input register * @param y input register - * @param q modulo * @return (x * y) mod q */ template -inline VecType mod_mul_safe(VecType x, VecType y, T q) +inline VecType mod_mul_safe(const VecType& x, const VecType& y) { - const VecType res = mod_mul(x, y, q); + const VecType res = mod_mul(x, y); // filter elements of both of a & b = card-1 const VecType cmp = bit_and( - compare_eq(x, card_minus_one(q)), - compare_eq(y, card_minus_one(q))); + compare_eq(x, card_minus_one()), + compare_eq(y, card_minus_one())); if (is_zero(cmp)) { return res; } - return (q == F3) ? bit_xor(res, bit_and(F4_U32, cmp)) - : add(res, bit_and(ONE_U32, cmp)); + return add(res, bit_and(one(), cmp)); } /** - * Update property for a given register for packed unsigned 32-bit integers + * Update property given an output Buffers * + * @param output output Buffers * @param props properties bound to fragments - * @param threshold register storing max value in its elements - * @param mask a specific mask - * @param symb input register * @param offset offset in the data fragments + * @param code_len erasure codes' length + * @param vecs_nb number of vectors corresponding to the data */ template -inline void add_props( - Properties& props, - VecType threshold, - VecType mask, - VecType symb, +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, off_t offset, - T) + unsigned code_len, + size_t vecs_nb) { - const VecType b = compare_eq(threshold, symb); - const VecType c = bit_and(mask, b); - auto d = msb8_mask(c); - const unsigned element_size = sizeof(T); - while (d > 0) { - const unsigned byte_idx = __builtin_ctz(d); - const size_t _offset = offset + byte_idx / element_size; - props.add(_offset, OOR_MARK); - d ^= 1 << byte_idx; + // nb of elements per vector + const unsigned vec_size = countof(); + // size of meta element in bits + const unsigned ele_size_in_bits = sizeof(MetaType) * CHAR_BIT / vec_size; + // mask to get meta + const T mask = ((static_cast(1) << 
ele_size_in_bits) - 1); + + const std::vector& meta = output.get_meta(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + const MetaType* meta_frag = reinterpret_cast(meta[frag_id]); + for (size_t vec_id = 0; vec_id < vecs_nb; ++vec_id) { + if (meta_frag[vec_id]) { + const off_t curr_offset = offset + vec_id * vec_size; + MetaType val = meta_frag[vec_id]; + unsigned idx = 0; + while (val) { + const T m_val = val & mask; + if (m_val) { + const size_t _offset = curr_offset + idx; + props[frag_id].add(_offset, m_val); + } + + val >>= ele_size_in_bits; + idx++; + } + } + } } } diff --git a/src/simd_nf4.h b/src/simd_nf4.h index f9ec1172..06a46ebf 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -180,7 +180,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) HalfVecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_low_half_to_mem(&res, mod_add(vec_a, vec_b, F4)); + store_low_half_to_mem(&res, mod_add(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -189,7 +189,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) HalfVecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_low_half_to_mem(&res, mod_sub(vec_a, vec_b, F4)); + store_low_half_to_mem(&res, mod_sub(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -198,7 +198,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) HalfVecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_low_half_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4)); + store_low_half_to_mem(&res, mod_mul_safe(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -217,8 +217,8 @@ inline void add_buf_to_two_bufs_rem( VecType _x_next_p = load_to_reg(_x_half[i]); VecType _y_p = load_to_reg(_y[i]); - store_low_half_to_mem(_x + i, mod_add(_x_p, _y_p, F4)); - store_low_half_to_mem(_x_half + i, mod_add(_x_next_p, _y_p, F4)); + store_low_half_to_mem(_x + i, mod_add(_x_p, _y_p)); + 
store_low_half_to_mem(_x_half + i, mod_add(_x_next_p, _y_p)); } } @@ -230,7 +230,7 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) VecType _x_p = load_to_reg(_x[i]); VecType _y_p = load_to_reg(_y[i]); - store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4)); + store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p)); } } @@ -248,8 +248,9 @@ inline void hadamard_mul_doubled_rem( VecType _x_next_p = load_to_reg(_x_half[i]); VecType _y_p = load_to_reg(_y[i]); - store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4)); - store_low_half_to_mem(_x_half + i, mod_mul_safe(_x_next_p, _y_p, F4)); + store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p)); + store_low_half_to_mem( + _x_half + i, mod_mul_safe(_x_next_p, _y_p)); } } @@ -266,7 +267,7 @@ inline __uint128_t add(__uint128_t a, __uint128_t b) VecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_to_mem(&res, mod_add(vec_a, vec_b, F4)); + store_to_mem(&res, mod_add(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -275,7 +276,7 @@ inline __uint128_t sub(__uint128_t a, __uint128_t b) VecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_to_mem(&res, mod_sub(vec_a, vec_b, F4)); + store_to_mem(&res, mod_sub(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -284,7 +285,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b) VecType res; VecType vec_a = load_to_reg(a); VecType vec_b = load_to_reg(b); - store_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4)); + store_to_mem(&res, mod_mul_safe(vec_a, vec_b)); return reinterpret_cast<__uint128_t>(res); } @@ -327,12 +328,12 @@ inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y) // add y to the first half of `x` for (i = 0; i < vec_len; ++i) { - x[i] = mod_add(x[i], y[i], F4); + x[i] = mod_add(x[i], y[i]); } // add y to the second half of `x` for (i = 0; i < vec_len; ++i) { - x_next[i] = mod_add(x_next[i], y[i], F4); + x_next[i] = 
mod_add(x_next[i], y[i]); } if (rem_len > 0) { @@ -354,7 +355,7 @@ inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y) // multiply y to the first half of `x` for (i = 0; i < vec_len; ++i) { - x[i] = mod_mul_safe(x[i], y[i], F4); + x[i] = mod_mul_safe(x[i], y[i]); } if (rem_len > 0) { diff --git a/src/simd_radix2_fft.h b/src/simd_radix2_fft.h index 8405f8f0..531f8b78 100644 --- a/src/simd_radix2_fft.h +++ b/src/simd_radix2_fft.h @@ -36,6 +36,24 @@ namespace quadiron { namespace simd { +enum class CtGsCase { + SIMPLE, + NORMAL, + EXTREME, +}; + +template +inline CtGsCase get_case(T r, T q) +{ + if (r == 1) { + return CtGsCase::SIMPLE; + } else if (r < q - 1) { + return CtGsCase::NORMAL; + } else { + return CtGsCase::EXTREME; + } +} + /* ================= Vectorized Operations ================= */ /** @@ -44,22 +62,31 @@ namespace simd { * x <- x + r * y * y <- x - r * y * - * @param rp1 coefficient `r` plus one + * @param ct_case coefficient case * @param c a register stores coefficient `r` * @param x working register * @param y working register - * @param q modular */ template -inline void butterfly_ct(T rp1, VecType c, VecType* x, VecType* y, T q) +inline void +butterfly_ct(CtGsCase ct_case, const VecType& c, VecType& x, VecType& y) { - VecType z = (rp1 == 2) ? *y : mod_mul(c, *y, q); - if (rp1 < q) { - *y = mod_sub(*x, z, q); - *x = mod_add(*x, z, q); - } else { // i.e. 
r == q - 1 - *y = mod_add(*x, z, q); - *x = mod_sub(*x, z, q); + VecType z = y; + switch (ct_case) { + case CtGsCase::SIMPLE: + y = mod_sub(x, z); + x = mod_add(x, z); + break; + case CtGsCase::EXTREME: + y = mod_add(x, z); + x = mod_sub(x, z); + break; + case CtGsCase::NORMAL: + default: + z = mod_mul(c, y); + y = mod_sub(x, z); + x = mod_add(x, z); + break; } } @@ -69,25 +96,30 @@ inline void butterfly_ct(T rp1, VecType c, VecType* x, VecType* y, T q) * x <- x + y * y <- r * (x - y) * - * @param rp1 coefficient `r` plus one + * @param gs_case coefficient case * @param c a register stores coefficient `r` * @param x working register * @param y working register - * @param q modular */ template -inline void butterfly_gs(T rp1, VecType c, VecType* x, VecType* y, T q) +inline void +butterfly_gs(CtGsCase gs_case, const VecType& c, VecType& x, VecType& y) { - VecType add = mod_add(*x, *y, q); - if (rp1 == 2) { - *y = mod_sub(*x, *y, q); - } else if (rp1 < q) { - VecType sub = mod_sub(*x, *y, q); - *y = mod_mul(c, sub, q); - } else { // i.e. r == q - 1 - *y = mod_sub(*y, *x, q); + VecType add = mod_add(x, y); + switch (gs_case) { + case CtGsCase::SIMPLE: + y = mod_sub(x, y); + break; + case CtGsCase::EXTREME: + y = mod_sub(y, x); + break; + case CtGsCase::NORMAL: + default: + VecType sub = mod_sub(x, y); + y = mod_mul(c, sub); + break; } - *x = add; + x = add; } /** @@ -96,24 +128,25 @@ inline void butterfly_gs(T rp1, VecType c, VecType* x, VecType* y, T q) * x <- x, i.e. 
no operation * y <- r * x * - * @param rp1 coefficient `r` plus one + * @param gs_case coefficient case * @param c a register stores coefficient `r` * @param x working register - * @param q modular - * @return r * x */ template -inline VecType butterfly_simple_gs(T rp1, VecType c, VecType x, T q) +inline void butterfly_simple_gs(CtGsCase gs_case, const VecType& c, VecType& x) { - if (rp1 == 2) { - return x; - } else if (rp1 < q) { - return mod_mul(c, x, q); - } else { - return mod_neg(x, q); + switch (gs_case) { + case CtGsCase::EXTREME: + x = mod_neg(x); + break; + case CtGsCase::NORMAL: + x = mod_mul(c, x); + break; + case CtGsCase::SIMPLE: + default: + break; } } - /** * Vectorized butterfly CT step * @@ -139,55 +172,45 @@ inline void butterfly_ct_step( size_t len, T card) { - if (len == 0) { - return; - } - const T rp1 = r + 1; - VecType c = set_one(r); + const CtGsCase ct_case = get_case(r, card); + const VecType c = set_one(r); - const size_t end = (len > 1) ? len - 1 : 0; const unsigned bufs_nb = buf.get_n(); const std::vector& mem = buf.get_mem(); + const std::vector& meta = buf.get_meta(); for (unsigned i = start; i < bufs_nb; i += step) { - VecType x1, y1; - VecType x2, y2; VecType* p = reinterpret_cast(mem[i]); VecType* q = reinterpret_cast(mem[i + m]); + MetaType* m_p = reinterpret_cast(meta[i]); + MetaType* m_q = reinterpret_cast(meta[i + m]); - size_t j = 0; - for (; j < end; j += 2) { - x1 = load_to_reg(p + j); - y1 = load_to_reg(q + j); - - butterfly_ct(rp1, c, &x1, &y1, card); + for (size_t j = 0; j < len; ++j) { + VecType x1 = load_to_reg(p); + VecType y1 = load_to_reg(q); - x2 = load_to_reg(p + j + 1); - y2 = load_to_reg(q + j + 1); + VecType x1_lo, x1_hi; + VecType y1_lo, y1_hi; - butterfly_ct(rp1, c, &x2, &y2, card); + unpack(m_p[j], x1, x1_hi, x1_lo); + unpack(m_q[j], y1, y1_hi, y1_lo); - // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(p + j + 1, x2); - store_to_mem(q + j, y1); - store_to_mem(q + j + 1, y2); - } - for (; j < 
len; ++j) { - x1 = load_to_reg(p + j); - y1 = load_to_reg(q + j); + butterfly_ct(ct_case, c, x1_lo, y1_lo); + butterfly_ct(ct_case, c, x1_hi, y1_hi); - butterfly_ct(rp1, c, &x1, &y1, card); + pack(x1_lo, x1_hi, x1, m_p[j]); + pack(y1_lo, y1_hi, y1, m_q[j]); // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(q + j, y1); + store_to_mem(p++, x1); + store_to_mem(q++, y1); } } } template -inline static void do_butterfly_ct_2_layers( +inline void do_butterfly_ct_2_layers( const std::vector& mem, + const std::vector& meta, T r1, T r2, T r3, @@ -196,9 +219,9 @@ inline static void do_butterfly_ct_2_layers( size_t len, T card) { - const T r1p1 = r1 + 1; - const T r2p1 = r2 + 1; - const T r3p1 = r3 + 1; + const CtGsCase case1 = get_case(r1, card); + const CtGsCase case2 = get_case(r2, card); + const CtGsCase case3 = get_case(r3, card); VecType c1 = set_one(r1); VecType c2 = set_one(r2); @@ -209,68 +232,47 @@ inline static void do_butterfly_ct_2_layers( VecType* r = reinterpret_cast(mem[start + 2 * m]); VecType* s = reinterpret_cast(mem[start + 3 * m]); - size_t j = 0; - const size_t end = (len > 1) ? 
len - 1 : 0; - while (j < end) { - // First layer (c1, x, y) & (c1, u, v) + MetaType* m_p = reinterpret_cast(meta[start]); + MetaType* m_q = reinterpret_cast(meta[start + m]); + MetaType* m_r = reinterpret_cast(meta[start + 2 * m]); + MetaType* m_s = reinterpret_cast(meta[start + 3 * m]); + + for (size_t j = 0; j < len; ++j) { VecType x1 = load_to_reg(p); - VecType x2 = load_to_reg(p + 1); VecType y1 = load_to_reg(q); - VecType y2 = load_to_reg(q + 1); - - butterfly_ct(r1p1, c1, &x1, &y1, card); - butterfly_ct(r1p1, c1, &x2, &y2, card); - VecType u1 = load_to_reg(r); - VecType u2 = load_to_reg(r + 1); VecType v1 = load_to_reg(s); - VecType v2 = load_to_reg(s + 1); - - butterfly_ct(r1p1, c1, &u1, &v1, card); - butterfly_ct(r1p1, c1, &u2, &v2, card); - - // Second layer (c2, x, u) & (c3, y, v) - butterfly_ct(r2p1, c2, &x1, &u1, card); - butterfly_ct(r2p1, c2, &x2, &u2, card); - - butterfly_ct(r3p1, c3, &y1, &v1, card); - butterfly_ct(r3p1, c3, &y2, &v2, card); - - // Store back to memory - store_to_mem(p, x1); - store_to_mem(p + 1, x2); - store_to_mem(q, y1); - store_to_mem(q + 1, y2); - - store_to_mem(r, u1); - store_to_mem(r + 1, u2); - store_to_mem(s, v1); - store_to_mem(s + 1, v2); - p = p + 2; - q = q + 2; - r = r + 2; - s = s + 2; - j = j + 2; - }; - - for (; j < len; ++j) { - // First layer (c1, x, y) & (c1, u, v) - VecType x1 = load_to_reg(p + j); - VecType y1 = load_to_reg(q + j); - VecType u1 = load_to_reg(r + j); - VecType v1 = load_to_reg(s + j); - - // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); - butterfly_ct(r1p1, c1, &x1, &y1, card); - butterfly_ct(r1p1, c1, &u1, &v1, card); - butterfly_ct(r2p1, c2, &x1, &u1, card); - butterfly_ct(r3p1, c3, &y1, &v1, card); - - // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(q + j, y1); - store_to_mem(r + j, u1); - store_to_mem(s + j, v1); + + VecType x1_lo, x1_hi; + VecType y1_lo, y1_hi; + VecType u1_lo, u1_hi; + VecType v1_lo, v1_hi; + + unpack(m_p[j], x1, x1_hi, x1_lo); + unpack(m_q[j], y1, 
y1_hi, y1_lo); + unpack(m_r[j], u1, u1_hi, u1_lo); + unpack(m_s[j], v1, v1_hi, v1_lo); + + butterfly_ct(case1, c1, x1_lo, y1_lo); + butterfly_ct(case1, c1, x1_hi, y1_hi); + butterfly_ct(case1, c1, u1_lo, v1_lo); + butterfly_ct(case1, c1, u1_hi, v1_hi); + + butterfly_ct(case2, c2, x1_lo, u1_lo); + butterfly_ct(case2, c2, x1_hi, u1_hi); + + butterfly_ct(case3, c3, y1_lo, v1_lo); + butterfly_ct(case3, c3, y1_hi, v1_hi); + + pack(x1_lo, x1_hi, x1, m_p[j]); + pack(y1_lo, y1_hi, y1, m_q[j]); + pack(u1_lo, u1_hi, u1, m_r[j]); + pack(v1_lo, v1_hi, v1, m_s[j]); + + store_to_mem(p++, x1); + store_to_mem(q++, y1); + store_to_mem(r++, u1); + store_to_mem(s++, v1); } } @@ -320,8 +322,9 @@ inline void butterfly_ct_two_layers_step( const unsigned bufs_nb = buf.get_n(); const std::vector& mem = buf.get_mem(); + const std::vector& meta = buf.get_meta(); for (unsigned i = start; i < bufs_nb; i += step) { - do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card); + do_butterfly_ct_2_layers(mem, meta, r1, r2, r3, i, m, len, card); } } @@ -352,53 +355,36 @@ inline void butterfly_gs_step( return; } const unsigned step = m << 1; - const T rp1 = r + 1; + const CtGsCase gs_case = get_case(r, card); VecType c = set_one(r); - const size_t end = (len > 3) ? 
len - 3 : 0; const unsigned bufs_nb = buf.get_n(); const std::vector& mem = buf.get_mem(); + const std::vector& meta = buf.get_meta(); for (unsigned i = start; i < bufs_nb; i += step) { - VecType x1, x2, x3, x4; - VecType y1, y2, y3, y4; VecType* p = reinterpret_cast(mem[i]); VecType* q = reinterpret_cast(mem[i + m]); + MetaType* m_p = reinterpret_cast(meta[i]); + MetaType* m_q = reinterpret_cast(meta[i + m]); - size_t j = 0; - for (; j < end; j += 4) { - x1 = load_to_reg(p + j); - x2 = load_to_reg(p + j + 1); - x3 = load_to_reg(p + j + 2); - x4 = load_to_reg(p + j + 3); - y1 = load_to_reg(q + j); - y2 = load_to_reg(q + j + 1); - y3 = load_to_reg(q + j + 2); - y4 = load_to_reg(q + j + 3); - - butterfly_gs(rp1, c, &x1, &y1, card); - butterfly_gs(rp1, c, &x2, &y2, card); - butterfly_gs(rp1, c, &x3, &y3, card); - butterfly_gs(rp1, c, &x4, &y4, card); + for (size_t j = 0; j < len; ++j) { + VecType x1 = load_to_reg(p); + VecType y1 = load_to_reg(q); - // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(p + j + 1, x2); - store_to_mem(p + j + 2, x3); - store_to_mem(p + j + 3, x4); - store_to_mem(q + j, y1); - store_to_mem(q + j + 1, y2); - store_to_mem(q + j + 2, y3); - store_to_mem(q + j + 3, y4); - } - for (; j < len; ++j) { - x1 = load_to_reg(p + j); - y1 = load_to_reg(q + j); + VecType x1_lo, x1_hi; + VecType y1_lo, y1_hi; - butterfly_gs(rp1, c, &x1, &y1, card); + unpack(m_p[j], x1, x1_hi, x1_lo); + unpack(m_q[j], y1, y1_hi, y1_lo); - // Store back to memory - store_to_mem(p + j, x1); - store_to_mem(q + j, y1); + butterfly_gs(gs_case, c, x1_lo, y1_lo); + butterfly_gs(gs_case, c, x1_hi, y1_hi); + + pack(x1_lo, x1_hi, x1, m_p[j]); + pack(y1_lo, y1_hi, y1, m_q[j]); + + store_to_mem(p++, x1); + store_to_mem(q++, y1); } } } @@ -429,95 +415,30 @@ inline void butterfly_gs_step_simple( return; } const unsigned step = m << 1; - const T rp1 = r + 1; + const CtGsCase gs_case = get_case(r, card); VecType c = set_one(r); - const size_t end = (len > 1) ? 
len - 1 : 0; const unsigned bufs_nb = buf.get_n(); const std::vector& mem = buf.get_mem(); + const std::vector& meta = buf.get_meta(); for (unsigned i = start; i < bufs_nb; i += step) { - VecType x1, y1; - VecType x2, y2; VecType* p = reinterpret_cast(mem[i]); VecType* q = reinterpret_cast(mem[i + m]); + MetaType* m_p = reinterpret_cast(meta[i]); + MetaType* m_q = reinterpret_cast(meta[i + m]); - size_t j = 0; - for (; j < end; j += 2) { - x1 = load_to_reg(p + j); - x2 = load_to_reg(p + j + 1); + for (size_t j = 0; j < len; ++j) { + VecType x = load_to_reg(p++); + VecType x_lo, x_hi; - y1 = butterfly_simple_gs(rp1, c, x1, card); - y2 = butterfly_simple_gs(rp1, c, x2, card); + unpack(m_p[j], x, x_hi, x_lo); - // Store back to memory - store_to_mem(q + j, y1); - store_to_mem(q + j + 1, y2); - } - for (; j < len; ++j) { - x1 = load_to_reg(p + j); - - y1 = butterfly_simple_gs(rp1, c, x1, card); + butterfly_simple_gs(gs_case, c, x_lo); + butterfly_simple_gs(gs_case, c, x_hi); - // Store back to memory - store_to_mem(q + j, y1); - } - } -} + pack(x_lo, x_hi, x, m_q[j]); -template -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - T threshold, - size_t vecs_nb) -{ - const unsigned vec_size = countof(); - const T max = 1U << (sizeof(T) * CHAR_BIT - 1); - const VecType _threshold = set_one(threshold); - const VecType mask_hi = set_one(max); - - const std::vector& mem = output.get_mem(); - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - VecType* buf = reinterpret_cast(mem[frag_id]); - - size_t vec_id = 0; - size_t end = (vecs_nb > 3) ? 
vecs_nb - 3 : 0; - for (; vec_id < end; vec_id += 4) { - VecType a1 = load_to_reg(buf + vec_id); - VecType a2 = load_to_reg(buf + vec_id + 1); - VecType a3 = load_to_reg(buf + vec_id + 2); - VecType a4 = load_to_reg(buf + vec_id + 3); - - if (!and_is_zero(a1, _threshold)) { - const off_t curr_offset = offset + vec_id * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a1, curr_offset, max); - } - if (!and_is_zero(a2, _threshold)) { - const off_t curr_offset = offset + (vec_id + 1) * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a2, curr_offset, max); - } - if (!and_is_zero(a3, _threshold)) { - const off_t curr_offset = offset + (vec_id + 2) * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a3, curr_offset, max); - } - if (!and_is_zero(a4, _threshold)) { - const off_t curr_offset = offset + (vec_id + 3) * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a4, curr_offset, max); - } - } - for (; vec_id < vecs_nb; ++vec_id) { - VecType a = load_to_reg(buf + vec_id); - if (!and_is_zero(a, _threshold)) { - const off_t curr_offset = offset + vec_id * vec_size; - add_props( - props[frag_id], _threshold, mask_hi, a, curr_offset, max); - } + store_to_mem(q++, x); } } } diff --git a/src/simd_ring.h b/src/simd_ring.h index 47edce99..9b1a8797 100644 --- a/src/simd_ring.h +++ b/src/simd_ring.h @@ -33,134 +33,93 @@ #include +#include "gf_ring.h" + namespace quadiron { namespace simd { /* ==================== Operations for RingModN =================== */ -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` +/** Perform a multiplication of a coefficient `c` to each element of `buf_id`th + * buffers of `src` and store result to correspondent element of `dest` * + * @note: Buffers `src` and `dest` have meta * @note: 1 < `a` < card - 1 */ template -inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) +inline void mul_coef_to_buf( + 
const gf::RingModN& gf, + T c, + vec::Buffers& src, + vec::Buffers& dest, + size_t buf_id) { - const VecType coef = set_one(a); - - VecType* _src = reinterpret_cast(src); - VecType* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i = 0; - const size_t end = (_len > 3) ? _len - 3 : 0; - for (; i < end; i += 4) { - _dest[i] = mod_mul(coef, _src[i], card); - _dest[i + 1] = mod_mul(coef, _src[i + 1], card); - _dest[i + 2] = mod_mul(coef, _src[i + 2], card); - _dest[i + 3] = mod_mul(coef, _src[i + 3], card); - } - for (; i < _len; ++i) { - _dest[i] = mod_mul(coef, _src[i], card); - } + const size_t size = src.get_size(); + const unsigned ratio = simd::countof(); + const size_t simd_vec_len = size / ratio; + const size_t simd_trailing_len = size - simd_vec_len * ratio; - if (_last_len > 0) { - const DoubleSizeVal coef_double = DoubleSizeVal(a); - for (size_t i = _len * ratio; i < len; i++) { - dest[i] = static_cast((coef_double * src[i]) % card); - } - } -} + const VecType coef = set_one(c); -template -inline void add_two_bufs(T* src, T* dest, size_t len, T card) -{ - VecType* _src = reinterpret_cast(src); - VecType* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _dest[i] = mod_add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - const T tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} + VecType* s_data = reinterpret_cast(src.get(buf_id)); + VecType* d_data = reinterpret_cast(dest.get(buf_id)); -template -inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card) -{ - VecType* _bufa = reinterpret_cast(bufa); - VecType* _bufb = reinterpret_cast(bufb); - VecType* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = mod_sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) { - res[i] = bufa[i] - bufb[i]; - } else { - res[i] = card - (bufb[i] - bufa[i]); - } - } - } -} + MetaType* s_meta = reinterpret_cast(src.get_meta(buf_id)); + MetaType* d_meta = reinterpret_cast(dest.get_meta(buf_id)); -template -inline void mul_two_bufs(T* src, T* dest, size_t len, T card) -{ - VecType* _src = reinterpret_cast(src); - VecType* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mod_mul_safe(_src[i], _dest[i], card); + for (size_t i = 0; i < simd_vec_len; ++i) { + VecType lo, hi; + VecType x = load_to_reg(&s_data[i]); + + unpack(s_meta[i], x, hi, lo); + hi = mod_mul(coef, hi); + lo = mod_mul(coef, lo); + pack(lo, hi, x, d_meta[i]); + + store_to_mem(&d_data[i], x); } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = T((DoubleSizeVal(src[i]) * dest[i]) % card); + + if (simd_trailing_len) { + const size_t simd_offset = simd_vec_len * ratio; + for (size_t i = simd_offset; i < size; ++i) { + T hi, lo; + src.get(buf_id, i, hi, lo); + dest.set(buf_id, i, gf.mul(c, hi), gf.mul(c, lo)); } } 
} /** Apply an element-wise negation to a buffer + * @note: Buffers `src` and `dest` have meta */ template -inline void neg(size_t len, T* buf, T card) +inline void neg(const gf::RingModN& gf, vec::Buffers& buf, size_t buf_id) { - VecType* _buf = reinterpret_cast(buf); - const unsigned ratio = sizeof(*_buf) / sizeof(*buf); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = mod_neg(_buf[i], card); + const size_t size = buf.get_size(); + const unsigned ratio = simd::countof(); + const size_t simd_vec_len = size / ratio; + const size_t simd_trailing_len = size - simd_vec_len * ratio; + + VecType* vec_data = reinterpret_cast(buf.get(buf_id)); + MetaType* vec_meta = reinterpret_cast(buf.get_meta(buf_id)); + + for (size_t i = 0; i < simd_vec_len; ++i) { + VecType lo, hi; + VecType x = load_to_reg(&vec_data[i]); + + unpack(vec_meta[i], x, hi, lo); + hi = mod_neg(hi); + lo = mod_neg(lo); + pack(lo, hi, x, vec_meta[i]); + + store_to_mem(&vec_data[i], x); } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; + + if (simd_trailing_len) { + const size_t simd_offset = simd_vec_len * ratio; + for (size_t i = simd_offset; i < size; ++i) { + T hi, lo; + buf.get(buf_id, i, hi, lo); + buf.set(buf_id, i, gf.neg(hi), gf.neg(lo)); } } } diff --git a/src/vec_buffers.h b/src/vec_buffers.h index a721a25b..4f993d9e 100644 --- a/src/vec_buffers.h +++ b/src/vec_buffers.h @@ -407,7 +407,7 @@ Buffers::Buffers(const Buffers& vec, int begin, int end) this->n = end - begin; this->size = vec.get_size(); this->mem_len = this->n * this->size; - const std::vector vec_mem = vec.get_mem(); + const std::vector& vec_mem = vec.get_mem(); mem.reserve(this->n); // slice from input buffers @@ -427,7 +427,7 @@ Buffers::Buffers(const Buffers& vec, int begin, int end) this->m_meta = vec.has_meta(); if (this->m_meta) { - const std::vector vec_meta = vec.get_meta(); + const 
std::vector& vec_meta = vec.get_meta(); this->init_meta(); meta.reserve(this->n); // slice from input buffers @@ -531,7 +531,7 @@ Buffers::Buffers( if (this->m_meta) { this->init_meta(); - const std::vector vec_meta = vec.get_meta(); + const std::vector& vec_meta = vec.get_meta(); // output is sliced & shuffled from `vec` meta.reserve(this->n); if (vec_n < n) { // output is zero-extended & shuffled from `vec` diff --git a/src/vec_cast.h b/src/vec_cast.h index d4eef92d..13b52727 100644 --- a/src/vec_cast.h +++ b/src/vec_cast.h @@ -173,7 +173,7 @@ std::vector* cast_mem_of_vecp(vec::Buffers* s) // std::cout << "\ninput: "; s->dump(); - const std::vector mem_s = s->get_mem(); + const std::vector& mem_s = s->get_mem(); std::vector* mem_d = new std::vector(n, nullptr); for (i = 0; i < n; i++) { mem_d->at(i) = reinterpret_cast(mem_s.at(i)); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b97ac468..f57bf5e9 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -127,6 +127,7 @@ set(TEST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/simd/test_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/simd/test_definitions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/simd/test_simd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/simd/test_simd_fnt.cpp CACHE INTERNAL diff --git a/test/buffers_utest.cpp b/test/buffers_utest.cpp index 7a939ac4..b0d2538c 100644 --- a/test/buffers_utest.cpp +++ b/test/buffers_utest.cpp @@ -67,7 +67,7 @@ class BuffersTest : public ::testing::Test { std::uniform_int_distribution dis(0, max - 1); auto vec = std::make_unique>(n, size, has_meta); - const std::vector mem = vec->get_mem(); + const std::vector& mem = vec->get_mem(); for (int i = 0; i < n; i++) { for (int j = 0; j < size; j++) { mem[i][j] = dis(quadiron::prng()); @@ -75,7 +75,7 @@ class BuffersTest : public ::testing::Test { } if (has_meta) { - const std::vector meta = vec->get_meta(); + const std::vector& meta = vec->get_meta(); const size_t meta_size = vec->get_meta_size(); for (int i = 0; i < n; ++i) { for (size_t j = 0; j 
< meta_size; ++j) { @@ -175,8 +175,8 @@ TYPED_TEST(BuffersTest, TestConstructors) // NOLINT vec::Buffers vec2(*vec1, begin, end); - const std::vector mem1 = vec1->get_mem(); - const std::vector mem2 = vec2.get_mem(); + const std::vector& mem1 = vec1->get_mem(); + const std::vector& mem2 = vec2.get_mem(); // Check Slice constructor ASSERT_EQ(vec2.get_n(), end - begin); @@ -189,8 +189,8 @@ TYPED_TEST(BuffersTest, TestConstructors) // NOLINT ASSERT_EQ(vec2.has_meta(), vec1->has_meta()); ASSERT_EQ(vec2.get_meta_size(), meta_size); - const std::vector meta1 = vec1->get_meta(); - const std::vector meta2 = vec2.get_meta(); + const std::vector& meta1 = vec1->get_meta(); + const std::vector& meta2 = vec2.get_meta(); for (int i = begin, j = 0; i < end; ++i, ++j) { ASSERT_TRUE( @@ -206,7 +206,7 @@ TYPED_TEST(BuffersTest, TestConstructors) // NOLINT } if (has_meta) { - const std::vector meta1 = vec1->get_meta(); + const std::vector& meta1 = vec1->get_meta(); std::vector meta3(end - begin); for (int i = 0; i < end - begin; i++) { meta3[i] = this->allocator_meta.allocate(meta_size); @@ -283,11 +283,11 @@ TYPED_TEST(BuffersTest, TestPackUnpack) // NOLINT for (bool const& has_meta : tests) { auto words = this->gen_buffers_rand_data(n, size, has_meta, max); - const std::vector mem_T = words->get_mem(); + const std::vector& mem_T = words->get_mem(); // Pack manually from TypeParam to uint8_t. vec::Buffers vec_char(n, bytes_size); - const std::vector mem_char = vec_char.get_mem(); + const std::vector& mem_char = vec_char.get_mem(); for (int j = 0; j < n; j++) { int t = 0; TypeParam* buf_T = mem_T.at(j); @@ -308,14 +308,14 @@ TYPED_TEST(BuffersTest, TestPackUnpack) // NOLINT // Pack bufs of type uint8_t to bufs of type TypeParam. vec::Buffers vec_T_tmp(n, size); - const std::vector mem_T_tmp = vec_T_tmp.get_mem(); + const std::vector& mem_T_tmp = vec_T_tmp.get_mem(); vec::pack( mem_char, mem_T_tmp, n, size, word_size); // Unpack bufs of type TypeParam to bufs of type uint8_t. 
vec::Buffers vec_char_tmp(n, bytes_size); - const std::vector mem_char_tmp = vec_char_tmp.get_mem(); + const std::vector& mem_char_tmp = vec_char_tmp.get_mem(); vec::unpack( mem_T_tmp, mem_char_tmp, n, size, word_size); diff --git a/test/ec_driver.cpp b/test/ec_driver.cpp index 0166f848..6a03d76a 100644 --- a/test/ec_driver.cpp +++ b/test/ec_driver.cpp @@ -582,15 +582,15 @@ int main(int argc, char** argv) data_zpad = count_digits(n_data - 1); if (eflag == EC_TYPE_RS_FNT) { - if (word_size <= 4) { - run_fec_rs_fnt( + if (word_size == 1) { + run_fec_rs_fnt( word_size, n_data, n_parities, rflag, quadiron::fec::FecType::NON_SYSTEMATIC); - } else if (word_size <= 8) { - run_fec_rs_fnt( + } else if (word_size == 2) { + run_fec_rs_fnt( word_size, n_data, n_parities, @@ -598,15 +598,15 @@ int main(int argc, char** argv) quadiron::fec::FecType::NON_SYSTEMATIC); } } else if (eflag == EC_TYPE_RS_FNT_SYS) { - if (word_size <= 4) { - run_fec_rs_fnt( + if (word_size == 1) { + run_fec_rs_fnt( word_size, n_data, n_parities, rflag, quadiron::fec::FecType::SYSTEMATIC); - } else if (word_size <= 8) { - run_fec_rs_fnt( + } else if (word_size == 2) { + run_fec_rs_fnt( word_size, n_data, n_parities, diff --git a/test/fec_utest.cpp b/test/fec_utest.cpp index 72e17a14..304907c3 100644 --- a/test/fec_utest.cpp +++ b/test/fec_utest.cpp @@ -40,9 +40,12 @@ class FecTestCommon : public ::testing::Test { public: const unsigned n_data = 3; const unsigned n_parities = 3; + const size_t pkt_size = 9; - void - run_test(fec::FecCode& fec, bool props_flag = false, bool is_nf4 = false) + void run_test_horizontal( + fec::FecCode& fec, + bool props_flag = false, + bool is_nf4 = false) { const int code_len = n_data + n_parities; @@ -52,7 +55,7 @@ class FecTestCommon : public ::testing::Test { vec::Vector data_frags(gf, n_data); vec::Vector copied_data_frags(gf, n_data); - vec::Vector encoded_frags(gf, fec.n); + vec::Vector encoded_frags(gf, fec.get_n_outputs()); vec::Vector received_frags(gf, n_data); 
vec::Vector decoded_frags(gf, n_data); std::vector ids; @@ -62,10 +65,10 @@ class FecTestCommon : public ::testing::Test { ids.push_back(i); } - std::vector props(code_len); + std::vector props(fec.get_n_outputs()); for (int j = 0; j < 1000; j++) { if (props_flag) { - for (int i = 0; i < code_len; i++) { + for (int i = 0; i < fec.get_n_outputs(); i++) { props[i] = quadiron::Properties(); } } @@ -83,8 +86,25 @@ class FecTestCommon : public ::testing::Test { std::random_shuffle(ids.begin(), ids.end()); for (unsigned i = 0; i < n_data; i++) { fragments_ids.set(i, ids.at(i)); - received_frags.set(i, encoded_frags.get(ids.at(i))); } + if (fec.type == fec::FecType::SYSTEMATIC) { + for (unsigned i = 0; i < n_data; i++) { + if (fragments_ids.get(i) < n_data) { + received_frags.set( + i, data_frags.get(fragments_ids.get(i))); + } else { + received_frags.set( + i, + encoded_frags.get(fragments_ids.get(i) - n_data)); + } + } + } else { + for (unsigned i = 0; i < n_data; i++) { + received_frags.set( + i, encoded_frags.get(fragments_ids.get(i))); + } + } + std::unique_ptr> context = fec.init_context_dec(fragments_ids, props); @@ -93,22 +113,97 @@ class FecTestCommon : public ::testing::Test { ASSERT_EQ(copied_data_frags, decoded_frags); } } -}; -using AllTypes = ::testing::Types; -TYPED_TEST_CASE(FecTestCommon, AllTypes); + void run_test_vertical( + fec::FecCode& fec, + bool props_flag = false, + bool has_meta = false) + { + const int code_len = n_data + n_parities; -TYPED_TEST(FecTestCommon, TestNf4) // NOLINT -{ - const int iter_count = arith::log2(sizeof(TypeParam)); + const quadiron::gf::Field& gf = fec.get_gf(); - for (int i = 1; i < iter_count; i++) { - const unsigned word_size = 1 << i; - fec::RsNf4 fec(word_size, this->n_data, this->n_parities); + vec::Buffers data_frags(n_data, pkt_size, has_meta); + vec::Buffers encoded_frags(fec.get_n_outputs(), pkt_size, has_meta); + vec::Buffers received_frags(n_data, pkt_size, has_meta); + vec::Buffers decoded_frags(n_data, 
pkt_size, has_meta); + std::vector ids; + vec::Vector fragments_ids(gf, n_data); + + // It's necessary to set `data_frags` all zeros for `RsNf4` as + // `data_frags` has not meta + data_frags.zero_fill(); + + for (int i = 0; i < code_len; i++) { + ids.push_back(i); + } + + std::vector props(fec.get_n_outputs()); + for (int j = 0; j < 1; j++) { + if (props_flag) { + for (int i = 0; i < fec.get_n_outputs(); i++) { + props[i] = quadiron::Properties(); + } + } + const std::vector& mem = data_frags.get_mem(); + for (unsigned i = 0; i < n_data; i++) { + char* buf = reinterpret_cast(mem[i]); + for (size_t j = 0; j < fec.buf_size; ++j) { + buf[j] = static_cast(gf.rand()); + } + } + if (has_meta) { + data_frags.reset_meta(); + } + + fec.encode(encoded_frags, props, 0, data_frags); + + std::random_shuffle(ids.begin(), ids.end()); + for (unsigned i = 0; i < n_data; i++) { + fragments_ids.set(i, ids.at(i)); + } + if (fec.type == fec::FecType::SYSTEMATIC) { + for (unsigned i = 0; i < n_data; i++) { + if (fragments_ids.get(i) < n_data) { + received_frags.copy( + data_frags, fragments_ids.get(i), i); + } else { + received_frags.copy( + encoded_frags, fragments_ids.get(i) - n_data, i); + } + } + } else { + for (unsigned i = 0; i < n_data; i++) { + received_frags.copy(encoded_frags, fragments_ids.get(i), i); + } + } - this->run_test(fec, true, true); + std::unique_ptr> context = + fec.init_context_dec( + fragments_ids, props, pkt_size, &decoded_frags); + + fec.decode(*context, decoded_frags, props, 0, received_frags); + + ASSERT_EQ(data_frags, decoded_frags); + } } -} + + void run_test( + fec::FecCode& fec, + bool props_flag = false, + bool is_nf4 = false, + bool vertical_support = false, + bool has_meta = false) + { + run_test_horizontal(fec, props_flag, is_nf4); + if (vertical_support) { + run_test_vertical(fec, props_flag, has_meta); + } + } +}; + +using AllTypes = ::testing::Types; +TYPED_TEST_CASE(FecTestCommon, AllTypes); TYPED_TEST(FecTestCommon, TestGf2nFft) // NOLINT { @@ 
-130,36 +225,64 @@ TYPED_TEST(FecTestCommon, TestGf2nFftAdd) // NOLINT } template -class FecTestNo128 : public FecTestCommon { +class FecTestNf4 : public FecTestCommon { }; -using No128 = ::testing::Types; -TYPED_TEST_CASE(FecTestNo128, No128); +using Nf4Type = ::testing::Types; +TYPED_TEST_CASE(FecTestNf4, Nf4Type); -TYPED_TEST(FecTestNo128, TestFnt) // NOLINT +TYPED_TEST(FecTestNf4, TestNf4) // NOLINT { - for (unsigned word_size = 1; word_size <= 2; ++word_size) { - fec::RsFnt fec( - fec::FecType::NON_SYSTEMATIC, - word_size, - this->n_data, - this->n_parities); - this->run_test(fec, true); + const int iter_count = arith::log2(sizeof(TypeParam)); + + for (int i = 1; i < iter_count; i++) { + const unsigned word_size = 1 << i; + fec::RsNf4 fec( + word_size, this->n_data, this->n_parities, this->pkt_size); + + this->run_test(fec, true, true, true); } } -TYPED_TEST(FecTestNo128, TestFntSys) // NOLINT +template +class FecTestFnt : public FecTestCommon { +}; + +using FntType = ::testing::Types; +TYPED_TEST_CASE(FecTestFnt, FntType); + +TYPED_TEST(FecTestFnt, TestFnt) // NOLINT { - for (unsigned word_size = 1; word_size <= 2; ++word_size) { - fec::RsFnt fec( - fec::FecType::SYSTEMATIC, - word_size, - this->n_data, - this->n_parities); - this->run_test(fec, true); - } + const unsigned word_size = sizeof(TypeParam) / 2; + fec::RsFnt fec( + fec::FecType::NON_SYSTEMATIC, + word_size, + this->n_data, + this->n_parities, + this->pkt_size); + + this->run_test(fec, true, false, true, true); } +TYPED_TEST(FecTestFnt, TestFntSys) // NOLINT +{ + const unsigned word_size = sizeof(TypeParam) / 2; + fec::RsFnt fec( + fec::FecType::SYSTEMATIC, + word_size, + this->n_data, + this->n_parities, + this->pkt_size); + this->run_test(fec, true, false, true, true); +} + +template +class FecTestNo128 : public FecTestCommon { +}; + +using No128 = ::testing::Types; +TYPED_TEST_CASE(FecTestNo128, No128); + TYPED_TEST(FecTestNo128, TestGfpFft) // NOLINT { for (size_t word_size = 1; word_size <= 4 
&& word_size < sizeof(TypeParam); diff --git a/test/fft_utest.cpp b/test/fft_utest.cpp index e13695d2..d4a21c0c 100644 --- a/test/fft_utest.cpp +++ b/test/fft_utest.cpp @@ -335,7 +335,10 @@ TYPED_TEST(FftTest, TestFft2kVecp) // NOLINT { auto gf(gf::create>(this->q)); const unsigned R = gf.get_primitive_root(); - const size_t size = 4; + const size_t size = 128; + const std::vector tests = {true}; + + const unsigned half_elelemnt_in_bits = sizeof(TypeParam) * CHAR_BIT / 2; ASSERT_EQ(arith::jacobi(R, this->q), -1); @@ -349,22 +352,33 @@ TYPED_TEST(FftTest, TestFft2kVecp) // NOLINT fft::Radix2 fft(gf, n, data_len, size); const int vec_n = fft.get_n(); - vec::Buffers v2(vec_n, size); - vec::Buffers _v2(vec_n, size); - for (unsigned len = 2; len <= n; len *= 2) { - vec::Buffers v(len, size); - vec::Buffers _v(_v2, 0, len); - for (int j = 0; j < 100; j++) { - for (unsigned i = 0; i < len; i++) { - TypeParam* mem = v.get(i); - for (size_t u = 0; u < size; u++) { - mem[u] = gf.rand(); + + for (bool const& has_meta : tests) { + vec::Buffers v2(vec_n, size, has_meta); + vec::Buffers _v2(vec_n, size, has_meta); + for (unsigned len = 2; len <= n; len *= 2) { + for (int j = 0; j < 100; j++) { + vec::Buffers v(len, size, has_meta); + vec::Buffers _v(_v2, 0, len); + for (unsigned i = 0; i < len; i++) { + TypeParam* mem = v.get(i); + if (has_meta) { + for (size_t u = 0; u < size; u++) { + mem[u] = + (gf.rand() << half_elelemnt_in_bits) + | gf.rand(); + } + } else { + for (size_t u = 0; u < size; u++) { + mem[u] = gf.rand(); + } + } } - } - fft.fft(v2, v); - fft.ifft(_v2, v2); + fft.fft(v2, v); + fft.ifft(_v2, v2); - ASSERT_EQ(v, _v); + ASSERT_EQ(v, _v); + } } } } @@ -375,7 +389,8 @@ TYPED_TEST(FftTest, TestNaiveVsFft2kVecp) // NOLINT { auto gf(gf::create>(this->q)); const unsigned R = gf.get_primitive_root(); - const size_t size = 2; + const size_t size = 128; + const std::vector tests = {true}; ASSERT_EQ(arith::jacobi(R, this->q), -1); @@ -393,29 +408,31 @@ TYPED_TEST(FftTest, 
TestNaiveVsFft2kVecp) // NOLINT ASSERT_EQ(fft_naive.get_n(), fft_2n.get_n()); - vec::Buffers v(n, size); - vec::Buffers fft1(n, size); - vec::Buffers fft2(n, size); - vec::Buffers ifft1(n, size); - vec::Buffers ifft2(n, size); - for (int j = 0; j < 100; j++) { - for (unsigned i = 0; i < n; i++) { - TypeParam* mem = v.get(i); - for (size_t u = 0; u < size; u++) { - mem[u] = gf.rand(); + for (bool const& has_meta : tests) { + vec::Buffers v(n, size, has_meta); + vec::Buffers fft1(n, size, has_meta); + vec::Buffers fft2(n, size, has_meta); + vec::Buffers ifft1(n, size, has_meta); + vec::Buffers ifft2(n, size, has_meta); + for (int j = 0; j < 100; j++) { + for (unsigned i = 0; i < n; i++) { + TypeParam* mem = v.get(i); + for (size_t u = 0; u < size; u++) { + mem[u] = gf.rand(); + } } - } - fft_naive.fft(fft1, v); - fft_2n.fft(fft2, v); + fft_naive.fft(fft1, v); + fft_2n.fft(fft2, v); - ASSERT_EQ(fft1, fft2); + ASSERT_EQ(fft1, fft2); - fft_naive.ifft(ifft1, fft1); - fft_2n.ifft(ifft2, fft2); + fft_naive.ifft(ifft1, fft1); + fft_2n.ifft(ifft2, fft2); - ASSERT_EQ(ifft1, ifft2); - ASSERT_EQ(ifft1, v); + ASSERT_EQ(ifft1, ifft2); + ASSERT_EQ(ifft1, v); + } } } } diff --git a/test/perf/fnt.cpp b/test/perf/fnt.cpp new file mode 100644 index 00000000..44758485 --- /dev/null +++ b/test/perf/fnt.cpp @@ -0,0 +1,526 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include + +#include +#include + +#include "arith.h" +#include "core.h" +#include "fec_rs_fnt.h" +#include "fft_2n.h" +#include "gf_prime.h" +#include "misc.h" +#include "vec_buffers.h" + +namespace vec = quadiron::vec; +namespace gf = quadiron::gf; +namespace fft = quadiron::fft; +namespace fec = quadiron::fec; + +#ifdef QUADIRON_USE_SIMD + +#include "simd.h" +#include "simd/simd.h" +#include "simd_fnt.h" + +namespace simd = quadiron::simd; + +template +void show(simd::VecType val) +{ + const size_t n = simd::countof(); + T buffer[n]; + simd::store_to_mem(reinterpret_cast(buffer), val); + for (unsigned i = 0; i < n; i++) { + std::cout << unsigned(buffer[i]) << " "; + } + std::cout << "\n"; +} + +template +void dump(T* buf, size_t bytes) +{ + const size_t nb = bytes / sizeof(T); + for (size_t i = 0; i < nb; ++i) { + std::cout << unsigned(buf[i]) << " "; + } + std::cout << "\n"; +} + +template +class PerfFnt : public ::testing::Test { + public: + PerfFnt() + { + if (sizeof(T) == 
2) { + this->q = 257; + this->word_size = 1; + } else if (sizeof(T) == 4) { + this->q = static_cast(65537); + this->word_size = 2; + } else { + throw "Wrong TypeParam for PerfFnt tests"; + } + + this->distribution = + std::make_unique>(0, q - 1); + } + + void + buf_rand_data(vec::Buffers& vec, bool has_meta = false, int _max = 0) + { + const T max = (_max == 0) ? std::numeric_limits::max() : _max; + std::uniform_int_distribution dis(0, max - 1); + + const std::vector& mem = vec.get_mem(); + const size_t size = vec.get_size(); + const size_t n = vec.get_n(); + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < size; j++) { + mem[i][j] = dis(quadiron::prng()); + } + } + + if (has_meta) { + const std::vector& meta = vec.get_meta(); + const size_t meta_size = vec.get_meta_size(); + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < meta_size; ++j) { + meta[i][j] = static_cast(dis(quadiron::prng())); + } + } + } + } + + simd::VecType rand_vec(T lower = 0, T upper_bound = 0) + { + const size_t n = simd::countof(); + T buf[n]; + simd::VecType* vec = reinterpret_cast(buf); + + T bound = upper_bound ? 
upper_bound : q; + bound -= lower; + + for (unsigned i = 0; i < n; i++) { + buf[i] = lower + distribution->operator()(quadiron::prng()) % bound; + } + + return vec[0]; + } + + template + void gen_rand_data(Tx* vec, size_t len) + { + for (size_t i = 0; i < len; i++) { + vec[i] = distribution->operator()(quadiron::prng()); + } + } + + simd::VecType copy(simd::VecType x) + { + const size_t n = simd::countof(); + T buf[n]; + T val[n]; + simd::VecType* vec = reinterpret_cast(buf); + + simd::store_to_mem(reinterpret_cast(val), x); + std::copy_n(val, n, buf); + + return vec[0]; + } + + template + void core_op_perf_single(const std::string& text, const TFunc& f) + { + const size_t len = simd::countof(); + // Init a Buffers to obtain aligned memory + quadiron::vec::Buffers data_buf(2, len); + T* x = data_buf.get(0); + T* y = data_buf.get(1); + + for (unsigned i = 0; i < len; ++i) { + x[i] = + 1 + + (distribution->operator()(quadiron::prng()) % (this->q - 1)); + y[i] = + 1 + + (distribution->operator()(quadiron::prng()) % (this->q - 1)); + } + + simd::VecType* vec_x = reinterpret_cast(x); + simd::VecType* vec_y = reinterpret_cast(y); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + simd::VecType _x = simd::load_to_reg(vec_x); + simd::VecType _y = simd::load_to_reg(vec_y); + + f(_x, _y); + + simd::store_to_mem(vec_x, _x); + } + uint64_t end = quadiron::hw_timer(); + double avg_cycles_nb = + static_cast(end - start) / static_cast(iters_nb); + std::cout << "#CPU cycles of " << text << ": " << avg_cycles_nb << "\n"; + } + + template + void core_op_perf(const std::string& text, const TFunc& f) + { + std::cout << text << "\n"; + std::cout << "\tVectors nb\t#CPU cycles\n"; + for (auto vec_len : arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + // Init a Buffers to obtain aligned memory + quadiron::vec::Buffers data_buf(2, len); + T* buf_x = data_buf.get(0); + T* buf_y = data_buf.get(1); + gen_rand_data(buf_x, len); + 
gen_rand_data(buf_y, len); + + simd::VecType* data_x = reinterpret_cast(buf_x); + simd::VecType* data_y = reinterpret_cast(buf_y); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + for (size_t j = 0; j < vec_len; ++j) { + simd::VecType x = simd::load_to_reg(&data_x[j]); + simd::VecType y = simd::load_to_reg(&data_y[j]); + + f(x, y); + + simd::store_to_mem(&data_x[j], x); + } + } + uint64_t end = quadiron::hw_timer(); + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + std::cout << "\n"; + } + + template + void butterfly_perf(const std::string& text, const TFunc& f) + { + std::cout << text << "\n"; + std::cout << "\tVectors nb\t#CPU cycles\n"; + for (auto vec_len : arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + // Init a Buffers to obtain aligned memory + quadiron::vec::Buffers data_buf(2, len); + T* buf_x = data_buf.get(0); + T* buf_y = data_buf.get(1); + gen_rand_data(buf_x, len); + gen_rand_data(buf_y, len); + + simd::VecType* data_x = reinterpret_cast(buf_x); + simd::VecType* data_y = reinterpret_cast(buf_y); + + T coef = 1 + + this->distribution->operator()(quadiron::prng()) + % (this->q - 2); + const simd::CtGsCase ct_case = simd::get_case(coef, this->q); + const simd::VecType c = simd::set_one(coef); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + for (size_t j = 0; j < vec_len; ++j) { + simd::VecType x = simd::load_to_reg(&data_x[j]); + simd::VecType y = simd::load_to_reg(&data_y[j]); + + f(ct_case, c, x, y); + + simd::store_to_mem(&data_x[j], x); + simd::store_to_mem(&data_y[j], y); + } + } + uint64_t end = quadiron::hw_timer(); + + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + std::cout << "\n"; + } + + 
template + void fft_perf(const std::string& text, size_t fft_len, const TFunc& f) + { + std::cout << text << " of length " << fft_len << "\n"; + std::cout << "\tVectors nb\t#CPU cycles\n"; + + for (auto vec_len : arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + vec::Buffers input(fft_len, len, true); + vec::Buffers output(fft_len, len, true); + + buf_rand_data(input); + buf_rand_data(output); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + (i % 2) ? f(output, input) : f(input, output); + } + uint64_t end = quadiron::hw_timer(); + + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + } + + void fnt_perf( + const std::string& text, + fec::FecCode& fec, + size_t fft_len, + int n_data, + size_t vec_len) + { + const size_t len = vec_len * simd::countof(); + const bool has_meta = true; + + vec::Buffers data_frags(n_data, len, has_meta); + vec::Buffers encoded_frags(fft_len, len, has_meta); + + // It's necessary to set `data_frags` all zeros for `RsNf4` as + // `data_frags` has no meta + data_frags.zero_fill(); + + std::vector props(fec.get_n_outputs()); + + buf_rand_data(data_frags); + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < iters_nb; ++i) { + for (int i = 0; i < fec.get_n_outputs(); i++) { + props[i] = quadiron::Properties(); + } + data_frags.reset_meta(); + + fec.encode(encoded_frags, props, 0, data_frags); + } + uint64_t end = quadiron::hw_timer(); + + double avg_cycles_nb = static_cast(end - start) + / static_cast(iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << text << "\t\t" << n_data << "\t\t" << fft_len - n_data << "\t\t" + << len << "\t\t" << avg_cycles_nb << "\n"; + } + + T q; + unsigned word_size; + std::unique_ptr> distribution; + std::vector arr_vec_len = + {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; + size_t 
iters_nb = 1e1; + std::vector arr_fft_len = {16, 32, 64, 128, 256}; + std::vector arr_k = {8, 16, 32, 64, 128}; +}; + +using AllTypes = ::testing::Types; +TYPED_TEST_CASE(PerfFnt, AllTypes); + +TYPED_TEST(PerfFnt, PerfSimdSingle) // NOLINT +{ + this->core_op_perf_single( + "Add", [](simd::VecType& x, const simd::VecType& y) { + x = simd::add(x, y); + }); + + this->core_op_perf_single( + "Sub", [](simd::VecType& x, const simd::VecType& y) { + x = simd::sub(x, y); + }); + + this->core_op_perf_single( + "Mul", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mul(x, y); + }); + + this->core_op_perf_single( + "Min", [](simd::VecType& x, const simd::VecType& y) { + x = simd::min(x, y); + }); +} + +TYPED_TEST(PerfFnt, PerfSimdBuf) // NOLINT +{ + this->core_op_perf("Add", [](simd::VecType& x, const simd::VecType& y) { + x = simd::add(x, y); + }); + + this->core_op_perf("Sub", [](simd::VecType& x, const simd::VecType& y) { + x = simd::sub(x, y); + }); + + this->core_op_perf("Mul", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mul(x, y); + }); + + this->core_op_perf("Min", [](simd::VecType& x, const simd::VecType& y) { + x = simd::min(x, y); + }); +} + +TYPED_TEST(PerfFnt, PerfModBuf) // NOLINT +{ + this->core_op_perf("ModAdd", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mod_add(x, y); + }); + + this->core_op_perf("ModSub", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mod_sub(x, y); + }); + + this->core_op_perf("ModMul", [](simd::VecType& x, const simd::VecType& y) { + x = simd::mod_mul(x, y); + }); +} + +TYPED_TEST(PerfFnt, PerfPackUnpack) // NOLINT +{ + std::cout << "Pack & Unpack" + << "\n"; + std::cout << "\tVectors nb\t#CPU cycles\n"; + for (const auto vec_len : this->arr_vec_len) { + const size_t len = vec_len * simd::countof(); + + // Init a Buffers to obtain aligned memory + quadiron::vec::Buffers data_buf(1, len); + TypeParam* buf_data = data_buf.get(0); + this->gen_rand_data(buf_data, len); + + 
quadiron::vec::Buffers meta_buf(1, vec_len); + simd::MetaType* buf_meta = meta_buf.get(0); + this->gen_rand_data(buf_meta, vec_len); + + simd::VecType* data = reinterpret_cast(buf_data); + simd::MetaType* meta = buf_meta; + + uint64_t start = quadiron::hw_timer(); + for (unsigned i = 0; i < this->iters_nb; ++i) { + for (size_t j = 0; j < vec_len; ++j) { + simd::VecType lo, hi; + + simd::VecType x = simd::load_to_reg(&data[j]); + + simd::unpack(meta[j], x, hi, lo); + simd::pack(lo, hi, x, meta[j]); + + simd::store_to_mem(&data[j], x); + } + } + uint64_t end = quadiron::hw_timer(); + double avg_cycles_nb = static_cast(end - start) + / static_cast(this->iters_nb) + / static_cast(vec_len); + + std::cout << "\t" << vec_len << "\t\t" << avg_cycles_nb << "\n"; + } + std::cout << "\n"; +} + +TYPED_TEST(PerfFnt, PerfButterfly) // NOLINT +{ + this->butterfly_perf( + "Butterfly_CT", + [](simd::CtGsCase ct_case, + const simd::VecType& c, + simd::VecType& x, + simd::VecType& y) { + simd::butterfly_ct(ct_case, c, x, y); + }); + + this->butterfly_perf( + "Butterfly_GS", + [](simd::CtGsCase ct_case, + const simd::VecType& c, + simd::VecType& x, + simd::VecType& y) { + simd::butterfly_gs(ct_case, c, x, y); + }); +} + +TYPED_TEST(PerfFnt, PerfFftRadix2) // NOLINT +{ + auto gf(gf::create>(this->q)); + + for (auto fft_len : this->arr_fft_len) { + fft::Radix2 fft_2n(gf, fft_len); + + this->fft_perf( + "FFT", + fft_len, + [&fft_2n]( + vec::Buffers& output, + vec::Buffers& input) { fft_2n.fft(output, input); }); + } +} + +TYPED_TEST(PerfFnt, PerfFntEnc) // NOLINT +{ + std::cout << "FNT performance\n"; + std::cout << "\tType\t\tk\t\tm\t\tpkt_size\t\t#CPU cycles\n"; + + for (auto fft_len : this->arr_fft_len) { + for (auto n_data : this->arr_k) { + if (n_data >= fft_len) { + continue; + } + const int n_parities = fft_len - n_data; + + for (auto vec_len : this->arr_vec_len) { + fec::RsFnt fec( + fec::FecType::NON_SYSTEMATIC, + this->word_size, + n_data, + n_parities, + vec_len); + + 
this->fnt_perf("Enc", fec, fft_len, n_data, vec_len); + } + } + } +} +#endif diff --git a/test/quadiron_c_utest.cpp b/test/quadiron_c_utest.cpp index f87a0e38..9e290db5 100644 --- a/test/quadiron_c_utest.cpp +++ b/test/quadiron_c_utest.cpp @@ -169,22 +169,6 @@ class QuadironCTest : public ::testing::Test { } } - ASSERT_EQ( - quadiron_fnt32_decode( - inst, - _data.data(), - _parity.data(), - missing_idxs.data(), - block_size), - 0); - - for (int i = 0; i < n_data; i++) { - ASSERT_TRUE(std::equal( - _ref_data[i], - _ref_data[i] + block_size, - _data[i] + metadata_size)); - } - for (int i = 0; i < n_data; i++) { if (missing_idxs[i]) { ASSERT_EQ( @@ -213,6 +197,39 @@ class QuadironCTest : public ::testing::Test { } } + for (int i = 0; i < n_data; i++) { + if (missing_idxs[i]) { + std::fill_n(_data[i], block_size + metadata_size, 0); + } + } + + for (int i = 0; i < n_parities; i++) { + if (missing_idxs[n_data + i]) { + std::fill_n(_parity[i], block_size + metadata_size, 0); + } + } + + // FIXME: for non-systematic FNT, `quadiron_fnt32_decode` will + // overwrite `_data` (that stores actually encoded fragments) + // by decoded data. Hence, if we test `reconstruct` after, we could use + // wrong fragments. A quick fix is to move the test of + // `quadiron_fnt32_decode` at the end. + ASSERT_EQ( + quadiron_fnt32_decode( + inst, + _data.data(), + _parity.data(), + missing_idxs.data(), + block_size), + 0); + + for (int i = 0; i < n_data; i++) { + ASSERT_TRUE(std::equal( + _ref_data[i], + _ref_data[i] + block_size, + _data[i] + metadata_size)); + } + quadiron_fnt32_delete(inst); } diff --git a/test/simd/test_simd_fnt.cpp b/test/simd/test_simd_fnt.cpp new file mode 100644 index 00000000..81635f90 --- /dev/null +++ b/test/simd/test_simd_fnt.cpp @@ -0,0 +1,359 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include + +#include +#include + +#include "arith.h" +#include "core.h" +#include "misc.h" + +#ifdef QUADIRON_USE_SIMD + +#include "simd.h" +#include "simd/simd.h" +#include "simd_fnt.h" + +namespace simd = quadiron::simd; + +template +void show(simd::VecType val) +{ + const size_t n = simd::countof(); + T buffer[n]; + simd::store_to_mem(reinterpret_cast(buffer), val); + for (unsigned i = 0; i < n; i++) { + std::cout << unsigned(buffer[i]) << " "; + } + std::cout << "\n"; +} + +template +void dump(T* buf, size_t bytes) +{ + const size_t nb = bytes / sizeof(T); + for (size_t i = 0; i < nb; ++i) { + std::cout << unsigned(buf[i]) << " "; + } + std::cout << "\n"; +} + +template +class SimdTestFnt : public ::testing::Test { + public: + SimdTestFnt() + { + if (sizeof(T) == 2) { + this->q = 257; + } else if (sizeof(T) == 4) { + this->q = static_cast(65537); + } else { + throw "Wrong TypeParam for SimdTestFnt tests"; + } + + this->distribution = + std::make_unique>(0, q - 1); + } + + simd::VecType rand_vec(T lower = 0, T upper_bound = 0) + { + const size_t n = simd::countof(); + T buf[n]; + simd::VecType* vec = reinterpret_cast(buf); + + T bound = upper_bound ? 
upper_bound : q; + bound -= lower; + + for (unsigned i = 0; i < n; i++) { + buf[i] = lower + distribution->operator()(quadiron::prng()) % bound; + } + + return vec[0]; + } + + simd::VecType copy(simd::VecType x) + { + const size_t n = simd::countof(); + T buf[n]; + T val[n]; + simd::VecType* vec = reinterpret_cast(buf); + + simd::store_to_mem(reinterpret_cast(val), x); + std::copy_n(val, n, buf); + + return vec[0]; + } + + bool is_equal(simd::VecType x, simd::VecType y) + { + return simd::is_zero(simd::bit_xor(x, y)); + } + + simd::VecType from_msb8_mask(simd::MetaType meta) + { + const size_t n = simd::countof(); + uint8_t buf[n]; + simd::VecType* vec = reinterpret_cast(buf); + + for (unsigned i = 0; i < n; ++i) { + buf[i] = meta & 1; + meta >>= 1; + } + + return vec[0]; + } + + simd::VecType mod_mul(simd::VecType x, simd::VecType y) + { + const size_t n = simd::countof(); + T _x[n]; + T _y[n]; + T _z[n]; + simd::store_to_mem(reinterpret_cast(_x), x); + simd::store_to_mem(reinterpret_cast(_y), y); + for (unsigned i = 0; i < n; i++) { + _z[i] = (quadiron::DoubleSizeVal(_x[i]) * _y[i]) % q; + } + + simd::VecType* vec = reinterpret_cast(_z); + + return vec[0]; + } + + /* Butterfly Cooley-Tukey operation + * x <- x + c * y + * y <- x - c * y + */ + void butterfly_ct(simd::VecType c, simd::VecType& x, simd::VecType& y) + { + const size_t n = simd::countof(); + T c_buf[n]; + T x_buf[n]; + T y_buf[n]; + + simd::store_to_mem(reinterpret_cast(c_buf), c); + simd::store_to_mem(reinterpret_cast(x_buf), x); + simd::store_to_mem(reinterpret_cast(y_buf), y); + + for (unsigned i = 0; i < n; ++i) { + T mul = (quadiron::DoubleSizeVal(c_buf[i]) * y_buf[i]) % q; + T u = (x_buf[i] + mul) % q; + T v = x_buf[i] >= mul ? 
x_buf[i] - mul : q + x_buf[i] - mul; + + x_buf[i] = u; + y_buf[i] = v; + } + + x = simd::load_to_reg(reinterpret_cast(x_buf)); + y = simd::load_to_reg(reinterpret_cast(y_buf)); + } + + /* Butterfly Gentleman-Sande operation + * x <- x + y + * y <- c * (x - y) + */ + void butterfly_gs(simd::VecType c, simd::VecType& x, simd::VecType& y) + { + const size_t n = simd::countof(); + T c_buf[n]; + T x_buf[n]; + T y_buf[n]; + + simd::store_to_mem(reinterpret_cast(c_buf), c); + simd::store_to_mem(reinterpret_cast(x_buf), x); + simd::store_to_mem(reinterpret_cast(y_buf), y); + + for (unsigned i = 0; i < n; ++i) { + T sub = x_buf[i] >= y_buf[i] ? x_buf[i] - y_buf[i] + : q + x_buf[i] - y_buf[i]; + T u = (x_buf[i] + y_buf[i]) % q; + T v = (quadiron::DoubleSizeVal(c_buf[i]) * sub) % q; + x_buf[i] = u; + y_buf[i] = v; + } + + x = simd::load_to_reg(reinterpret_cast(x_buf)); + y = simd::load_to_reg(reinterpret_cast(y_buf)); + } + + /* Butterfly Gentleman-Sande simple operation where y = 0 + * y <- c * x + */ + void butterfly_simple_gs(simd::VecType c, simd::VecType& x) + { + const size_t n = simd::countof(); + T c_buf[n]; + T x_buf[n]; + + simd::store_to_mem(reinterpret_cast(c_buf), c); + simd::store_to_mem(reinterpret_cast(x_buf), x); + + for (unsigned i = 0; i < n; ++i) { + x_buf[i] = (quadiron::DoubleSizeVal(c_buf[i]) * x_buf[i]) % q; + } + + x = simd::load_to_reg(reinterpret_cast(x_buf)); + } + + T q; + std::unique_ptr> distribution; + std::vector arr_vec_len = + {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; + size_t iters_nb = 1e3; + std::vector arr_fft_len = {128}; +}; + +using AllTypes = ::testing::Types; +TYPED_TEST_CASE(SimdTestFnt, AllTypes); + +TYPED_TEST(SimdTestFnt, TestMetaToMask) // NOLINT +{ + simd::MetaType meta = static_cast(0x89abcdefUL); + simd::VecType got = simd::from_msb8_mask(meta); + simd::VecType expected = this->from_msb8_mask(meta); + + ASSERT_TRUE(this->is_equal(expected, got)); +} + +TYPED_TEST(SimdTestFnt, TestModAddSub) // 
NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + simd::VecType x = this->rand_vec(this->q / 2); + simd::VecType y = this->rand_vec(this->q / 2); + + simd::VecType u = simd::mod_add(x, y); + simd::VecType v = simd::mod_sub(u, x); + simd::VecType z = simd::mod_add(v, x); + + ASSERT_TRUE(this->is_equal(y, v)); + ASSERT_TRUE(this->is_equal(u, z)); + } +} + +TYPED_TEST(SimdTestFnt, TestModNeg) // NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + simd::VecType x = this->rand_vec(); + + simd::VecType y = simd::mod_neg(x); + simd::VecType u = simd::mod_sub(simd::zero(), x); + + ASSERT_TRUE(this->is_equal(u, y)); + } +} + +TYPED_TEST(SimdTestFnt, TestModMul) // NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + simd::VecType x = this->rand_vec(0, this->q - 1); + simd::VecType y = this->rand_vec(); + + // check mod_mul + simd::VecType u = simd::mod_mul(x, y); + simd::VecType v = this->mod_mul(x, y); + ASSERT_TRUE(this->is_equal(v, u)); + + // check mod_mul_safe + simd::VecType a = simd::card_minus_one(); + simd::VecType b = simd::card_minus_one(); + simd::VecType c = this->mod_mul(a, b); + simd::VecType d = simd::mod_mul(a, b); + simd::VecType e = simd::mod_mul_safe(a, b); + + ASSERT_FALSE(this->is_equal(c, d)); + ASSERT_TRUE(this->is_equal(c, e)); + } +} + +TYPED_TEST(SimdTestFnt, TestButterflyCt) // NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + std::vector r_values = { + 1, + static_cast( + this->distribution->operator()(quadiron::prng())), + static_cast(this->q - 1)}; + + for (const TypeParam r : r_values) { + const simd::CtGsCase ct_case = + simd::get_case(r, this->q); + + simd::VecType c = simd::set_one(r); + + simd::VecType x = this->rand_vec(); + simd::VecType y = this->rand_vec(); + simd::VecType x_expected = this->copy(x); + simd::VecType y_expected = this->copy(y); + + this->butterfly_ct(c, x_expected, y_expected); + simd::butterfly_ct(ct_case, c, x, y); + + ASSERT_TRUE(this->is_equal(x_expected, x)); + ASSERT_TRUE(this->is_equal(y_expected, y)); + } + } +} 
+ +TYPED_TEST(SimdTestFnt, TestButterflyGs) // NOLINT +{ + for (unsigned i = 0; i < 100; ++i) { + std::vector r_values = { + 1, + static_cast( + this->distribution->operator()(quadiron::prng())), + static_cast(this->q - 1)}; + + for (const TypeParam r : r_values) { + const simd::CtGsCase ct_case = + simd::get_case(r, this->q); + + simd::VecType c = simd::set_one(r); + + simd::VecType x = this->rand_vec(); + simd::VecType y = this->rand_vec(); + simd::VecType x_expected = this->copy(x); + simd::VecType y_expected = this->copy(y); + + this->butterfly_gs(c, x_expected, y_expected); + simd::butterfly_gs(ct_case, c, x, y); + + ASSERT_TRUE(this->is_equal(x_expected, x)); + ASSERT_TRUE(this->is_equal(y_expected, y)); + + this->butterfly_simple_gs(c, x_expected); + simd::butterfly_simple_gs(ct_case, c, x); + + ASSERT_TRUE(this->is_equal(x_expected, x)); + } + } +} + +#endif