diff --git a/third_party/jpeg-xl/lib/extras/size_constraints.h b/third_party/jpeg-xl/lib/extras/size_constraints.h index cf06f8cb22814..3f31eb1ac7a98 100644 --- a/third_party/jpeg-xl/lib/extras/size_constraints.h +++ b/third_party/jpeg-xl/lib/extras/size_constraints.h @@ -16,22 +16,24 @@ namespace jxl { struct SizeConstraints { // Upper limit on pixel dimensions/area, enforced by VerifyDimensions // (called from decoders). Fuzzers set smaller values to limit memory use. - uint32_t dec_max_xsize = 0xFFFFFFFFu; - uint32_t dec_max_ysize = 0xFFFFFFFFu; - uint64_t dec_max_pixels = 0xFFFFFFFFu; // Might be up to ~0ull + // Default values correspond to JXL level 10. + uint32_t dec_max_xsize = 1u << 30; + uint32_t dec_max_ysize = 1u << 30; + uint64_t dec_max_pixels = static_cast(1u) << 40; }; template ::value>::type> Status VerifyDimensions(const SizeConstraints* constraints, T xs, T ys) { - if (!constraints) return true; + SizeConstraints limit = {}; + if (constraints) limit = *constraints; if (xs == 0 || ys == 0) return JXL_FAILURE("Empty image."); - if (xs > constraints->dec_max_xsize) return JXL_FAILURE("Image too wide."); - if (ys > constraints->dec_max_ysize) return JXL_FAILURE("Image too tall."); + if (xs > limit.dec_max_xsize) return JXL_FAILURE("Image too wide."); + if (ys > limit.dec_max_ysize) return JXL_FAILURE("Image too tall."); const uint64_t num_pixels = static_cast(xs) * ys; - if (num_pixels > constraints->dec_max_pixels) { + if (num_pixels > limit.dec_max_pixels) { return JXL_FAILURE("Image too big."); } diff --git a/third_party/jpeg-xl/lib/jxl/base/float.h b/third_party/jpeg-xl/lib/jxl/base/float.h index 50af582b6f54a..f32d226248867 100644 --- a/third_party/jpeg-xl/lib/jxl/base/float.h +++ b/third_party/jpeg-xl/lib/jxl/base/float.h @@ -18,7 +18,7 @@ namespace jxl { namespace detail { -// Based on highway scalar implementation, for testing + static JXL_INLINE float LoadFloat16(uint16_t bits16) { const uint32_t sign = bits16 >> 15; const uint32_t biased_exp = (bits16 >> 10) & 0x1F; @@ -32,7 +32,8 @@ static JXL_INLINE float LoadFloat16(uint16_t bits16) { } // Normalized: convert the representation directly (faster than ldexp/tables). - const uint32_t biased_exp32 = biased_exp + (127 - 15); + const uint32_t biased_exp32 = + biased_exp == 0b11111 ? 0b11111111 : biased_exp + (127 - 15); const uint32_t mantissa32 = mantissa << (23 - 10); const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; diff --git a/third_party/jpeg-xl/lib/jxl/cms/color_encoding_cms.h b/third_party/jpeg-xl/lib/jxl/cms/color_encoding_cms.h index 28a74d7d76567..953e69dafdb97 100644 --- a/third_party/jpeg-xl/lib/jxl/cms/color_encoding_cms.h +++ b/third_party/jpeg-xl/lib/jxl/cms/color_encoding_cms.h @@ -511,7 +511,7 @@ struct ColorEncoding { // Checks if the color space and transfer function are the same, ignoring // rendering intent and ICC bytes bool SameColorEncoding(const ColorEncoding& other) const { - return SameColorSpace(other) && tf.IsSame(other.tf); + return SameColorSpace(other) && tf.IsSame(other.tf) && !cmyk && !other.cmyk; } // Returns true if all fields have been initialized (possibly to kUnknown). diff --git a/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h index 73148b4ff7daf..e6641d8c60b97 100644 --- a/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h +++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal.h @@ -150,6 +150,7 @@ struct ColorEncoding : public Fields { Status SetICC(IccBytes&& icc, const JxlCmsInterface* cms) { JXL_ENSURE(cms != nullptr); JXL_ENSURE(!icc.empty()); + storage_.have_fields = true; want_icc_ = storage_.SetFieldsFromICC(std::move(icc), *cms); return want_icc_; } diff --git a/third_party/jpeg-xl/lib/jxl/dec_ans.cc b/third_party/jpeg-xl/lib/jxl/dec_ans.cc index d459d49181e38..5807301d9abed 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_ans.cc +++ b/third_party/jpeg-xl/lib/jxl/dec_ans.cc @@ -126,9 +126,9 @@ Status ReadHistogram(int precision_bits, std::vector* counts, input->Refill(); // for PeekFixedBits + Advance int idx = input->PeekFixedBits<7>(); input->Consume(huff[idx][0]); - logcounts[i] = huff[idx][1]; + logcounts[i] = int(huff[idx][1]) - 1; // The RLE symbol. - if (logcounts[i] == ANS_LOG_TAB_SIZE + 1) { + if (logcounts[i] == ANS_LOG_TAB_SIZE) { int rle_length = DecodeVarLenUint8(input); same[i] = rle_length + 5; i += rle_length + 3; @@ -142,7 +142,7 @@ Status ReadHistogram(int precision_bits, std::vector* counts, // Invalid input, e.g. due to invalid usage of RLE. if (omit_pos < 0) return JXL_FAILURE("Invalid histogram."); if (static_cast(omit_pos) + 1 < logcounts.size() && - logcounts[omit_pos + 1] == ANS_TAB_SIZE + 1) { + logcounts[omit_pos + 1] == ANS_LOG_TAB_SIZE) { return JXL_FAILURE("Invalid histogram."); } int prev = 0; @@ -158,18 +158,17 @@ Status ReadHistogram(int precision_bits, std::vector* counts, (*counts)[i] = prev; numsame--; } else { - unsigned int code = logcounts[i]; + int code = logcounts[i]; // omit_pos may not be negative at this point (checked before). - if (i == static_cast(omit_pos)) { + if (i == static_cast(omit_pos) || code < 0) { continue; - } else if (code == 0) { - continue; - } else if (code == 1) { - (*counts)[i] = 1; + } else if (shift == 0 || code == 0) { + // `shift = 0` means `bitcount = 0` + (*counts)[i] = 1 << code; } else { - int bitcount = GetPopulationCountPrecision(code - 1, shift); - (*counts)[i] = (1u << (code - 1)) + - (input->ReadBits(bitcount) << (code - 1 - bitcount)); + int bitcount = GetPopulationCountPrecision(code, shift); + (*counts)[i] = (1 << code) + + (input->ReadBits(bitcount) << (code - bitcount)); } } total_count += (*counts)[i]; diff --git a/third_party/jpeg-xl/lib/jxl/dec_ans.h b/third_party/jpeg-xl/lib/jxl/dec_ans.h index 4e437fffa80fb..f68203c6d47f0 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_ans.h +++ b/third_party/jpeg-xl/lib/jxl/dec_ans.h @@ -379,6 +379,8 @@ class ANSSymbolReader { bool IsSingleValueAndAdvance(size_t ctx, uint32_t* value, size_t count) { // TODO(veluca): No optimization for Huffman mode yet. if (use_prefix_code_) return false; + // TODO(eustas): Check if we could deal with copy tail as well. + if (num_to_copy_ != 0) return false; // TODO(eustas): propagate "degenerate_symbol" to simplify this method. const uint32_t res = state_ & (ANS_TAB_SIZE - 1u); const AliasTable::Entry* table = &alias_tables_[ctx << log_alpha_size_]; diff --git a/third_party/jpeg-xl/lib/jxl/dec_cache.cc b/third_party/jpeg-xl/lib/jxl/dec_cache.cc index 92a59c1487105..4c1de5a47c7b8 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_cache.cc +++ b/third_party/jpeg-xl/lib/jxl/dec_cache.cc @@ -106,9 +106,7 @@ Status PassesDecoderState::PreparePipeline(const FrameHeader& frame_header, PipelineOptions options) { JxlMemoryManager* memory_manager = this->memory_manager(); size_t num_c = 3 + frame_header.nonserialized_metadata->m.num_extra_channels; - bool render_noise = - (options.render_noise && (frame_header.flags & FrameHeader::kNoise) != 0); - size_t num_tmp_c = render_noise ? 3 : 0; + size_t num_tmp_c = options.render_noise ? 3 : 0; if (frame_header.CanBeReferenced()) { // Necessary so that SetInputSizes() can allocate output buffers as needed. @@ -169,7 +167,8 @@ Status PassesDecoderState::PreparePipeline(const FrameHeader& frame_header, ec++) { if (frame_header.extra_channel_upsampling[ec] != 1) { JXL_RETURN_IF_ERROR(builder.AddStage(GetUpsamplingStage( - frame_header.nonserialized_metadata->transform_data, 3 + ec, + memory_manager, frame_header.nonserialized_metadata->transform_data, + 3 + ec, CeilLog2Nonzero(frame_header.extra_channel_upsampling[ec])))); } } @@ -191,11 +190,11 @@ Status PassesDecoderState::PreparePipeline(const FrameHeader& frame_header, (late_ec_upsample ? frame_header.extra_channel_upsampling.size() : 0); for (size_t c = 0; c < nb_channels; c++) { JXL_RETURN_IF_ERROR(builder.AddStage(GetUpsamplingStage( - frame_header.nonserialized_metadata->transform_data, c, - CeilLog2Nonzero(frame_header.upsampling)))); + memory_manager, frame_header.nonserialized_metadata->transform_data, + c, CeilLog2Nonzero(frame_header.upsampling)))); } } - if (render_noise) { + if (options.render_noise) { JXL_RETURN_IF_ERROR(builder.AddStage(GetConvolveNoiseStage(num_c))); JXL_RETURN_IF_ERROR(builder.AddStage(GetAddNoiseStage( shared->image_features.noise_params, shared->cmap.base(), num_c))); @@ -330,6 +329,11 @@ Status PassesDecoderState::PreparePipeline(const FrameHeader& frame_header, } } linear = false; + } else { + auto cms_stage = GetCmsStage(output_encoding_info, false); + if (cms_stage) { + JXL_RETURN_IF_ERROR(builder.AddStage(std::move(cms_stage))); + } } (void)linear; diff --git a/third_party/jpeg-xl/lib/jxl/dec_cache.h b/third_party/jpeg-xl/lib/jxl/dec_cache.h index 9d3204faa7311..f728f883f6814 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_cache.h +++ b/third_party/jpeg-xl/lib/jxl/dec_cache.h @@ -171,7 +171,8 @@ struct PassesDecoderState { used_acs = 0; - upsampler8x = GetUpsamplingStage(shared->metadata->transform_data, 0, 3); + upsampler8x = GetUpsamplingStage(memory_manager, + shared->metadata->transform_data, 0, 3); if (frame_header.loop_filter.epf_iters > 0) { JXL_ASSIGN_OR_RETURN( sigma, diff --git a/third_party/jpeg-xl/lib/jxl/dec_frame.cc b/third_party/jpeg-xl/lib/jxl/dec_frame.cc index 04c3ee0c9c2f0..2b9d373dc201a 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_frame.cc +++ b/third_party/jpeg-xl/lib/jxl/dec_frame.cc @@ -539,7 +539,10 @@ Status FrameDecoder::ProcessACGroup(size_t ac_group_id, } decoded_passes_per_ac_group_[ac_group_id] += num_passes; - if ((frame_header_.flags & FrameHeader::kNoise) != 0) { + const bool render_noise = + ((frame_header_.flags & FrameHeader::kNoise) != 0) && + (frame_header_.dc_level == 0); + if (render_noise) { PrepareNoiseInput(*dec_state_, frame_dim_, frame_header_, ac_group_id, thread); } @@ -654,12 +657,15 @@ Status FrameDecoder::ProcessSections(const SectionInfo* sections, size_t num, "DecodeDCGroup")); } + const bool render_noise = + ((frame_header_.flags & FrameHeader::kNoise) != 0) && + (frame_header_.dc_level == 0); if (!HasDcGroupToDecode() && !finalized_dc_) { PassesDecoderState::PipelineOptions pipeline_options; pipeline_options.use_slow_render_pipeline = use_slow_rendering_pipeline_; pipeline_options.coalescing = coalescing_; pipeline_options.render_spotcolors = render_spotcolors_; - pipeline_options.render_noise = true; + pipeline_options.render_noise = render_noise; JXL_RETURN_IF_ERROR(dec_state_->PreparePipeline( frame_header_, &frame_header_.nonserialized_metadata->m, decoded_, pipeline_options)); diff --git a/third_party/jpeg-xl/lib/jxl/dec_frame.h b/third_party/jpeg-xl/lib/jxl/dec_frame.h index 26412fd2f1860..fecdc6c209492 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_frame.h +++ b/third_party/jpeg-xl/lib/jxl/dec_frame.h @@ -271,6 +271,8 @@ class FrameDecoder { JXL_RETURN_IF_ERROR(dec_state_->render_pipeline->PrepareForThreads( storage_size, use_group_ids)); } + JXL_RETURN_IF_ERROR( + dec_state_->upsampler8x->PrepareForThreads(num_threads)); return true; } diff --git a/third_party/jpeg-xl/lib/jxl/dec_huffman.cc b/third_party/jpeg-xl/lib/jxl/dec_huffman.cc index 849b1a5f647a1..9086f873d22bf 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_huffman.cc +++ b/third_party/jpeg-xl/lib/jxl/dec_huffman.cc @@ -239,20 +239,4 @@ bool HuffmanDecodingData::ReadFromBitStream(size_t alphabet_size, return (table_size > 0); } -// Decodes the next Huffman coded symbol from the bit-stream. -uint16_t HuffmanDecodingData::ReadSymbol(BitReader* br) const { - size_t n_bits; - const HuffmanCode* table = table_.data(); - table += br->PeekBits(kHuffmanTableBits); - n_bits = table->bits; - if (n_bits > kHuffmanTableBits) { - br->Consume(kHuffmanTableBits); - n_bits -= kHuffmanTableBits; - table += table->value; - table += br->PeekBits(n_bits); - } - br->Consume(table->bits); - return table->value; -} - } // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/dec_huffman.h b/third_party/jpeg-xl/lib/jxl/dec_huffman.h index 162c3e309c4d1..9d9eff1e90e27 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_huffman.h +++ b/third_party/jpeg-xl/lib/jxl/dec_huffman.h @@ -19,10 +19,24 @@ static constexpr size_t kHuffmanTableBits = 8u; struct HuffmanDecodingData { // Decodes the Huffman code lengths from the bit-stream and fills in the // pre-allocated table with the corresponding 2-level Huffman decoding table. - // Returns false if the Huffman code lengths can not de decoded. + // Returns false if the Huffman code lengths can not be decoded. bool ReadFromBitStream(size_t alphabet_size, BitReader* br); - uint16_t ReadSymbol(BitReader* br) const; + // Decodes the next Huffman coded symbol from the bit-stream. + JXL_INLINE uint16_t ReadSymbol(BitReader* br) const { + size_t n_bits; + const HuffmanCode* table = table_.data(); + table += br->PeekBits(kHuffmanTableBits); + n_bits = table->bits; + if (n_bits > kHuffmanTableBits) { + br->Consume(kHuffmanTableBits); + n_bits -= kHuffmanTableBits; + table += table->value; + table += br->PeekBits(n_bits); + } + br->Consume(table->bits); + return table->value; + } std::vector table_; }; diff --git a/third_party/jpeg-xl/lib/jxl/dec_modular.cc b/third_party/jpeg-xl/lib/jxl/dec_modular.cc index e8dc17fa7dc49..8840e99e4cb86 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_modular.cc +++ b/third_party/jpeg-xl/lib/jxl/dec_modular.cc @@ -127,6 +127,14 @@ Status int_to_float(const pixel_type* const JXL_RESTRICT row_in, } int exp = (f >> mant_bits); int mantissa = (f & ((1 << mant_bits) - 1)); + if (exp == (1 << exp_bits) - 1) { + // NaN or infinity + f = (signbit ? 0x80000000 : 0); + f |= 0b11111111 << 23; + f |= mantissa << mant_shift; + memcpy(&row_out[x], &f, 4); + continue; + } mantissa <<= mant_shift; // Try to normalize only if there is space for maneuver. if (exp == 0 && exp_bits < 8) { diff --git a/third_party/jpeg-xl/lib/jxl/dec_xyb.cc b/third_party/jpeg-xl/lib/jxl/dec_xyb.cc index 66a45a7dde596..59677f388b03d 100644 --- a/third_party/jpeg-xl/lib/jxl/dec_xyb.cc +++ b/third_party/jpeg-xl/lib/jxl/dec_xyb.cc @@ -237,7 +237,7 @@ Status OutputEncodingInfo::SetFromMetadata(const CodecMetadata& metadata) { bool orig_grey = orig_color_encoding.IsGray(); return SetColorEncoding(!xyb_encoded || orig_ok ? orig_color_encoding - : ColorEncoding::LinearSRGB(orig_grey)); + : ColorEncoding::SRGB(orig_grey)); } Status OutputEncodingInfo::MaybeSetColorEncoding( diff --git a/third_party/jpeg-xl/lib/jxl/decode.cc b/third_party/jpeg-xl/lib/jxl/decode.cc index 302e5115cd6c3..c1822bc721ced 100644 --- a/third_party/jpeg-xl/lib/jxl/decode.cc +++ b/third_party/jpeg-xl/lib/jxl/decode.cc @@ -2659,12 +2659,27 @@ JxlDecoderStatus JxlDecoderSetOutputColorProfile( if (dec->post_headers) { return JXL_API_ERROR("too late to set the color encoding"); } + auto& output_encoding = dec->passes_state->output_encoding_info; + auto& orig_encoding = dec->image_metadata.color_encoding; + jxl::ColorEncoding c_out; + bool same_encoding = false; + if (color_encoding) { + JXL_API_RETURN_IF_ERROR(c_out.FromExternal(*color_encoding)); + same_encoding = c_out.SameColorEncoding(output_encoding.color_encoding); + } if ((!dec->passes_state->output_encoding_info.cms_set) && - (icc_data != nullptr)) { + (icc_data != nullptr || + (!dec->image_metadata.xyb_encoded && !same_encoding))) { return JXL_API_ERROR( "must set color management system via JxlDecoderSetCms"); } - auto& output_encoding = dec->passes_state->output_encoding_info; + if (!orig_encoding.HaveFields() && + dec->passes_state->output_encoding_info.cms_set) { + std::vector tmp_icc = orig_encoding.ICC(); + JXL_API_RETURN_IF_ERROR(orig_encoding.SetICC( + std::move(tmp_icc), &output_encoding.color_management_system)); + output_encoding.orig_color_encoding = orig_encoding; + } if (color_encoding) { if (dec->image_metadata.color_encoding.IsGray() && color_encoding->color_space != JXL_COLOR_SPACE_GRAY && @@ -2674,13 +2689,9 @@ JxlDecoderStatus JxlDecoderSetOutputColorProfile( if (color_encoding->color_space == JXL_COLOR_SPACE_UNKNOWN) { return JXL_API_ERROR("Unknown output colorspace"); } - jxl::ColorEncoding c_out; - JXL_API_RETURN_IF_ERROR(c_out.FromExternal(*color_encoding)); JXL_API_RETURN_IF_ERROR(!c_out.ICC().empty()); - if (!c_out.SameColorEncoding(output_encoding.color_encoding)) { - JXL_API_RETURN_IF_ERROR(output_encoding.MaybeSetColorEncoding(c_out)); - dec->image_metadata.color_encoding = output_encoding.color_encoding; - } + JXL_API_RETURN_IF_ERROR(output_encoding.MaybeSetColorEncoding(c_out)); + dec->image_metadata.color_encoding = output_encoding.color_encoding; return JXL_DEC_SUCCESS; } // icc_data != nullptr diff --git a/third_party/jpeg-xl/lib/jxl/decode_test.cc b/third_party/jpeg-xl/lib/jxl/decode_test.cc index d6e610b9a5cd6..48deaa22bbd81 100644 --- a/third_party/jpeg-xl/lib/jxl/decode_test.cc +++ b/third_party/jpeg-xl/lib/jxl/decode_test.cc @@ -1096,16 +1096,16 @@ TEST(DecodeTest, IccProfileTestXybEncoded) { JxlDecoderGetColorAsEncodedProfile( dec, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding)); EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries); - // The API returns LINEAR by default when the colorspace cannot be represented + // The API returns SRGB by default when the colorspace cannot be represented // by enum values. - EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function); + EXPECT_EQ(JXL_TRANSFER_FUNCTION_SRGB, pixel_encoding.transfer_function); // Test the same but with integer format. EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsEncodedProfile( dec, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding)); EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries); - EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function); + EXPECT_EQ(JXL_TRANSFER_FUNCTION_SRGB, pixel_encoding.transfer_function); // Test after setting the preferred color profile to non-linear sRGB: // for XYB images with ICC profile, this setting is expected to take effect. @@ -1800,11 +1800,14 @@ void SetPreferredColorProfileTest( xsize, ysize, num_channels, params); auto all_encodings = jxl::test::AllEncodings(); // TODO(firsching): understand why XYB does not work together with icc_dst. + // TODO(jon): fix XYB output space in general + /* if (!icc_dst) { all_encodings.push_back( {jxl::ColorSpace::kXYB, jxl::WhitePoint::kD65, jxl::Primaries::kCustom, jxl::TransferFunction::kUnknown, jxl::RenderingIntent::kPerceptual}); } + */ for (const auto& c1 : all_encodings) { jxl::ColorEncoding c_out = jxl::test::ColorEncodingFromDescriptor(c1); float intensity_out = intensity_in; diff --git a/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc index a8bf24160f617..1ba1d13e82bde 100644 --- a/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc +++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc @@ -810,7 +810,7 @@ StatusOr ReconstructImage( options.use_slow_render_pipeline = false; options.coalescing = false; options.render_spotcolors = false; - options.render_noise = true; + options.render_noise = ((frame_header.flags & FrameHeader::kNoise) != 0); JXL_RETURN_IF_ERROR(dec_state.PreparePipeline( frame_header, &shared.metadata->m, &decoded, options)); diff --git a/third_party/jpeg-xl/lib/jxl/enc_modular.cc b/third_party/jpeg-xl/lib/jxl/enc_modular.cc index 1620a5be640fd..24a4f143e1d84 100644 --- a/third_party/jpeg-xl/lib/jxl/enc_modular.cc +++ b/third_party/jpeg-xl/lib/jxl/enc_modular.cc @@ -164,12 +164,19 @@ Status float_to_int(const float* const row_in, pixel_type* const row_out, continue; } int exp = (f >> 23) - 127; - if (exp == 128) return JXL_FAILURE("Inf/NaN not allowed"); int mantissa = (f & 0x007fffff); // broke up the binary32 into its parts, now reassemble into // arbitrary float + if (exp == 128) { + // NaN or infinity + f = (signbit ? sign : 0); + f |= ((1 << exp_bits) - 1) << mant_bits; + f |= mantissa >> mant_shift; + row_out[x] = static_cast(f); + continue; + } exp += exp_bias; - if (exp < 0) { // will become a subnormal number + if (exp <= 0) { // will become a subnormal number // add implicit leading 1 to mantissa mantissa |= 0x00800000; if (exp < -mant_bits) { @@ -182,8 +189,8 @@ Status float_to_int(const float* const row_in, pixel_type* const row_out, exp = 0; } // exp should be representable in exp_bits, otherwise input was - // invalid - if (exp > max_exp) return JXL_FAILURE("Invalid float exponent"); + // invalid; max_exp is NaN or infinity + if (exp >= max_exp) return JXL_FAILURE("Invalid float exponent"); if (mantissa & ((1 << mant_shift) - 1)) { return JXL_FAILURE("%g is losing precision (mant: %x)", row_in[x], mantissa); diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h index 3c9b83addd56d..0bf60f708d26f 100644 --- a/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/render_pipeline_stage.h @@ -16,6 +16,8 @@ namespace jxl { +class FrameDecoder; + // The first pixel in the input to RenderPipelineStage will be located at // this position. Pixels before this position may be accessed as padding. // This should be at least the RoundUpTo(maximum padding / 2, maximum vector @@ -170,6 +172,7 @@ class RenderPipelineStage { friend class RenderPipeline; friend class SimpleRenderPipeline; friend class LowMemoryRenderPipeline; + friend FrameDecoder; // for PrepareStorage invoking PrepareForThreads }; } // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc index 15cce2b5eb705..e8688927fc165 100644 --- a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.cc @@ -24,10 +24,14 @@ namespace HWY_NAMESPACE { class CmsStage : public RenderPipelineStage { public: - explicit CmsStage(OutputEncodingInfo output_encoding_info) + explicit CmsStage(OutputEncodingInfo output_encoding_info, bool linear) : RenderPipelineStage(RenderPipelineStage::Settings()), output_encoding_info_(std::move(output_encoding_info)) { - c_src_ = output_encoding_info_.linear_color_encoding; + if (linear) { + c_src_ = output_encoding_info_.linear_color_encoding; + } else { + c_src_ = output_encoding_info_.orig_color_encoding; + } } bool IsNeeded() const { @@ -47,31 +51,57 @@ class CmsStage : public RenderPipelineStage { JXL_ENSURE(xsize <= xsize_); // TODO(firsching): handle grey case separately // interleave - float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); - float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); - float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); - float* mutable_buf_src = color_space_transform->BufSrc(thread_id); - - for (size_t x = 0; x < xsize; x++) { - mutable_buf_src[3 * x + 0] = row0[x]; - mutable_buf_src[3 * x + 1] = row1[x]; - mutable_buf_src[3 * x + 2] = row2[x]; - } - const float* buf_src = mutable_buf_src; - float* JXL_RESTRICT buf_dst = color_space_transform->BufDst(thread_id); - JXL_RETURN_IF_ERROR( - color_space_transform->Run(thread_id, buf_src, buf_dst, xsize)); - // de-interleave - for (size_t x = 0; x < xsize; x++) { - row0[x] = buf_dst[3 * x + 0]; - row1[x] = buf_dst[3 * x + 1]; - row2[x] = buf_dst[3 * x + 2]; + if (c_src_.IsCMYK()) { + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + float* JXL_RESTRICT row3 = GetInputRow(input_rows, 3, 0); + float* mutable_buf_src = color_space_transform->BufSrc(thread_id); + + for (size_t x = 0; x < xsize; x++) { + mutable_buf_src[4 * x + 0] = row0[x]; + mutable_buf_src[4 * x + 1] = row1[x]; + mutable_buf_src[4 * x + 2] = row2[x]; + mutable_buf_src[4 * x + 3] = row3[x]; + } + const float* buf_src = mutable_buf_src; + float* JXL_RESTRICT buf_dst = color_space_transform->BufDst(thread_id); + JXL_RETURN_IF_ERROR( + color_space_transform->Run(thread_id, buf_src, buf_dst, xsize)); + // de-interleave + for (size_t x = 0; x < xsize; x++) { + row0[x] = buf_dst[3 * x + 0]; + row1[x] = buf_dst[3 * x + 1]; + row2[x] = buf_dst[3 * x + 2]; + } + + } else { + float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0); + float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0); + float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0); + float* mutable_buf_src = color_space_transform->BufSrc(thread_id); + + for (size_t x = 0; x < xsize; x++) { + mutable_buf_src[3 * x + 0] = row0[x]; + mutable_buf_src[3 * x + 1] = row1[x]; + mutable_buf_src[3 * x + 2] = row2[x]; + } + const float* buf_src = mutable_buf_src; + float* JXL_RESTRICT buf_dst = color_space_transform->BufDst(thread_id); + JXL_RETURN_IF_ERROR( + color_space_transform->Run(thread_id, buf_src, buf_dst, xsize)); + // de-interleave + for (size_t x = 0; x < xsize; x++) { + row0[x] = buf_dst[3 * x + 0]; + row1[x] = buf_dst[3 * x + 1]; + row2[x] = buf_dst[3 * x + 2]; + } } return true; } RenderPipelineChannelMode GetChannelMode(size_t c) const final { - return c < 3 ? RenderPipelineChannelMode::kInPlace - : RenderPipelineChannelMode::kIgnored; + return c < (c_src_.IsCMYK() ? 4 : 3) ? RenderPipelineChannelMode::kInPlace + : RenderPipelineChannelMode::kIgnored; } const char* GetName() const override { return "Cms"; } @@ -104,8 +134,8 @@ class CmsStage : public RenderPipelineStage { }; std::unique_ptr GetCmsStage( - const OutputEncodingInfo& output_encoding_info) { - auto stage = jxl::make_unique(output_encoding_info); + const OutputEncodingInfo& output_encoding_info, bool linear) { + auto stage = jxl::make_unique(output_encoding_info, linear); if (!stage->IsNeeded()) return nullptr; return stage; } @@ -121,8 +151,8 @@ namespace jxl { HWY_EXPORT(GetCmsStage); std::unique_ptr GetCmsStage( - const OutputEncodingInfo& output_encoding_info) { - return HWY_DYNAMIC_DISPATCH(GetCmsStage)(output_encoding_info); + const OutputEncodingInfo& output_encoding_info, bool linear) { + return HWY_DYNAMIC_DISPATCH(GetCmsStage)(output_encoding_info, linear); } } // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h index 23277ae6f735a..964676adf24ce 100644 --- a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_cms.h @@ -14,7 +14,7 @@ namespace jxl { std::unique_ptr GetCmsStage( - const OutputEncodingInfo& output_encoding_info); + const OutputEncodingInfo& output_encoding_info, bool linear = true); } // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc index ad31d82be7c7d..63c4078c52336 100644 --- a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.cc @@ -8,6 +8,8 @@ #include "lib/jxl/base/sanitizers.h" #include "lib/jxl/base/status.h" +#include "lib/jxl/memory_manager_internal.h" + #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_upsampling.cc" #include @@ -20,28 +22,48 @@ namespace jxl { namespace HWY_NAMESPACE { // These templates are not found via ADL. +using hwy::HWY_NAMESPACE::Add; using hwy::HWY_NAMESPACE::Clamp; +using hwy::HWY_NAMESPACE::LoadU; using hwy::HWY_NAMESPACE::Max; using hwy::HWY_NAMESPACE::Min; +using hwy::HWY_NAMESPACE::Mul; using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Store; class UpsamplingStage : public RenderPipelineStage { public: - explicit UpsamplingStage(const CustomTransformData& ups_factors, size_t c, + explicit UpsamplingStage(JxlMemoryManager* memory_manager, + const CustomTransformData& ups_factors, size_t c, size_t shift) : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric( /*shift=*/shift, /*border=*/2)), - c_(c) { + c_(c), + memory_manager_(memory_manager) { const float* weights = shift == 1 ? ups_factors.upsampling2_weights : shift == 2 ? ups_factors.upsampling4_weights : ups_factors.upsampling8_weights; - size_t N = 1 << (shift - 1); - for (size_t i = 0; i < 5 * N; i++) { - for (size_t j = 0; j < 5 * N; j++) { - size_t y = std::min(i, j); - size_t x = std::max(i, j); - kernel_[j / 5][i / 5][j % 5][i % 5] = - weights[5 * N * y - y * (y - 1) / 2 + x - y]; + size_t N = 1 << shift; + size_t H = N / 2; + for (size_t ky = 0; ky < H; ++ky) { + for (size_t kx = 0; kx < H; ++kx) { + size_t offset0 = (ky * N + kx) * 25; + size_t offset1 = (ky * N + (N - 1 - kx)) * 25; + size_t offset2 = ((N - 1 - ky) * N + kx) * 25; + size_t offset3 = ((N - 1 - ky) * N + (N - 1 - kx)) * 25; + for (size_t py = 0; py < 5; ++py) { + for (size_t px = 0; px < 5; ++px) { + size_t j = 5 * ky + py; + size_t i = 5 * kx + px; + size_t my = std::min(i, j); + size_t mx = std::max(i, j); + float w = weights[5 * H * my - my * (my - 1) / 2 + mx - my]; + kernel_[offset0 + py * 5 + px] = w; + kernel_[offset1 + py * 5 + (4 - px)] = w; + kernel_[offset2 + (4 - py) * 5 + px] = w; + kernel_[offset3 + (4 - py) * 5 + (4 - px)] = w; + } + } } } } @@ -49,7 +71,7 @@ class UpsamplingStage : public RenderPipelineStage { Status ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows, size_t xextra, size_t xsize, size_t xpos, size_t ypos, size_t thread_id) const final { - static HWY_FULL(float) df; + constexpr HWY_FULL(float) df; size_t shift = settings_.shift_x; size_t N = 1 << shift; const size_t xsize_v = RoundUpTo(xsize, Lanes(df)); @@ -58,16 +80,19 @@ class UpsamplingStage : public RenderPipelineStage { sizeof(float) * (xsize_v - xsize)); } JXL_ENSURE(xextra == 0); - ssize_t x0 = 0; - ssize_t x1 = xsize; - if (N == 2) { - ProcessRowImpl<2>(input_rows, output_rows, x0, x1); - } - if (N == 4) { - ProcessRowImpl<4>(input_rows, output_rows, x0, x1); - } - if (N == 8) { - ProcessRowImpl<8>(input_rows, output_rows, x0, x1); + for (size_t x = 0; x < xsize; x += kChunkSize) { + size_t xend = std::min(x + kChunkSize, xsize); + size_t len = xend - x; + PreProcessRowImpl(input_rows, x, len, thread_id); + if (N == 2) { + ProcessRowImpl<2>(input_rows, output_rows, x, len, thread_id); + } + if (N == 4) { + ProcessRowImpl<4>(input_rows, output_rows, x, len, thread_id); + } + if (N == 8) { + ProcessRowImpl<8>(input_rows, output_rows, x, len, thread_id); + } } for (size_t oy = 0; oy < N; oy++) { float* dst_row = GetOutputRow(output_rows, c_, oy); @@ -85,30 +110,87 @@ class UpsamplingStage : public RenderPipelineStage { const char* GetName() const override { return "Upsample"; } private: - template - JXL_INLINE float Kernel(size_t x, size_t y, ssize_t ix, ssize_t iy) const { - static_assert(N == 2 || N == 4 || N == 8, "N must be 2, 4, or 8"); - ix += 2; - iy += 2; - if (N == 2) { - return kernel_[0][0][y % 2 ? 4 - iy : iy][x % 2 ? 4 - ix : ix]; + JXL_INLINE float Kernel(size_t k, size_t i) const { + return kernel_[k * 25 + i]; + } + + Status PrepareForThreads(size_t num_threads) override { + size_t alloc_size = sizeof(float) * (kChunkSize + 4); + for (size_t i = 0; i < 3; ++i) { + temp_[i].resize(num_threads); + for (size_t t = 0; t < num_threads; ++t) { + JXL_ASSIGN_OR_RETURN( + temp_[i][t], AlignedMemory::Create(memory_manager_, alloc_size)); + } } - if (N == 4) { - return kernel_[y % 4 < 2 ? y % 2 : 1 - y % 2] - [x % 4 < 2 ? x % 2 : 1 - x % 2][y % 4 < 2 ? iy : 4 - iy] - [x % 4 < 2 ? ix : 4 - ix]; + return true; + } + + void PreProcessRowImpl(const RowInfo& input_rows, size_t x0, size_t len, + size_t thread_id) const { + constexpr HWY_FULL(float) df; + float* JXL_RESTRICT col_min = temp_[0][thread_id].address(); + float* JXL_RESTRICT col_max = temp_[1][thread_id].address(); + + std::array rows = { + GetInputRow(input_rows, c_, -2) + x0 - 2, + GetInputRow(input_rows, c_, -1) + x0 - 2, + GetInputRow(input_rows, c_, 0) + x0 - 2, + GetInputRow(input_rows, c_, 1) + x0 - 2, + GetInputRow(input_rows, c_, 2) + x0 - 2}; + + for (size_t x = 0; x < len + 4; x += Lanes(df)) { + const auto v0 = LoadU(df, rows[0] + x); + const auto v1 = LoadU(df, rows[1] + x); + const auto min0 = Min(v0, v1); + const auto max0 = Max(v0, v1); + const auto v2 = LoadU(df, rows[2] + x); + const auto v3 = LoadU(df, rows[3] + x); + const auto min1 = Min(v2, v3); + const auto max1 = Max(v2, v3); + const auto v4 = LoadU(df, rows[4] + x); + const auto min2 = Min(min0, min1); + const auto max2 = Max(max0, max1); + const auto min = Min(v4, min2); + const auto max = Max(v4, max2); + Store(min, df, col_min + x); + Store(max, df, col_max + x); } - if (N == 8) { - return kernel_[y % 8 < 4 ? y % 4 : 3 - y % 4] - [x % 8 < 4 ? x % 4 : 3 - x % 4][y % 8 < 4 ? iy : 4 - iy] - [x % 8 < 4 ? ix : 4 - ix]; + + float* JXL_RESTRICT mins = temp_[2][thread_id].address(); + for (size_t x = 0; x < len; x += Lanes(df)) { + const auto v0 = LoadU(df, col_min + x); + const auto v1 = LoadU(df, col_min + x + 1); + const auto min0 = Min(v0, v1); + const auto v2 = LoadU(df, col_min + x + 2); + const auto v3 = LoadU(df, col_min + x + 3); + const auto min1 = Min(v2, v3); + const auto v4 = LoadU(df, col_min + x + 4); + const auto min2 = Min(min0, min1); + const auto min = Min(v4, min2); + Store(min, df, mins + x); + } + + // col_mins will be overwritten + float* JXL_RESTRICT maxs = temp_[0][thread_id].address(); + for (size_t x = 0; x < len; x += Lanes(df)) { + const auto v0 = LoadU(df, col_max + x); + const auto v1 = LoadU(df, col_max + x + 1); + const auto max0 = Max(v0, v1); + const auto v2 = LoadU(df, col_max + x + 2); + const auto v3 = LoadU(df, col_max + x + 3); + const auto max1 = Max(v2, v3); + const auto v4 = LoadU(df, col_max + x + 4); + const auto max2 = Max(max0, max1); + const auto max = Max(v4, max2); + Store(max, df, maxs + x); } } template void ProcessRowImpl(const RowInfo& input_rows, const RowInfo& output_rows, - ssize_t x0, ssize_t x1) const { - static HWY_FULL(float) df; + size_t x0, size_t len, size_t thread_id) const { + constexpr HWY_FULL(float) df; using V = hwy::HWY_NAMESPACE::Vec; V ups0, ups1, ups2, ups3, ups4, ups5, ups6, ups7; // NOLINT (void)ups2, (void)ups3, (void)ups4, (void)ups5, (void)ups6, (void)ups7; @@ -131,23 +213,33 @@ class UpsamplingStage : public RenderPipelineStage { ups[7] = &ups7; } - for (size_t oy = 0; oy < N; oy++) { - float* dst_row = GetOutputRow(output_rows, c_, oy); - for (ssize_t x = x0; x < x1; x += Lanes(df)) { + float* JXL_RESTRICT mins = temp_[2][thread_id].address(); + float* JXL_RESTRICT maxs = temp_[0][thread_id].address(); + std::array input; + for (ssize_t iy = -2; iy <= 2; ++iy) { + for (ssize_t ix = -2; ix <= 2; ++ix) { + input[5 * (iy + 2) + (ix + 2)] = GetInputRow(input_rows, c_, iy) + ix; + } + } + + for (size_t x = 0; x < len; x += Lanes(df)) { + for (size_t oy = 0; oy < N; oy++) { + float* dst_row = GetOutputRow(output_rows, c_, oy); for (size_t ox = 0; ox < N; ox++) { - auto result = Zero(df); - auto min = LoadU(df, GetInputRow(input_rows, c_, 0) + x); - auto max = min; - for (ssize_t iy = -2; iy <= 2; iy++) { - for (ssize_t ix = -2; ix <= 2; ix++) { - auto v = LoadU(df, GetInputRow(input_rows, c_, iy) + x + ix); - result = MulAdd(Set(df, Kernel(ox, oy, ix, iy)), v, result); - min = Min(v, min); - max = Max(v, max); - } + size_t k = N * oy + ox; + auto acc0 = Mul(LoadU(df, input[0]), Set(df, Kernel(k, 0))); + auto acc1 = Mul(LoadU(df, input[1]), Set(df, Kernel(k, 1))); + auto acc2 = Mul(LoadU(df, input[2]), Set(df, Kernel(k, 2))); + for (size_t i = 3; i < 24; i += 3) { + acc0 = MulAdd(LoadU(df, input[i]), Set(df, Kernel(k, i)), acc0); + acc1 = MulAdd(LoadU(df, input[i + 1]), Set(df, Kernel(k, i + 1)), + acc1); + acc2 = MulAdd(LoadU(df, input[i + 2]), Set(df, Kernel(k, i + 2)), + acc2); } - // Avoid overshooting. - *ups[ox] = Clamp(result, min, max); + acc0 = MulAdd(LoadU(df, input[24]), Set(df, Kernel(k, 24)), acc0); + auto result = Add(Add(acc1, acc2), acc0); + *ups[ox] = Clamp(result, Load(df, mins + x), Load(df, maxs + x)); } if (N == 2) { StoreInterleaved(df, ups0, ups1, dst_row + x * N); @@ -160,16 +252,23 @@ class UpsamplingStage : public RenderPipelineStage { dst_row + x * N); } } + for (size_t i = 0; i < 25; ++i) input[i] += Lanes(df); } } + // Process row in chunks to keep per-thread buffers compact. + static const size_t kChunkSize = 1024; + std::array, 3> temp_; size_t c_; - float kernel_[4][4][5][5]; + float kernel_[64 * 25]; + JxlMemoryManager* memory_manager_; }; std::unique_ptr GetUpsamplingStage( - const CustomTransformData& ups_factors, size_t c, size_t shift) { - return jxl::make_unique(ups_factors, c, shift); + JxlMemoryManager* memory_manager, const CustomTransformData& ups_factors, + size_t c, size_t shift) { + return jxl::make_unique(memory_manager, ups_factors, c, + shift); } // NOLINTNEXTLINE(google-readability-namespace-comments) @@ -183,12 +282,14 @@ namespace jxl { HWY_EXPORT(GetUpsamplingStage); std::unique_ptr GetUpsamplingStage( - const CustomTransformData& ups_factors, size_t c, size_t shift) { + JxlMemoryManager* memory_manager, const CustomTransformData& ups_factors, + size_t c, size_t shift) { if ((shift < 1) || (shift > 3)) { JXL_DEBUG_ABORT("internal: (shift != 0) && (shift <= 3)"); return nullptr; } - return HWY_DYNAMIC_DISPATCH(GetUpsamplingStage)(ups_factors, c, shift); + return HWY_DYNAMIC_DISPATCH(GetUpsamplingStage)(memory_manager, ups_factors, + c, shift); } } // namespace jxl diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h index 7d5defd23cfc6..cad07e81efd95 100644 --- a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_upsampling.h @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -20,7 +21,8 @@ namespace jxl { // Upsamples the given channel by the given factor. std::unique_ptr GetUpsamplingStage( - const CustomTransformData& ups_factors, size_t c, size_t shift); + JxlMemoryManager* memory_manager, const CustomTransformData& ups_factors, + size_t c, size_t shift); } // namespace jxl #endif // LIB_JXL_RENDER_PIPELINE_STAGE_UPSAMPLING_H_ diff --git a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc index 533ff5bd48294..7737c1c7500b7 100644 --- a/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc +++ b/third_party/jpeg-xl/lib/jxl/render_pipeline/stage_write.cc @@ -43,51 +43,142 @@ using hwy::HWY_NAMESPACE::ShiftLeftSame; using hwy::HWY_NAMESPACE::ShiftRightSame; using hwy::HWY_NAMESPACE::VFromD; -// 8x8 ordered dithering pattern from -// https://en.wikipedia.org/wiki/Ordered_dithering -// scaled to have an average of 0 and be fully contained in (-0.5, 0.5). -// Matrix is duplicated in width to avoid inconsistencies or out-of-bound-reads -// if doing unaligned operations. -const float kDither[(2 * 8) * 8] = { - -0.4921875, 0.0078125, -0.3671875, 0.1328125, // - -0.4609375, 0.0390625, -0.3359375, 0.1640625, // - -0.4921875, 0.0078125, -0.3671875, 0.1328125, // - -0.4609375, 0.0390625, -0.3359375, 0.1640625, // - // - 0.2578125, -0.2421875, 0.3828125, -0.1171875, // - 0.2890625, -0.2109375, 0.4140625, -0.0859375, // - 0.2578125, -0.2421875, 0.3828125, -0.1171875, // - 0.2890625, -0.2109375, 0.4140625, -0.0859375, // - // - -0.3046875, 0.1953125, -0.4296875, 0.0703125, // - -0.2734375, 0.2265625, -0.3984375, 0.1015625, // - -0.3046875, 0.1953125, -0.4296875, 0.0703125, // - -0.2734375, 0.2265625, -0.3984375, 0.1015625, // - // - 0.4453125, -0.0546875, 0.3203125, -0.1796875, // - 0.4765625, -0.0234375, 0.3515625, -0.1484375, // - 0.4453125, -0.0546875, 0.3203125, -0.1796875, // - 0.4765625, -0.0234375, 0.3515625, -0.1484375, // - // - -0.4453125, 0.0546875, -0.3203125, 0.1796875, // - -0.4765625, 0.0234375, -0.3515625, 0.1484375, // - -0.4453125, 0.0546875, -0.3203125, 0.1796875, // - -0.4765625, 0.0234375, -0.3515625, 0.1484375, // - // - 0.3046875, -0.1953125, 0.4296875, -0.0703125, // - 0.2734375, -0.2265625, 0.3984375, -0.1015625, // - 0.3046875, -0.1953125, 0.4296875, -0.0703125, // - 0.2734375, -0.2265625, 0.3984375, -0.1015625, // - // - -0.2578125, 0.2421875, -0.3828125, 0.1171875, // - -0.2890625, 0.2109375, -0.4140625, 0.0859375, // - -0.2578125, 0.2421875, -0.3828125, 0.1171875, // - -0.2890625, 0.2109375, -0.4140625, 0.0859375, // - // - 0.4921875, -0.0078125, 0.3671875, -0.1328125, // - 0.4609375, -0.0390625, 0.3359375, -0.1640625, // - 0.4921875, -0.0078125, 0.3671875, -0.1328125, // - 0.4609375, -0.0390625, 0.3359375, -0.1640625, // +// 32x32 blue noise dithering pattern from +// https://momentsingraphics.de/BlueNoise.html#Downloads scaled to have +// an average of 0 and be fully contained in (0.49219 to -0.49219). +// In SIMD codepath we could load up to 128 bits, so need 3 extra (32-bit) +// elements for zero-cost wrapping. +const float kDither[1024 + 3] = { + -0.26057, 0.32619, 0.21039, -0.03281, -0.10616, 0.16792, 0.43042, -0.48061, + -0.00965, -0.31075, 0.24899, -0.35322, -0.02509, -0.25285, 0.02895, 0.10230, + -0.28373, -0.00193, 0.23355, 0.43428, -0.23741, 0.18336, -0.31847, -0.11002, + -0.36094, 0.26057, -0.19108, -0.29531, 0.40726, -0.09458, 0.11002, -0.48833, + 0.16020, -0.35708, -0.18336, 0.36094, -0.28373, -0.34550, -0.20267, 0.07914, + 0.35708, -0.41498, 0.47675, -0.21811, -0.12546, 0.44200, -0.41884, -0.17178, + 0.39954, 0.33778, -0.33778, 0.04053, -0.46517, 0.27215, -0.16792, 0.39182, + 0.20653, -0.43814, -0.02895, 0.17950, -0.41498, 0.01737, 0.24899, 0.49219, + -0.00965, 0.08300, 0.41112, -0.46903, 0.04053, 0.47289, 0.26057, -0.05983, + -0.13704, 0.14862, 0.03281, 0.29531, -0.45744, 0.22583, 0.14862, -0.09072, + -0.37638, 0.19881, -0.14476, 0.14476, -0.09072, 0.48447, -0.39954, 0.06369, + -0.05983, -0.26829, 0.43428, -0.12546, 0.28759, -0.22969, -0.32619, -0.15248, + -0.42270, 0.23741, -0.23355, -0.11774, 0.18722, 0.11388, -0.43814, -0.24899, + 0.41884, 0.21039, -0.28373, -0.06756, 0.07914, 0.36480, -0.31075, 0.30303, + -0.03281, 0.07142, -0.42656, 0.38024, -0.27987, 0.00579, 0.12546, -0.22197, + 0.29917, 0.36866, 0.13704, -0.47289, 0.09072, 0.35708, -0.04825, 0.38796, + -0.28759, -0.07142, 0.44200, 0.27601, -0.38024, -0.16020, -0.01737, 0.30303, + -0.33006, -0.40340, -0.16792, 0.40726, -0.36480, -0.00579, -0.19108, 0.41498, + -0.26443, 0.46903, -0.21811, 0.28759, -0.04053, 0.22197, 0.34550, -0.44972, + -0.14476, -0.34164, 0.04053, -0.19494, 0.45358, -0.37252, 0.21425, 0.05597, + 0.31075, 0.14090, -0.33778, 0.00579, 0.34550, -0.29917, 0.38796, 0.13704, + 0.05983, -0.10230, 0.34164, 0.10616, -0.23741, 0.19494, -0.47675, 0.04439, + -0.39568, 0.24127, 0.10616, -0.49219, -0.17950, -0.36094, -0.30303, 0.45744, + -0.01351, 0.24513, -0.39182, -0.07528, 0.18722, -0.26057, -0.11002, -0.45358, + 0.46903, -0.17178, -0.41112, 0.07528, -0.09458, 0.21811, -0.20267, -0.48833, + 0.44972, 0.00965, 0.24127, -0.42656, 0.48447, -0.11774, 0.26443, 0.14090, + -0.15634, -0.07142, -0.32233, 0.36094, 0.42270, 0.19108, 0.07142, -0.11002, + 0.15634, 0.38024, -0.28759, 0.27987, -0.00193, 0.33006, 0.11388, -0.21039, + 0.02123, 0.17950, 0.38024, -0.24127, -0.44586, 0.48833, -0.03667, 0.26829, + -0.36866, -0.22583, 0.17178, -0.30689, 0.29145, -0.04825, -0.35322, 0.43042, + 0.34936, 0.00193, 0.16792, -0.12932, 0.03667, -0.06756, 0.31847, -0.40726, + -0.24513, 0.09458, -0.17564, 0.47675, -0.43042, -0.32233, 0.40340, 0.26057, + -0.47675, -0.12160, -0.04825, 0.28759, 0.10230, 0.15634, -0.14862, -0.27601, + 0.36094, -0.12932, -0.05983, -0.45358, -0.17950, 0.01737, 0.09458, -0.29145, + -0.22969, -0.43428, 0.45744, -0.38796, -0.27601, -0.21039, -0.46131, 0.22969, + 0.41112, -0.05211, -0.48061, 0.16406, 0.05211, -0.14862, -0.03281, -0.36866, + -0.27215, 0.34164, -0.31075, 0.42656, -0.38410, -0.32619, 0.02895, 0.19881, + 0.08300, 0.42270, 0.31461, 0.13318, 0.45744, 0.37638, -0.40726, 0.31847, + -0.08686, 0.21425, 0.29917, 0.07914, 0.26829, 0.13704, 0.48447, -0.15248, + 0.02509, -0.34936, 0.34936, -0.10230, 0.42656, -0.23741, 0.22583, 0.09072, + 0.44972, 0.20267, 0.04825, -0.21425, 0.24513, -0.07142, 0.39954, -0.46131, + -0.39568, -0.01351, -0.33392, 0.05597, -0.26443, 0.22197, -0.20653, 0.15248, + 0.04439, -0.46517, -0.16406, -0.04439, -0.34936, 0.37252, -0.01351, -0.30689, + 0.29917, 0.20653, -0.26829, 0.26443, 0.13318, -0.39954, 0.30303, -0.08686, + -0.42656, 0.12932, -0.14476, -0.46903, -0.00579, 0.34936, -0.18722, 0.28373, + -0.23741, 0.22969, -0.16020, -0.38024, -0.08300, -0.48447, -0.02123, -0.14862, + 0.48061, -0.31847, 0.39568, -0.24899, 0.18722, -0.41884, 0.10230, -0.08300, + -0.38796, 0.06369, -0.19881, -0.44972, 0.00579, -0.33392, 0.37252, -0.19108, + -0.02509, -0.35708, 0.32619, 0.46517, 0.17178, -0.28373, 0.10616, 0.47675, + -0.09458, 0.15248, 0.43428, 0.35322, 0.17564, 0.27215, 0.41112, -0.36480, + 0.24899, 0.11774, 0.01351, 0.33006, -0.11388, -0.18336, 0.41884, -0.23355, + 0.16406, 0.46131, 0.38410, -0.04825, -0.15634, 0.49219, 0.17564, 0.03667, + 0.40726, 0.23355, -0.25285, -0.08300, -0.41112, -0.12160, -0.35708, 0.05211, + -0.41884, -0.29531, 0.02123, -0.21425, 0.09844, -0.30689, -0.11388, 0.34550, + -0.26443, -0.07142, -0.39954, 0.44586, 0.05983, -0.48833, 0.24127, 0.34936, + -0.44200, -0.12546, 0.12160, -0.30303, 0.27215, 0.07528, -0.48447, -0.29145, + 0.28373, -0.17564, 0.09458, 0.02123, 0.30689, 0.41884, 0.20653, -0.03667, + 0.32233, 0.25671, -0.45744, -0.05597, 0.46517, -0.41498, 0.00965, 0.07142, + -0.44586, 0.16406, -0.20653, 0.21811, -0.29917, 0.28759, -0.05597, 0.03281, + -0.32619, -0.00965, 0.31847, -0.37252, 0.18722, -0.11002, -0.22969, -0.06369, + -0.39568, 0.36866, -0.45744, -0.31847, 0.14476, -0.22583, -0.49219, 0.37638, + -0.19494, -0.13318, 0.39182, -0.35322, 0.29531, -0.24127, 0.21039, -0.18722, + 0.45358, 0.31461, -0.13318, -0.01737, -0.36094, 0.12932, -0.25671, 0.43814, + -0.16792, 0.23355, -0.22197, 0.44972, -0.42270, 0.33392, 0.42656, 0.11774, + -0.13318, 0.19494, -0.03667, 0.44972, 0.24513, -0.15248, 0.08300, -0.33006, + 0.00579, 0.12546, 0.19494, 0.05983, -0.15634, 0.14476, 0.36480, -0.04053, + -0.33006, 0.25671, -0.46903, 0.37252, 0.48833, -0.09458, -0.41112, 0.19108, + 0.08686, -0.46903, -0.07528, 0.04053, -0.26829, -0.02895, 0.22197, -0.34164, + 0.47289, -0.21811, 0.06756, -0.38410, -0.27987, -0.06369, 0.27987, 0.43814, + -0.25671, -0.39182, 0.49219, -0.27601, -0.07914, -0.48061, 0.42656, -0.38410, + 0.11002, 0.03667, -0.27215, 0.15634, 0.07528, -0.22197, 0.33006, 0.38410, + -0.34936, 0.27987, 0.15248, 0.40340, 0.09844, -0.16406, -0.46131, 0.03281, + -0.29531, 0.31461, -0.10616, 0.39954, 0.01351, 0.33778, -0.43814, 0.17178, + -0.08686, 0.23741, -0.44586, 0.33778, -0.00193, -0.31461, 0.23741, -0.12932, + -0.22583, -0.06756, 0.40340, -0.16792, -0.43428, 0.01351, -0.14476, -0.04053, + -0.29145, 0.46517, -0.13704, -0.39182, -0.32233, 0.29531, 0.38410, 0.16020, + -0.44200, 0.26443, 0.12546, -0.42270, 0.21425, -0.19881, -0.35708, 0.04825, + 0.36480, -0.02895, -0.21425, 0.09072, 0.41498, 0.18336, 0.04439, 0.29917, + 0.47675, -0.40340, 0.27601, -0.31461, 0.31075, 0.17564, 0.24899, -0.45744, + 0.05597, -0.19494, 0.00193, 0.36094, 0.24127, -0.09844, -0.24513, -0.00965, + -0.17564, -0.05597, -0.34550, -0.24899, 0.48061, 0.15248, -0.11388, 0.45358, + -0.16406, -0.32233, 0.31461, -0.11774, -0.36866, -0.18722, -0.25671, -0.44200, + 0.13318, -0.02123, 0.19881, -0.10616, 0.43042, -0.36866, -0.24899, 0.41112, + 0.11002, 0.21425, -0.25671, -0.47675, -0.04439, 0.13704, -0.37252, 0.43814, + 0.19108, 0.03667, 0.35708, -0.14090, 0.08300, -0.02123, -0.30303, -0.48061, + 0.11774, 0.20267, -0.43042, 0.25285, 0.14090, -0.04439, 0.38796, 0.34550, + -0.34164, -0.19494, 0.05983, -0.48447, 0.09844, -0.00579, -0.07914, 0.33778, + -0.41498, -0.10230, 0.30689, 0.17178, 0.48833, -0.20267, 0.07914, 0.33392, + -0.48833, -0.30689, 0.41498, 0.22969, -0.44586, 0.32233, 0.25285, 0.39182, + -0.23355, 0.01737, 0.42270, -0.27987, 0.46903, -0.47289, 0.02123, -0.09072, + 0.21811, 0.44586, -0.25285, 0.36480, -0.29145, 0.47289, -0.18722, 0.14476, + -0.31461, 0.43814, -0.36094, 0.04439, -0.29917, -0.41884, 0.25285, -0.11774, + 0.46131, 0.11388, -0.21039, -0.07528, -0.38024, -0.26057, 0.06369, -0.05983, + 0.29145, -0.40340, -0.09072, 0.06756, -0.16020, 0.27601, -0.31075, 0.10616, + -0.14090, -0.43042, 0.25671, -0.05211, -0.13318, 0.23355, -0.44972, 0.02895, + 0.26829, -0.02895, -0.17950, 0.37252, -0.13704, 0.40726, 0.01351, -0.26443, + -0.03281, -0.40340, 0.27987, 0.17564, 0.02509, 0.44200, -0.15248, -0.34550, + 0.14862, -0.19881, -0.01351, 0.36866, -0.38796, 0.19494, -0.22197, 0.32619, + -0.37638, 0.00193, 0.30689, 0.12160, -0.39182, 0.16792, -0.34550, 0.39954, + -0.23355, 0.09072, -0.43428, 0.22969, -0.06369, 0.12546, -0.35322, 0.30689, + -0.09844, 0.06756, 0.38410, -0.33392, -0.18336, 0.35322, 0.21039, -0.42270, + 0.48833, 0.33006, 0.21811, -0.33392, 0.12932, -0.05211, 0.39568, 0.04825, + 0.48061, 0.17950, -0.31847, -0.21811, 0.38024, 0.05211, 0.32233, -0.06756, + -0.12546, 0.46131, 0.16020, -0.25285, 0.29531, -0.44972, 0.17950, -0.16406, + 0.22583, -0.46131, -0.27601, -0.00579, 0.12932, -0.47289, -0.09844, 0.10230, + -0.28759, -0.12160, -0.49219, -0.24127, 0.44586, -0.11388, -0.45358, -0.27215, + -0.17178, -0.07528, -0.47675, 0.43042, -0.02509, -0.27215, -0.19108, 0.19881, + -0.49219, -0.37252, 0.33392, -0.00193, -0.33006, -0.20267, 0.48061, 0.34164, + -0.22969, 0.42270, -0.12160, 0.31075, 0.46903, -0.22583, 0.27215, -0.02509, + 0.03281, 0.40340, 0.25671, 0.08686, 0.00965, 0.29145, -0.41112, 0.14090, + 0.24513, 0.34164, 0.08686, -0.14862, 0.27601, -0.42656, 0.48447, 0.09844, + 0.26443, -0.27987, 0.05597, -0.10230, 0.43428, 0.08686, 0.02895, -0.38024, + 0.15634, 0.09458, -0.36480, 0.18336, -0.05211, -0.40726, 0.36866, -0.33778, + -0.19881, 0.16020, -0.37638, -0.16020, -0.29917, 0.20267, 0.41884, -0.01737, + -0.34936, -0.24127, 0.02509, 0.20653, -0.36480, -0.08686, 0.01737, -0.33778, + 0.41498, -0.03667, 0.37638, -0.17178, -0.47289, 0.26829, -0.28759, -0.05597, + 0.35708, 0.00193, 0.25285, -0.15634, -0.30303, 0.06369, 0.22197, 0.45358, + -0.43814, 0.30303, -0.04053, 0.46517, 0.35322, -0.21039, 0.06756, -0.14090, + 0.37638, -0.43042, 0.45744, -0.29531, 0.39568, 0.14862, 0.23741, -0.13704, + -0.21425, 0.16406, -0.40726, 0.22583, 0.13318, 0.38796, -0.12932, -0.43428, + -0.31461, -0.20653, 0.46131, -0.45358, 0.39568, -0.24513, -0.14090, 0.11002, + -0.08300, -0.26829, 0.05211, -0.46517, -0.09844, -0.39568, -0.32619, -0.06369, + 0.16792, 0.28373, 0.11388, -0.04439, -0.18336, -0.44200, 0.35322, -0.26057, + -0.46517, 0.31075, -0.07914, -0.34164, -0.24513, -0.02123, 0.19108, 0.44200, + 0.04825, -0.07914, -0.39954, 0.12160, 0.29145, 0.00965, -0.37638, 0.32233, + 0.20267, -0.17564, 0.39182, 0.12160, 0.18336, 0.32619, 0.26057, 0.49219, + -0.48447, -0.20653, -0.10616, -0.38796, 0.31847, 0.07528, -0.01737, 0.44586, + 0.11774, 0.02509, 0.47289, 0.07142, 0.33392, -0.38410, -0.17950, 0.28373, + // Wrapped values + -0.26057, 0.32619, 0.21039 }; using DF = HWY_FULL(float); @@ -102,7 +193,7 @@ VFromD> MakeUnsigned(VFromD v, size_t x0, size_t y0, v = Mul(v, mul); // TODO(veluca): if constexpr with C++17 if (sizeof(T) == 1) { - size_t pos = (y0 % 8) * (2 * 8) + (x0 % 8); + size_t pos = (y0 % 32) * 32 + (x0 % 32); #if HWY_TARGET != HWY_SCALAR auto dither = LoadDup128(DF(), kDither + pos); #else @@ -133,7 +224,7 @@ class WriteToOutputStage : public RenderPipelineStage { flip_x_(ShouldFlipX(undo_orientation)), flip_y_(ShouldFlipY(undo_orientation)), transpose_(ShouldTranspose(undo_orientation)), - opaque_alpha_(kMaxPixelsPerCall, 1.0f), + opaque_alpha_(kChunkSize, 1.0f), memory_manager_(memory_manager) { for (size_t ec = 0; ec < extra_output.size(); ++ec) { if (extra_output[ec].callback.IsPresent() || extra_output[ec].buffer) { @@ -171,9 +262,9 @@ class WriteToOutputStage : public RenderPipelineStage { ypos = height_ - 1u - ypos; } size_t limit = std::min(xsize, width_ - xpos); - for (size_t x0 = 0; x0 < limit; x0 += kMaxPixelsPerCall) { + for (size_t x0 = 0; x0 < limit; x0 += kChunkSize) { size_t xstart = xpos + x0; - size_t len = std::min(kMaxPixelsPerCall, limit - x0); + size_t len = std::min(kChunkSize, limit - x0); const float* line_buffers[4]; for (size_t c = 0; c < num_color_; c++) { @@ -226,7 +317,7 @@ class WriteToOutputStage : public RenderPipelineStage { Status PrepareForThreads(size_t num_threads) { if (pixel_callback_.IsPresent()) { run_opaque_ = - pixel_callback_.Init(num_threads, /*num_pixels=*/kMaxPixelsPerCall); + pixel_callback_.Init(num_threads, /*num_pixels=*/kChunkSize); JXL_RETURN_IF_ERROR(run_opaque_ != nullptr); } else { JXL_RETURN_IF_ERROR(buffer_ != nullptr); @@ -252,16 +343,15 @@ class WriteToOutputStage : public RenderPipelineStage { JXL_RETURN_IF_ERROR(extra.PrepareForThreads(num_threads)); } temp_out_.resize(num_threads); + size_t alloc_size = sizeof(float) * kChunkSize; for (AlignedMemory& temp : temp_out_) { - size_t alloc_size = - sizeof(float) * kMaxPixelsPerCall * main_.num_channels_; - JXL_ASSIGN_OR_RETURN(temp, - AlignedMemory::Create(memory_manager_, alloc_size)); + JXL_ASSIGN_OR_RETURN( + temp, AlignedMemory::Create(memory_manager_, + alloc_size * main_.num_channels_)); } if ((has_alpha_ && want_alpha_ && unpremul_alpha_) || flip_x_) { temp_in_.resize(num_threads * main_.num_channels_); for (AlignedMemory& temp : temp_in_) { - size_t alloc_size = sizeof(float) * kMaxPixelsPerCall; JXL_ASSIGN_OR_RETURN( temp, AlignedMemory::Create(memory_manager_, alloc_size)); } @@ -522,7 +612,8 @@ class WriteToOutputStage : public RenderPipelineStage { } } - static constexpr size_t kMaxPixelsPerCall = 1024; + // Process row in chunks to keep per-thread buffers compact. + static const size_t kChunkSize = 1024; size_t width_; size_t height_; Output main_; // color + alpha @@ -541,10 +632,6 @@ class WriteToOutputStage : public RenderPipelineStage { std::vector temp_out_; }; -#if JXL_CXX_LANG < JXL_CXX_17 -constexpr size_t WriteToOutputStage::kMaxPixelsPerCall; -#endif - std::unique_ptr GetWriteToOutputStage( const ImageOutput& main_output, size_t width, size_t height, bool has_alpha, bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,