
Commit 1021dbc

Merge remote-tracking branch 'origin/main' into codex-tmp/update-branches/5159-1778674748/48f3124b5b84849f06a8827ac7636044580ecbbb

2 parents 9f608b9 + 8f46d8b, commit 1021dbc

5 files changed: 341 additions & 28 deletions

invokeai/app/invocations/qwen_image_denoise.py

Lines changed: 89 additions & 20 deletions
@@ -1,5 +1,6 @@
+import math
 from contextlib import ExitStack
-from typing import Callable, Iterator, Optional, Tuple
+from typing import Callable, ClassVar, Iterator, Optional, Tuple
 
 import torch
 import torchvision.transforms as tv_transforms
@@ -176,6 +177,72 @@ def _unpack_latents(latents: torch.Tensor, height: int, width: int) -> torch.Tensor:
         latents = latents.reshape(batch_size, channels // 4, h, w)
         return latents
 
+    @staticmethod
+    def _align_ref_latent_dims(rh: int, rw: int) -> tuple[int, int]:
+        """Trim reference latent spatial dims to even values for 2x2 packing.
+
+        Raises ValueError if the aligned dims would be < 2 (i.e., the reference
+        latent is too small to produce any valid tokens).
+        """
+        rh_aligned = rh - (rh % 2)
+        rw_aligned = rw - (rw % 2)
+        if rh_aligned < 2 or rw_aligned < 2:
+            raise ValueError(
+                f"Reference latent spatial dims must be >= 2 after even alignment; "
+                f"got ({rh_aligned}, {rw_aligned}) from input shape ({rh}, {rw}). "
+                "Ensure the reference image is at least 16 pixels in each dimension."
+            )
+        return rh_aligned, rw_aligned
+
+    @staticmethod
+    def _build_img_shapes(
+        latent_height: int,
+        latent_width: int,
+        ref_latent_height: int | None = None,
+        ref_latent_width: int | None = None,
+    ) -> list[list[tuple[int, int, int]]]:
+        """Build the img_shapes argument for the transformer.
+
+        The reference segment (if present) must use its own dims so QwenEmbedRope's
+        spatial frequencies position ref tokens distinctly from noisy tokens —
+        otherwise reference content bleeds into the generation as a ghost.
+        """
+        shapes: list[tuple[int, int, int]] = [(1, latent_height // 2, latent_width // 2)]
+        if ref_latent_height is not None and ref_latent_width is not None:
+            shapes.append((1, ref_latent_height // 2, ref_latent_width // 2))
+        return [shapes]
+
+    # diffusers' QwenImageEdit(Plus)Pipeline VAE_IMAGE_SIZE = 1024 * 1024 pixels;
+    # ref images are resized to this area (preserving aspect, snapped to multiples
+    # of 32) before VAE encoding. We mirror this clamp in latent space so direct
+    # backend callers — whose i2l may not pass explicit width/height — don't feed
+    # the transformer an out-of-distribution reference sequence length (which
+    # also causes a VRAM spike for large inputs).
+    _REF_TARGET_PIXEL_AREA: ClassVar[int] = 1024 * 1024
+    _VAE_SCALE_FACTOR: ClassVar[int] = 8
+
+    @classmethod
+    def _maybe_clamp_ref_latent_size(cls, ref_latents: torch.Tensor) -> torch.Tensor:
+        """Bilinear-downscale the reference latent if it exceeds diffusers'
+        VAE_IMAGE_SIZE budget.
+
+        Returns the latent unchanged if it's already within budget.
+        """
+        _, _, rh, rw = ref_latents.shape
+        target_cells = cls._REF_TARGET_PIXEL_AREA // (cls._VAE_SCALE_FACTOR**2)
+        if rh * rw <= target_cells:
+            return ref_latents
+        aspect = rw / rh
+        target_w_px = math.sqrt(cls._REF_TARGET_PIXEL_AREA * aspect)
+        target_h_px = target_w_px / aspect
+        target_w_px = max(32, round(target_w_px / 32) * 32)
+        target_h_px = max(32, round(target_h_px / 32) * 32)
+        target_rh = target_h_px // cls._VAE_SCALE_FACTOR
+        target_rw = target_w_px // cls._VAE_SCALE_FACTOR
+        return torch.nn.functional.interpolate(
+            ref_latents, size=(target_rh, target_rw), mode="bilinear", antialias=False
+        )
+
     def _run_diffusion(self, context: InvocationContext):
         inference_dtype = torch.bfloat16
         device = TorchDevice.choose_torch_device()
@@ -332,35 +399,37 @@ def _run_diffusion(self, context: InvocationContext):
         use_ref_latents = has_zero_cond_t
 
         ref_latents_packed = None
+        ref_latent_height = latent_height
+        ref_latent_width = latent_width
         if use_ref_latents:
             if ref_latents is not None:
-                _, ref_ch, rh, rw = ref_latents.shape
-                if rh != latent_height or rw != latent_width:
-                    ref_latents = torch.nn.functional.interpolate(
-                        ref_latents, size=(latent_height, latent_width), mode="bilinear"
-                    )
+                # Defense-in-depth: backend callers (direct API, older graph JSON)
+                # may wire qwen_image_i2l without explicit width/height, producing
+                # a native-resolution reference latent. Clamp here so the
+                # transformer always sees an in-distribution sequence length.
+                ref_latents = self._maybe_clamp_ref_latent_size(ref_latents)
+                _, _, rh, rw = ref_latents.shape
+                ref_latent_height, ref_latent_width = self._align_ref_latent_dims(rh, rw)
+                if ref_latent_height != rh or ref_latent_width != rw:
+                    ref_latents = ref_latents[..., :ref_latent_height, :ref_latent_width]
             else:
                 # No reference image provided — use zeros so the model still gets the
                 # expected sequence layout.
                 ref_latents = torch.zeros(
                     1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype
                 )
-            ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width)
-
-        # img_shapes tells the transformer the spatial layout of patches.
+            ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, ref_latent_height, ref_latent_width)
+
+        # img_shapes tells the transformer the spatial layout of patches. The reference
+        # segment must use the reference latent's own dimensions so RoPE positions it
+        # distinctly from the noisy latent — otherwise the two segments share spatial
+        # positional encoding and the model can't disentangle them, producing a
+        # ghost/doubling artifact across the whole frame. Matches diffusers'
+        # QwenImageEditPipeline / QwenImageEditPlusPipeline.
         if use_ref_latents:
-            img_shapes = [
-                [
-                    (1, latent_height // 2, latent_width // 2),
-                    (1, latent_height // 2, latent_width // 2),
-                ]
-            ]
+            img_shapes = self._build_img_shapes(latent_height, latent_width, ref_latent_height, ref_latent_width)
         else:
-            img_shapes = [
-                [
-                    (1, latent_height // 2, latent_width // 2),
-                ]
-            ]
+            img_shapes = self._build_img_shapes(latent_height, latent_width)
 
         # Prepare inpaint extension (operates in 4D space, so unpack/repack around it)
         inpaint_mask = self._prep_inpaint_mask(context, noise)  # noise has the right 4D shape

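For a concrete sense of the clamp arithmetic above: the latent-space budget is 1024*1024 px / 8^2 = 16384 cells, and with the clamped 112x148 reference latent from the example below plus a 1024x1024 output (128x128 latent), _build_img_shapes would produce [[(1, 64, 64), (1, 56, 74)]], giving the reference segment its own RoPE grid. What follows is a minimal standalone sketch of the clamp-then-align flow the new helpers implement, not part of the commit; the constants mirror _REF_TARGET_PIXEL_AREA and _VAE_SCALE_FACTOR, and the function name clamp_and_align and the 16-channel example latent are illustrative only.

# Standalone sketch (not part of the diff) of the clamp-then-align flow above.
import math

import torch

REF_TARGET_PIXEL_AREA = 1024 * 1024
VAE_SCALE_FACTOR = 8


def clamp_and_align(ref_latents: torch.Tensor) -> torch.Tensor:
    _, _, rh, rw = ref_latents.shape
    # Budget in latent cells: 1024*1024 px / 8^2 = 16384 cells.
    target_cells = REF_TARGET_PIXEL_AREA // (VAE_SCALE_FACTOR**2)
    if rh * rw > target_cells:
        aspect = rw / rh
        raw_w_px = math.sqrt(REF_TARGET_PIXEL_AREA * aspect)
        target_w_px = max(32, round(raw_w_px / 32) * 32)
        target_h_px = max(32, round((raw_w_px / aspect) / 32) * 32)
        ref_latents = torch.nn.functional.interpolate(
            ref_latents,
            size=(target_h_px // VAE_SCALE_FACTOR, target_w_px // VAE_SCALE_FACTOR),
            mode="bilinear",
            antialias=False,
        )
        _, _, rh, rw = ref_latents.shape
    # Trim to even dims so 2x2 patch packing yields whole tokens.
    return ref_latents[..., : rh - (rh % 2), : rw - (rw % 2)]


# A 2048x1536 px reference encodes to a 192x256 latent (49152 cells > 16384 budget),
# so it is downscaled to 112x148 (the latent of an 896x1184 px image) before packing.
example = torch.zeros(1, 16, 192, 256)
print(clamp_and_align(example).shape)  # torch.Size([1, 16, 112, 148])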
invokeai/app/invocations/qwen_image_image_to_latents.py

Lines changed: 3 additions & 1 deletion
@@ -83,7 +83,9 @@ def invoke(self, context: InvocationContext) -> LatentsOutput:
         if self.width is not None and self.height is not None:
             image = image.convert("RGB").resize((self.width, self.height), resample=PILImage.LANCZOS)
 
-        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
+        # multiple_of=16 ensures the post-VAE latents (vae_scale_factor=8) have even
+        # spatial dims, which the transformer's 2x2 patch packing requires.
+        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"), multiple_of=16)
         if image_tensor.dim() == 3:
             image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")

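The arithmetic behind the new multiple_of=16 comment: with the VAE's 8x spatial downscale, any pixel dimension that is a multiple of 16 maps to an even latent dimension, which is exactly what the denoise node's 2x2 packing (and its _align_ref_latent_dims guard) requires. A tiny standalone check of that invariant, not part of the commit:

# Every multiple of 16 px divides to an even latent dim under the VAE's 8x downscale,
# so 2x2 patch packing never has to drop a trailing row or column.
VAE_SCALE_FACTOR = 8
for px in range(16, 4097, 16):
    assert (px // VAE_SCALE_FACTOR) % 2 == 0, px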
invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.test.ts

Lines changed: 83 additions & 1 deletion
@@ -140,7 +140,12 @@ vi.mock('services/api/types', async () => {
   };
 });
 
-import { buildQwenImageGraph, isQwenImageEditModel, shouldUseCfg } from './buildQwenImageGraph';
+import {
+  buildQwenImageGraph,
+  calculateQwenImageEditRefDimensions,
+  isQwenImageEditModel,
+  shouldUseCfg,
+} from './buildQwenImageGraph';
 
 describe('isQwenImageEditModel', () => {
   afterEach(() => {
@@ -415,3 +420,80 @@ describe('buildQwenImageGraph', () => {
     expect(hasReferenceLatentsEdge).toBe(false);
   });
 });
+
+describe('calculateQwenImageEditRefDimensions', () => {
+  // Cross-checked against diffusers' calculate_dimensions(1024*1024, ratio)
+  // (see pipeline_qwenimage_edit.py / pipeline_qwenimage_edit_plus.py).
+  it('produces ~1024² area for a square input', () => {
+    const result = calculateQwenImageEditRefDimensions(512, 512);
+    expect(result).toEqual({ width: 1024, height: 1024 });
+  });
+
+  it('preserves aspect ratio for landscape inputs', () => {
+    expect(calculateQwenImageEditRefDimensions(1600, 1200)).toEqual({ width: 1184, height: 896 });
+    expect(calculateQwenImageEditRefDimensions(1920, 1080)).toEqual({ width: 1376, height: 768 });
+  });
+
+  it('preserves aspect ratio for portrait inputs', () => {
+    expect(calculateQwenImageEditRefDimensions(1200, 1600)).toEqual({ width: 896, height: 1184 });
+    expect(calculateQwenImageEditRefDimensions(1080, 1920)).toEqual({ width: 768, height: 1376 });
+  });
+
+  it('snaps dimensions to multiples of 32', () => {
+    const { width, height } = calculateQwenImageEditRefDimensions(1600, 1200);
+    expect(width % 32).toBe(0);
+    expect(height % 32).toBe(0);
+  });
+
+  it('clamps to a minimum of 32 for extreme aspect ratios', () => {
+    // 50000x100 has aspect ratio 500:1 — height would round to 0 without the clamp.
+    const { width, height } = calculateQwenImageEditRefDimensions(50000, 100);
+    expect(height).toBeGreaterThanOrEqual(32);
+    expect(width).toBeGreaterThanOrEqual(32);
+    expect(width % 32).toBe(0);
+    expect(height % 32).toBe(0);
+  });
+
+  it('passes computed dims as width/height to the reference i2l node', async () => {
+    const { selectMainModelConfig } = await import('features/controlLayers/store/paramsSlice');
+    const editModel = { ...model, variant: 'edit' };
+    vi.mocked(selectMainModelConfig).mockReturnValue(editModel as never);
+
+    const { fetchModelConfigWithTypeGuard } = await import('features/metadata/util/modelFetchingHelpers');
+    vi.mocked(fetchModelConfigWithTypeGuard).mockResolvedValue(editModel as never);
+
+    const { selectRefImagesSlice } = await import('features/controlLayers/store/refImagesSlice');
+    vi.mocked(selectRefImagesSlice).mockReturnValue({
+      entities: [
+        {
+          id: 'ref-image-1',
+          isEnabled: true,
+          config: {
+            type: 'qwen_image_reference_image',
+            image: { original: { image: { image_name: 'ref.png', width: 1600, height: 1200 } } },
+          },
+        },
+      ],
+    } as never);
+
+    const { g } = await buildQwenImageGraph({
+      generationMode: 'txt2img',
+      manager: null,
+      state: {
+        system: { shouldUseNSFWChecker: false, shouldUseWatermarker: false },
+      } as never,
+    });
+
+    const graph = g.getGraph();
+    const refI2lNodeId = Object.keys(graph.nodes).find((id) => id.startsWith('qwen_ref_i2l:'));
+    expect(refI2lNodeId).toBeDefined();
+    const refI2lNode = graph.nodes[refI2lNodeId!] as { width?: number; height?: number };
+    expect(refI2lNode.width).toBe(1184);
+    expect(refI2lNode.height).toBe(896);
+
+    // Restore mocks
+    vi.mocked(selectMainModelConfig).mockReturnValue(model as never);
+    vi.mocked(fetchModelConfigWithTypeGuard).mockResolvedValue(model as never);
+    vi.mocked(selectRefImagesSlice).mockReturnValue(refImagesSlice as never);
+  });
+});

invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts

Lines changed: 30 additions & 6 deletions
@@ -51,6 +51,27 @@ export const shouldUseCfg = (cfgScale: number | number[]): boolean => {
   return cfgScale.some((value) => value > 1);
 };
 
+/**
+ * Compute the target dimensions for the VAE-encoded reference image, matching
+ * diffusers' `calculate_dimensions(VAE_IMAGE_SIZE=1024*1024, aspect_ratio)` used
+ * by QwenImageEditPipeline / QwenImageEditPlusPipeline. The reference is resized
+ * so its area is ~1024² while preserving aspect ratio, with each dimension
+ * snapped to a multiple of 32 (the model was trained at this scale; feeding it a
+ * much larger reference produces a sequence length it was not trained on).
+ */
+const QWEN_IMAGE_EDIT_REF_TARGET_AREA = 1024 * 1024;
+export const calculateQwenImageEditRefDimensions = (
+  width: number,
+  height: number
+): { width: number; height: number } => {
+  const ratio = width / height;
+  let w = Math.sqrt(QWEN_IMAGE_EDIT_REF_TARGET_AREA * ratio);
+  let h = w / ratio;
+  w = Math.max(32, Math.round(w / 32) * 32);
+  h = Math.max(32, Math.round(h / 32) * 32);
+  return { width: w, height: h };
+};
+
 export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise<GraphBuilderReturn> => {
   const { generationMode, state, manager } = arg;
 
@@ -175,15 +196,18 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise<GraphBuilderReturn> => {
     // Also VAE-encode the first reference image as latents for the denoising transformer.
     // The transformer expects [noisy_patches ; ref_patches] in its sequence.
     const firstConfig = validRefImageConfigs[0]!;
-    const firstImgField = zImageField.parse(
-      firstConfig.config.image?.crop?.image ?? firstConfig.config.image?.original.image
-    );
-    // Don't force-resize the reference image to the output dimensions — that would
-    // distort the aspect ratio when they differ. The I2L encodes at the image's
-    // native size; the denoise node handles dimension mismatches via interpolation.
+    const firstImage = firstConfig.config.image?.crop?.image ?? firstConfig.config.image?.original.image;
+    const firstImgField = zImageField.parse(firstImage);
+    // Resize the reference image to ~1024² area preserving aspect ratio, matching the
+    // diffusers QwenImageEdit(Plus)Pipeline's VAE_IMAGE_SIZE. The denoise node uses
+    // the reference latent's own dimensions for RoPE, so the ref segment is encoded
+    // at the resolution the model was trained on rather than the source image's
+    // native size.
+    const refDims = firstImage ? calculateQwenImageEditRefDimensions(firstImage.width, firstImage.height) : undefined;
     const refI2l = g.addNode({
       type: 'qwen_image_i2l',
      id: getPrefixedId('qwen_ref_i2l'),
+      ...(refDims ? { width: refDims.width, height: refDims.height } : {}),
     });
     const refImageNode = g.addNode({
       type: 'image',

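As a sanity check on the helper's arithmetic, the same computation can be reproduced in a few lines of Python against the vectors asserted in buildQwenImageGraph.test.ts above. This is a standalone sketch, not part of the commit; the local name ref_dims is illustrative only, and note that Math.round and Python's round only disagree on exact .5 inputs, which none of these vectors produce.

import math

TARGET_AREA = 1024 * 1024  # mirrors QWEN_IMAGE_EDIT_REF_TARGET_AREA


def ref_dims(width: int, height: int) -> tuple[int, int]:
    # Same steps as calculateQwenImageEditRefDimensions: scale to ~1024^2 area at
    # the input aspect ratio, then snap each dimension to a multiple of 32 with a
    # floor of 32.
    ratio = width / height
    w = math.sqrt(TARGET_AREA * ratio)
    h = w / ratio
    return (max(32, round(w / 32) * 32), max(32, round(h / 32) * 32))


assert ref_dims(512, 512) == (1024, 1024)   # square input lands exactly on 1024x1024
assert ref_dims(1600, 1200) == (1184, 896)  # landscape: area stays near 1024^2
assert ref_dims(1920, 1080) == (1376, 768)
assert ref_dims(1200, 1600) == (896, 1184)  # portrait mirrors the landscape case
assert ref_dims(1080, 1920) == (768, 1376)
w, h = ref_dims(50000, 100)
assert h == 32 and w % 32 == 0              # extreme ratio: height clamps to 32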