 import torch
 from torch import Tensor, nn

-from comfy.ldm.flux.math import attention
 from comfy.ldm.flux.layers import (
     MLPEmbedder,
     RMSNorm,
-    QKNorm,
-    SelfAttention,
     ModulationOut,
 )

@@ -48,124 +45,6 @@ def forward(self, x: Tensor) -> Tensor:
         return x


-class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
-        super().__init__()
-
-        mlp_hidden_dim = int(hidden_size * mlp_ratio)
-        self.num_heads = num_heads
-        self.hidden_size = hidden_size
-        self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
-
-        self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
-
-        self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
-
-        self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
-        self.flipped_img_txt = flipped_img_txt
-
-    def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}):
-        (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
-
-        # prepare image for attention
-        img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img))
-        img_qkv = self.img_attn.qkv(img_modulated)
-        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
-
-        # prepare txt for attention
-        txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt))
-        txt_qkv = self.txt_attn.qkv(txt_modulated)
-        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
-
-        # run actual attention
-        attn = attention(torch.cat((txt_q, img_q), dim=2),
-                         torch.cat((txt_k, img_k), dim=2),
-                         torch.cat((txt_v, img_v), dim=2),
-                         pe=pe, mask=attn_mask, transformer_options=transformer_options)
-
-        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
-
-        # calculate the img blocks
-        img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn))
-        img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img))))
-
-        # calculate the txt blocks
-        txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn))
-        txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt))))
-
-        if txt.dtype == torch.float16:
-            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
-
-        return img, txt
-
-
-class SingleStreamBlock(nn.Module):
-    """
-    A DiT block with parallel linear layers as described in
-    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        qk_scale: float = None,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-        self.hidden_dim = hidden_size
-        self.num_heads = num_heads
-        head_dim = hidden_size // num_heads
-        self.scale = qk_scale or head_dim**-0.5
-
-        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
-        # qkv and mlp_in
-        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
-        # proj and mlp_out
-        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
-
-        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
-
-        self.hidden_size = hidden_size
-        self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-
-        self.mlp_act = nn.GELU(approximate="tanh")
-
-    def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}) -> Tensor:
-        mod = vec
-        x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x))
-        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
-
-        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        q, k = self.norm(q, k, v)
-
-        # compute attention
-        attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
-        # compute activation in mlp stream, cat again and run second linear layer
-        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x.addcmul_(mod.gate, output)
-        if x.dtype == torch.float16:
-            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
-        return x
-
-
 class LastLayer(nn.Module):
     def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
         super().__init__()