
Commit 81c03d9 (parent: aa3d84d)

[major] add Nunchaku convert scripts; update SVDQuant evaluation results; update SpinQuant results


93 files changed (+100516 −98164 lines)

README.md (+31 −22)
@@ -17,7 +17,9 @@
 </p>

 ## News
-- [2025/01] 🎉 [**SVDQuant**](https://arxiv.org/abs/2411.05007) has been accepted to ICLR 2025!
+- [2025/02] 🎉 [**QServe**](https://arxiv.org/abs/2405.04532) has been accepted to MLSys 2025!
+- [2025/01] 🎉 [**SVDQuant**](https://arxiv.org/abs/2411.05007) has been accepted to ICLR 2025 (Spotlight)!
+- [2024/12] 🎉 [**QServe**](https://github.com/mit-han-lab/qserve) has been integrated into NVIDIA [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama)!
 - [2024/11] 🔥 Our latest **W4A4** diffusion model quantization work, the [**SVDQuant**](https://arxiv.org/abs/2411.05007) algorithm and the [**Nunchaku**](https://github.com/mit-han-lab/nunchaku) system, is publicly released! Check our [paper](http://arxiv.org/abs/2411.05007)!
 - [2024/05] 🔥 Our latest **W4A8KV4** LLM quantization work, the **QoQ** algorithm and the **QServe** system, is publicly released! **QoQ** is short for *quattuor-octō-quattuor*, which is 4-8-4 in Latin. Check our [paper](https://arxiv.org/abs/2405.04532)!

@@ -72,24 +74,30 @@ Diffusion models have been proven highly effective at generating high-quality images

 Below is the quality and similarity evaluated with 5000 samples from the MJHQ-30K dataset. IR means ImageReward. Our 4-bit results outperform other 4-bit baselines, effectively preserving the visual quality of 16-bit models.

-| Model | Precision | Method | FID ($\downarrow$) | IR ($\uparrow$) | LPIPS ($\downarrow$) | PSNR ($\uparrow$) |
-|----------------------------|-----------|---------|--------------------|-----------------|----------------------|-------------------|
-| FLUX.1-dev (50 Steps) | BF16 | -- | 20.3 | 0.953 | -- | -- |
-| | INT W8A8 | Ours | 20.4 | 0.948 | 0.089 | 27.0 |
-| | W4A16 | NF4 | 20.6 | 0.910 | 0.272 | 19.5 |
-| | INT W4A4 | Ours | **19.86** | 0.932 | 0.254 | 20.1 |
-| | FP W4A4 | Ours | 21.0 | **0.933** | **0.247** | **20.2** |
-| FLUX.1-schnell (4 Steps) | BF16 | -- | 19.2 | 0.938 | -- | -- |
-| | INT W8A8 | Ours | 19.2 | 0.966 | 0.120 | 22.9 |
-| | W4A16 | NF4 | 18.9 | 0.943 | 0.257 | 18.2 |
-| | INT W4A4 | Ours | **18.4** | **0.969** | 0.292 | 17.5 |
-| | FP W4A4 | Ours | 19.9 | 0.956 | 0.279 | 17.5 |
-| | FP16 | -- | 16.6 | 0.944 | -- | -- |
-| PixArt-Sigma (20 Steps) | INT W8A8 | ViDiT-Q | 15.7 | 0.944 | 0.137 | 22.5 |
-| | INT W8A8 | Ours | 16.3 | **0.955** | **0.109** | **23.7** |
-| | INT W4A8 | ViDiT-Q | 37.3 | 0.573 | 0.611 | 12.0 |
-| | INT W4A4 | Ours | 20.1 | 0.898 | 0.394 | 16.2 |
-| | FP W4A4 | Ours | **18.3** | **0.946** | **0.326** | **17.4** |
+| Model | Precision | Method | FID ($\downarrow$) | IR ($\uparrow$) | LPIPS ($\downarrow$) | PSNR ($\uparrow$) |
+|----------------------------|-----------|-----------|--------------------|-----------------|----------------------|-------------------|
+| FLUX.1-dev (50 Steps) | BF16 | -- | 20.3 | 0.953 | -- | -- |
+| | W4A16 | NF4 | 20.6 | 0.910 | 0.272 | 19.5 |
+| | INT W4A4 | | 20.2 | 0.908 | 0.322 | 18.5 |
+| | INT W4A4 | Ours | 19.9 | 0.935 | 0.223 | 21.0 |
+| | NVFP4 | | 20.3 | 0.961 | 0.345 | 16.3 |
+| | NVFP4 | Ours | 20.3 | 0.942 | 0.205 | 21.5 |
+| FLUX.1-schnell (4 Steps) | BF16 | -- | 19.2 | 0.938 | -- | -- |
+| | W4A16 | NF4 | 18.9 | 0.943 | 0.257 | 18.2 |
+| | INT W4A4 | | 18.1 | 0.962 | 0.345 | 16.3 |
+| | INT W4A4 | Ours | 18.3 | 0.951 | 0.257 | 18.3 |
+| | NVFP4 | | 19.0 | 0.952 | 0.276 | 17.6 |
+| | NVFP4 | Ours | 18.9 | 0.964 | 0.229 | 19.0 |
+| SANA-1.6b (20 Steps) | BF16 | -- | 20.6 | 0.952 | -- | -- |
+| | INT W4A4 | | 20.5 | 0.894 | 0.339 | 15.3 |
+| | INT W4A4 | Ours | 19.3 | 0.935 | 0.220 | 17.8 |
+| | NVFP4 | | 19.7 | 0.929 | 0.236 | 17.4 |
+| | NVFP4 | Ours | 20.2 | 0.941 | 0.176 | 19.0 |
+| PixArt-Sigma (20 Steps) | FP16 | -- | 16.6 | 0.944 | -- | -- |
+| | INT W4A8 | ViDiT-Q | 37.3 | 0.573 | 0.611 | 12.0 |
+| | INT W4A4 | Ours | 19.2 | 0.878 | 0.323 | 17.6 |
+| | NVFP4 | | 31.8 | 0.660 | 0.517 | 14.8 |
+| | NVFP4 | Ours | 16.6 | 0.940 | 0.271 | 18.5 |

 ### QServe: W4A8KV4 Quantization for Efficient LLM Serving
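For readers reproducing the similarity columns: LPIPS and PSNR compare each 4-bit sample against the matching 16-bit reference image. Below is a minimal, hypothetical sketch of that comparison using the `lpips` package; the paths and loader are assumptions, not the repository's actual evaluation script.

```python
# Hypothetical sketch: LPIPS and PSNR between a 16-bit reference image and
# the matching 4-bit sample. Paths and the `lpips` package are assumptions.
import lpips  # pip install lpips
import torch
from PIL import Image
from torchvision.transforms.functional import to_tensor

lpips_fn = lpips.LPIPS(net="alex")  # LPIPS expects inputs scaled to [-1, 1]

def load(path: str) -> torch.Tensor:
    # [1, 3, H, W] float tensor in [0, 1]
    return to_tensor(Image.open(path).convert("RGB")).unsqueeze(0)

def similarity(ref_path: str, quant_path: str) -> tuple[float, float]:
    ref, quant = load(ref_path), load(quant_path)
    mse = torch.mean((ref - quant) ** 2)
    psnr = (10 * torch.log10(1.0 / mse)).item()  # peak signal is 1.0 here
    dist = lpips_fn(ref * 2 - 1, quant * 2 - 1).item()
    return psnr, dist

print(similarity("bf16/0001.png", "w4a4/0001.png"))  # illustrative paths
```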

@@ -111,10 +119,11 @@ Below is the WikiText2 perplexity evaluated with 2048 sequence length. The lower
 | SmoothQuant | W8A8 | 3.23 | 6.38 | 3.14 | 6.28 | 5.54 | 4.95 | 3.36 | 5.73 | 5.13 | 4.23 | 5.29 | 4.69 |
 | GPTQ-R | W4A16 g128 | 3.46 | 6.64 | 3.42 | 6.56 | 5.63 | 4.99 | 3.43 | 5.83 | 5.20 | 4.22 | 5.39 | 4.68 |
 | AWQ | W4A16 g128 | 3.22 | 6.60 | 3.20 | 6.54 | 5.60 | 4.97 | 3.41 | 5.78 | 5.19 | 4.21 | 5.37 | 4.67 |
-| QuaRot | W4A4 | 5.97 | 8.32 | 6.75 | 8.33 | 6.19 | 5.45 | 3.83 | 6.34 | 5.58 | 4.64 | 5.77 | NaN |
+| QuaRot | W4A4 | 5.97 | 8.32 | 6.75 | 8.33 | 6.19 | 5.45 | 3.83 | 6.34 | 5.58 | 4.64 | 5.77 | - |
+| SpinQuant | W4A4 | 4.80 | 7.42 | 6.27 | 7.37 | 5.96 | 5.24 | 3.71 | 6.14 | 5.39 | 4.56 | - | - |
 | Atom | W4A4 g128 | - | - | 4.33 | 7.78 | 6.12 | 5.31 | 3.73 | 6.25 | 5.52 | 4.61 | 5.76 | 4.97 |
-| QoQ | W4A8KV4 | 3.69 | 6.91 | 3.65 | 6.84 | 5.75 | 5.11 | 3.51 | 5.92 | 5.27 | 4.32 | 5.45 | 4.73 |
-| QoQ | W4A8KV4 g128 | 3.54 | 6.80 | 3.51 | 6.73 | 5.68 | 5.05 | 3.46 | 5.88 | 5.23 | 4.27 | 5.41 | 4.73 |
+| QoQ | W4A8KV4 | 3.68 | 6.87 | 3.65 | 6.81 | 5.75 | 5.11 | 3.50 | 5.92 | 5.27 | 4.31 | 5.44 | 4.73 |
+| QoQ | W4A8KV4 g128 | 3.51 | 6.77 | 3.50 | 6.70 | 5.67 | 5.06 | 3.46 | 5.88 | 5.23 | 4.27 | 5.41 | 4.73 |

 \* SmoothQuant is evaluated with per-tensor static KV cache quantization.
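The perplexity protocol above is the standard fixed-length evaluation: concatenate the WikiText2 test split, slice it into 2048-token windows, and exponentiate the average per-token negative log-likelihood. A minimal sketch, assuming a Hugging Face causal LM (the checkpoint name is illustrative, not the repository's harness):

```python
# Minimal WikiText2 perplexity sketch: non-overlapping 2048-token windows,
# mean token NLL exponentiated. The checkpoint name is illustrative only.
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "meta-llama/Llama-2-7b-hf"  # assumption: any causal LM checkpoint
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.float16).cuda().eval()

text = "\n\n".join(load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"])
ids = tok(text, return_tensors="pt").input_ids
seqlen = 2048

nlls = []
for i in range(ids.shape[1] // seqlen):
    chunk = ids[:, i * seqlen : (i + 1) * seqlen].cuda()
    with torch.no_grad():
        loss = model(chunk, labels=chunk).loss  # mean NLL over shifted tokens
    nlls.append(loss.float() * seqlen)
ppl = torch.exp(torch.stack(nlls).sum() / (len(nlls) * seqlen))
print(f"WikiText2 perplexity (2048-token context): {ppl.item():.2f}")
```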

deepcompressor/app/diffusion/dataset/calib.py (+11 −10)
@@ -11,11 +11,19 @@
 import torch.utils.data
 from diffusers.models.attention import JointTransformerBlock
 from diffusers.models.attention_processor import Attention
-from diffusers.models.transformers.transformer_flux import FluxSingleTransformerBlock, FluxTransformerBlock
+from diffusers.models.transformers.transformer_flux import (
+    FluxSingleTransformerBlock,
+    FluxTransformerBlock,
+)
 from omniconfig import configclass

-from deepcompressor.data.cache import IOTensorsCache, ModuleForwardInput, TensorCache, TensorsCache
-from deepcompressor.data.utils.reshape import AttentionInputReshapeFn, LinearReshapeFn, ReshapeFn
+from deepcompressor.data.cache import (
+    IOTensorsCache,
+    ModuleForwardInput,
+    TensorCache,
+    TensorsCache,
+)
+from deepcompressor.data.utils.reshape import AttentionInputReshapeFn, LinearReshapeFn
 from deepcompressor.dataset.action import CacheAction, ConcatCacheAction
 from deepcompressor.dataset.cache import BaseCalibCacheLoader
 from deepcompressor.dataset.config import BaseDataLoaderConfig
@@ -113,9 +121,6 @@ def info(
             encoder_hidden_states_cache.reshape = AttentionInputReshapeFn(encoder_channels_dim)
         else:
             assert encoder_hidden_states_cache.channels_dim == encoder_channels_dim
-        if tensors["image_rotary_emb"] is None:
-            tensors.pop("image_rotary_emb")
-            cache.tensors.pop("image_rotary_emb")
         hidden_states, hidden_states_cache = tensors["hidden_states"], cache.tensors["hidden_states"]
         channels_dim = 1 if hidden_states.dim() == 4 else -1
         if hidden_states_cache.channels_dim is None:
@@ -163,7 +168,6 @@ def _init_cache(self, name: str, module: nn.Module) -> IOTensorsCache:
                 OrderedDict(
                     hidden_states=TensorCache(channels_dim=-1, reshape=LinearReshapeFn()),
                     temb=TensorCache(channels_dim=1, reshape=LinearReshapeFn()),
-                    image_rotary_emb=TensorCache(channels_dim=1, reshape=ReshapeFn()),
                 )
             ),
             outputs=TensorCache(channels_dim=-1, reshape=LinearReshapeFn()),
@@ -174,7 +178,6 @@ def _init_cache(self, name: str, module: nn.Module) -> IOTensorsCache:
                 OrderedDict(
                     hidden_states=TensorCache(channels_dim=None, reshape=None),
                     encoder_hidden_states=TensorCache(channels_dim=None, reshape=None),
-                    image_rotary_emb=TensorCache(channels_dim=1, reshape=ReshapeFn()),
                 ),
             ),
             outputs=TensorCache(channels_dim=None, reshape=None),
@@ -211,7 +214,6 @@ def _convert_layer_inputs(
         kwargs = {k: v for k, v in kwargs.items()}  # noqa: C416
         if "res_hidden_states_tuple" in kwargs:
             kwargs["res_hidden_states_tuple"] = None
-            # tree_map(lambda x: x.detach().cpu(), kwargs["res_hidden_states_tuple"])
         if "hidden_states" in kwargs:
             hidden_states = kwargs.pop("hidden_states")
             assert len(args) == 0, f"Invalid args: {args}"
@@ -333,7 +335,6 @@ def iter_layer_activations(  # noqa: C901
             layer_kwargs.pop("hidden_states", None)
             layer_kwargs.pop("encoder_hidden_states", None)
             layer_kwargs.pop("temb", None)
-            layer_kwargs.pop("image_rotary_emb", None)
             layer_struct = layer_structs[layer_idx]
             if isinstance(layer_struct, DiffusionBlockStruct):
                 assert layer_struct.name == layer_name
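Context for the `calib.py` changes: each `TensorCache` entry records which dimension of a cached tensor holds channels and how to flatten it into calibration samples, and this commit drops the `image_rotary_emb` entries from those caches. As a rough illustration of the underlying pattern, hook-based activation caching, here is a self-contained sketch (not the repository's actual `TensorCache`/`IOTensorsCache` classes):

```python
# Rough illustration of hook-based activation caching for calibration.
# deepcompressor's TensorCache/IOTensorsCache are richer; this is a sketch.
import torch
from torch import nn

cached: list[torch.Tensor] = []

def cache_inputs(module: nn.Module, args: tuple) -> None:
    x = args[0].detach()
    # flatten to [num_samples, channels]; channels sit on the last dim here,
    # which is what channels_dim=-1 with a linear-style reshape expresses
    cached.append(x.reshape(-1, x.shape[-1]).cpu())

layer = nn.Linear(64, 64)
handle = layer.register_forward_pre_hook(cache_inputs)
layer(torch.randn(2, 16, 64))
handle.remove()
print(cached[0].shape)  # torch.Size([32, 64])
```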

deepcompressor/app/diffusion/dataset/collect/calib.py (+52 −28)
@@ -2,18 +2,19 @@
 """Collect calibration dataset."""

 import os
-import random
 from dataclasses import dataclass

+import datasets
 import torch
-import torch.nn as nn
-import yaml
 from omniconfig import configclass
-from tqdm import trange
+from torch import nn
+from tqdm import tqdm

 from deepcompressor.app.diffusion.config import DiffusionPtqRunConfig
 from deepcompressor.utils.common import hash_str_to_int, tree_map

+from ...utils import get_control
+from ..data import get_dataset
 from .utils import CollectHook


@@ -22,7 +23,7 @@ def process(x: torch.Tensor) -> torch.Tensor:
     return torch.from_numpy(x.float().numpy()).to(dtype)


-def collect(config: DiffusionPtqRunConfig, filenames: list[str], dataset: dict[str, str]):
+def collect(config: DiffusionPtqRunConfig, dataset: datasets.Dataset):
     samples_dirpath = os.path.join(config.output.root, "samples")
     caches_dirpath = os.path.join(config.output.root, "caches")
     os.makedirs(samples_dirpath, exist_ok=True)
@@ -35,25 +36,48 @@ def collect(config: DiffusionPtqRunConfig, filenames: list[str], dataset: dict[s
     model.register_forward_hook(CollectHook(caches=caches), with_kwargs=True)

     batch_size = config.eval.batch_size
-    print(f"In total {len(filenames)} samples")
+    print(f"In total {len(dataset)} samples")
     print(f"Evaluating with batch size {batch_size}")
     pipeline.set_progress_bar_config(desc="Sampling", leave=False, dynamic_ncols=True, position=1)
-    num_batches = (len(filenames) + batch_size - 1) // batch_size
-    for i in trange(num_batches, desc="Images", leave=False, dynamic_ncols=True, position=0):
-        batch = filenames[i * batch_size : (i + 1) * batch_size]
-        prompts = [dataset[name] for name in batch]
-        seeds = [hash_str_to_int(name) for name in batch]
+    for batch in tqdm(
+        dataset.iter(batch_size=batch_size, drop_last_batch=False),
+        desc="Data",
+        leave=False,
+        dynamic_ncols=True,
+        total=(len(dataset) + batch_size - 1) // batch_size,
+    ):
+        filenames = batch["filename"]
+        prompts = batch["prompt"]
+        seeds = [hash_str_to_int(name) for name in filenames]
         generators = [torch.Generator(device=pipeline.device).manual_seed(seed) for seed in seeds]
-        images = pipeline(prompts, generator=generators, **config.eval.get_pipeline_kwargs()).images
-        if len(caches) == batch_size * config.eval.num_steps:
-            num_guidances = 1
-        elif len(caches) == 2 * batch_size * config.eval.num_steps:
-            num_guidances = 2
-        else:
-            raise ValueError(f"Unexpected number of caches: {len(caches)} != {batch_size} * {config.eval.num_steps}")
-        for j, (filename, image) in enumerate(zip(batch, images, strict=True)):
+        pipeline_kwargs = config.eval.get_pipeline_kwargs()
+
+        task = config.pipeline.task
+        control_root = config.eval.control_root
+        if task in ["canny-to-image", "depth-to-image", "inpainting"]:
+            controls = get_control(
+                task,
+                batch["image"],
+                names=batch["filename"],
+                data_root=os.path.join(
+                    control_root, collect_config.dataset_name, f"{dataset.config_name}-{config.eval.num_samples}"
+                ),
+            )
+            if task == "inpainting":
+                pipeline_kwargs["image"] = controls[0]
+                pipeline_kwargs["mask_image"] = controls[1]
+            else:
+                pipeline_kwargs["control_image"] = controls
+
+        result_images = pipeline(prompts, generator=generators, **pipeline_kwargs).images
+        num_guidances = (len(caches) // batch_size) // config.eval.num_steps
+        num_steps = len(caches) // (batch_size * num_guidances)
+        assert (
+            len(caches) == batch_size * num_steps * num_guidances
+        ), f"Unexpected number of caches: {len(caches)} != {batch_size} * {num_steps} * {num_guidances}"
+        for j, (filename, image) in enumerate(zip(filenames, result_images, strict=True)):
             image.save(os.path.join(samples_dirpath, f"{filename}.png"))
-            for s in range(config.eval.num_steps):
+            for s in range(num_steps):
                 for g in range(num_guidances):
                     c = caches[s * batch_size * num_guidances + g * batch_size + j]
                     c["filename"] = filename
@@ -82,7 +106,7 @@ class CollectConfig:

     root: str = "datasets"
     dataset_name: str = "qdiff"
-    prompt_path: str = "prompts/qdiff.yaml"
+    data_path: str = "prompts/qdiff.yaml"
     num_samples: int = 128


@@ -109,13 +133,13 @@
     )
     print(f"Saving caches to {collect_dirpath}")

-    dataset = yaml.safe_load(open(collect_config.prompt_path, "r"))
-    filenames = list(dataset.keys())
-    if collect_config.num_samples > 0:
-        random.Random(0).shuffle(filenames)
-        filenames = filenames[: collect_config.num_samples]
-    filenames = sorted(filenames)
+    dataset = get_dataset(
+        collect_config.data_path,
+        max_dataset_size=collect_config.num_samples,
+        return_gt=ptq_config.pipeline.task in ["canny-to-image"],
+        repeat=1,
+    )

     ptq_config.output.root = collect_dirpath
     os.makedirs(ptq_config.output.root, exist_ok=True)
-    collect(ptq_config, filenames=filenames, dataset=dataset)
+    collect(ptq_config, dataset=dataset)
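The rewritten bookkeeping in `collect()` no longer hard-codes one or two guidance branches: the forward hook fires once per model call, so `len(caches) == batch_size * num_steps * num_guidances`, and both factors can be inferred after sampling. For example, batch size 4 with 50 steps under classifier-free guidance (two forwards per step) yields 400 caches, giving `num_guidances = (400 // 4) // 50 = 2`. A small self-contained check of the step-major cache indexing used in the loop:

```python
# Sanity check of the cache layout assumed by the loop above: caches are
# ordered step-major, then guidance branch, then sample within the batch.
batch_size, num_steps, num_guidances = 4, 50, 2
num_caches = batch_size * num_steps * num_guidances  # 400 per batch

def cache_index(step: int, guidance: int, sample: int) -> int:
    return step * batch_size * num_guidances + guidance * batch_size + sample

seen = {
    cache_index(s, g, j)
    for s in range(num_steps)
    for g in range(num_guidances)
    for j in range(batch_size)
}
assert seen == set(range(num_caches))  # every cache is visited exactly once
```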

deepcompressor/app/diffusion/dataset/collect/utils.py (+6 −3)
@@ -6,8 +6,11 @@

 import torch
 import torch.nn as nn
-from diffusers.models.transformers.pixart_transformer_2d import PixArtTransformer2DModel
-from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
+from diffusers.models.transformers import (
+    FluxTransformer2DModel,
+    PixArtTransformer2DModel,
+    SanaTransformer2DModel,
+)
 from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel

 from deepcompressor.utils.common import tree_map, tree_split
@@ -51,7 +54,7 @@ def __call__(
             # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
             timesteps = timesteps.expand(sample.shape[0])
             input_kwargs["timestep"] = timesteps
-        elif isinstance(module, PixArtTransformer2DModel):
+        elif isinstance(module, (PixArtTransformer2DModel, SanaTransformer2DModel)):
             new_args.append(input_kwargs.pop("hidden_states"))
         elif isinstance(module, FluxTransformer2DModel):
             new_args.append(input_kwargs.pop("hidden_states"))
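`CollectHook` is registered with `with_kwargs=True`, so it receives both the positional and keyword inputs of every model forward. The sketch below illustrates that hook signature with a toy recorder; it is not the actual `CollectHook`, which additionally normalizes inputs per model class as this diff shows.

```python
# Stripped-down forward hook in the style of CollectHook: with_kwargs=True
# delivers both positional and keyword inputs of each forward call.
import torch
from torch import nn

class RecordingHook:
    def __init__(self, caches: list[dict]):
        self.caches = caches

    def __call__(self, module: nn.Module, args: tuple, kwargs: dict, output) -> None:
        record = {f"arg_{i}": a for i, a in enumerate(args)}
        record.update(kwargs)
        # detach and move tensors to CPU so caches survive many sampling steps
        self.caches.append(
            {k: v.detach().cpu() if isinstance(v, torch.Tensor) else v for k, v in record.items()}
        )

caches: list[dict] = []
layer = nn.Linear(8, 8)
handle = layer.register_forward_hook(RecordingHook(caches), with_kwargs=True)
layer(torch.randn(1, 8))
handle.remove()
print(list(caches[0].keys()))  # ['arg_0']
```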
