
Commit 7f9aecc

fix bugs in qwen3-omni
Signed-off-by: Chenguang ZHENG <645327136@qq.com>
1 parent 8668907 commit 7f9aecc

File tree

9 files changed: +339 -81 lines changed


examples/offline_inference/image_to_image/image_edit.py

Lines changed: 175 additions & 43 deletions
@@ -22,6 +22,33 @@
         --cfg_scale 4.0 \
         --guidance_scale 1.0
 
+Usage (with cache-dit acceleration):
+    python image_edit.py \
+        --image input.png \
+        --prompt "Edit description" \
+        --cache_backend cache_dit \
+        --cache_dit_max_continuous_cached_steps 3 \
+        --cache_dit_residual_diff_threshold 0.24 \
+        --cache_dit_enable_taylorseer
+
+Usage (with tea_cache acceleration):
+    python image_edit.py \
+        --image input.png \
+        --prompt "Edit description" \
+        --cache_backend tea_cache \
+        --tea_cache_rel_l1_thresh 0.25
+
+Usage (layered):
+    python image_edit.py \
+        --model "Qwen/Qwen-Image-Layered" \
+        --image input.png \
+        --prompt "" \
+        --output "layered" \
+        --num_inference_steps 50 \
+        --cfg_scale 4.0 \
+        --layers 4 \
+        --color-format "RGBA"
+
 For more options, run:
     python image_edit.py --help
 """

@@ -100,7 +127,7 @@ def parse_args() -> argparse.Namespace:
         "--output",
         type=str,
         default="output_image_edit.png",
-        help="Path to save the edited image (PNG).",
+        help=("Path to save the edited image (PNG), or a filename prefix for the images saved by the Qwen-Image-Layered model (PNG)."),
     )
     parser.add_argument(
         "--num_outputs_per_prompt",

@@ -132,6 +159,87 @@ def parse_args() -> argparse.Namespace:
         help="Number of GPUs used for ulysses sequence parallelism.",
     )
 
+    parser.add_argument("--layers", type=int, default=4, help="Number of layers to decompose the input image into.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=640,
+        help="Resolution bucket (640 or 1024) that determines the condition and output resolution.",
+    )
+
+    parser.add_argument(
+        "--color-format",
+        type=str,
+        default="RGB",
+        help="For Qwen-Image-Layered, set to RGBA.",
+    )
+
+    # Cache-DiT specific parameters
+    parser.add_argument(
+        "--cache_dit_fn_compute_blocks",
+        type=int,
+        default=1,
+        help="[cache-dit] Number of forward compute blocks. Optimized for single-transformer models.",
+    )
+    parser.add_argument(
+        "--cache_dit_bn_compute_blocks",
+        type=int,
+        default=0,
+        help="[cache-dit] Number of backward compute blocks.",
+    )
+    parser.add_argument(
+        "--cache_dit_max_warmup_steps",
+        type=int,
+        default=4,
+        help="[cache-dit] Maximum warmup steps (works for few-step models).",
+    )
+    parser.add_argument(
+        "--cache_dit_residual_diff_threshold",
+        type=float,
+        default=0.24,
+        help="[cache-dit] Residual diff threshold. Higher values enable more aggressive caching.",
+    )
+    parser.add_argument(
+        "--cache_dit_max_continuous_cached_steps",
+        type=int,
+        default=3,
+        help="[cache-dit] Maximum continuous cached steps to prevent precision degradation.",
+    )
+    parser.add_argument(
+        "--cache_dit_enable_taylorseer",
+        action="store_true",
+        default=False,
+        help="[cache-dit] Enable TaylorSeer acceleration (not suitable for few-step models).",
+    )
+    parser.add_argument(
+        "--cache_dit_taylorseer_order",
+        type=int,
+        default=1,
+        help="[cache-dit] TaylorSeer polynomial order.",
+    )
+    parser.add_argument(
+        "--cache_dit_scm_steps_mask_policy",
+        type=str,
+        default=None,
+        choices=[None, "slow", "medium", "fast", "ultra"],
+        help="[cache-dit] SCM mask policy: None (disabled), slow, medium, fast, ultra.",
+    )
+    parser.add_argument(
+        "--cache_dit_scm_steps_policy",
+        type=str,
+        default="dynamic",
+        choices=["dynamic", "static"],
+        help="[cache-dit] SCM steps policy: dynamic or static.",
+    )
+
+    # TeaCache specific parameters
+    parser.add_argument(
+        "--tea_cache_rel_l1_thresh",
+        type=float,
+        default=0.2,
+        help="[tea_cache] Threshold for accumulated relative L1 distance.",
+    )
+
     return parser.parse_args()
 
 

@@ -143,7 +251,8 @@ def main():
     for image_path in args.image:
         if not os.path.exists(image_path):
             raise FileNotFoundError(f"Input image not found: {image_path}")
-        img = Image.open(image_path).convert("RGB")
+
+        img = Image.open(image_path).convert(args.color_format)
         input_images.append(img)
 
     # Use single image or list based on number of inputs

@@ -164,29 +273,22 @@ def main():
     cache_config = None
     if args.cache_backend == "cache_dit":
         # cache-dit configuration: Hybrid DBCache + SCM + TaylorSeer
-        # All parameters marked with [cache-dit only] in DiffusionCacheConfig
         cache_config = {
-            # DBCache parameters [cache-dit only]
-            "Fn_compute_blocks": 1,  # Optimized for single-transformer models
-            "Bn_compute_blocks": 0,  # Number of backward compute blocks
-            "max_warmup_steps": 4,  # Maximum warmup steps (works for few-step models)
-            "residual_diff_threshold": 0.24,  # Higher threshold for more aggressive caching
-            "max_continuous_cached_steps": 3,  # Limit to prevent precision degradation
-            # TaylorSeer parameters [cache-dit only]
-            "enable_taylorseer": False,  # Disabled by default (not suitable for few-step models)
-            "taylorseer_order": 1,  # TaylorSeer polynomial order
-            # SCM (Step Computation Masking) parameters [cache-dit only]
-            "scm_steps_mask_policy": None,  # SCM mask policy: None (disabled), "slow", "medium", "fast", "ultra"
-            "scm_steps_policy": "dynamic",  # SCM steps policy: "dynamic" or "static"
+            "Fn_compute_blocks": args.cache_dit_fn_compute_blocks,
+            "Bn_compute_blocks": args.cache_dit_bn_compute_blocks,
+            "max_warmup_steps": args.cache_dit_max_warmup_steps,
+            "residual_diff_threshold": args.cache_dit_residual_diff_threshold,
+            "max_continuous_cached_steps": args.cache_dit_max_continuous_cached_steps,
+            "enable_taylorseer": args.cache_dit_enable_taylorseer,
+            "taylorseer_order": args.cache_dit_taylorseer_order,
+            "scm_steps_mask_policy": args.cache_dit_scm_steps_mask_policy,
+            "scm_steps_policy": args.cache_dit_scm_steps_policy,
         }
     elif args.cache_backend == "tea_cache":
         # TeaCache configuration
-        # All parameters marked with [tea_cache only] in DiffusionCacheConfig
        cache_config = {
-            # TeaCache parameters [tea_cache only]
-            "rel_l1_thresh": 0.2,  # Threshold for accumulated relative L1 distance
+            "rel_l1_thresh": args.tea_cache_rel_l1_thresh,
             # Note: coefficients will use model-specific defaults based on model_type
-            # (e.g., QwenImagePipeline or FluxPipeline)
         }
 
     # Initialize Omni with appropriate pipeline
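
The hunk above replaces the hard-coded cache_config literals with values taken from the new CLI flags. A minimal, self-contained sketch of that flag-to-dict flow, using a hypothetical stand-alone parser that mirrors only three of the flags rather than the script's full argument set:

    import argparse

    # Hypothetical minimal parser mirroring three of the cache-dit flags,
    # to show how parsed CLI values flow into the cache_config dict.
    parser = argparse.ArgumentParser()
    parser.add_argument("--cache_dit_residual_diff_threshold", type=float, default=0.24)
    parser.add_argument("--cache_dit_max_continuous_cached_steps", type=int, default=3)
    parser.add_argument("--cache_dit_enable_taylorseer", action="store_true", default=False)
    args = parser.parse_args(
        ["--cache_dit_residual_diff_threshold", "0.3", "--cache_dit_enable_taylorseer"]
    )
    cache_config = {
        "residual_diff_threshold": args.cache_dit_residual_diff_threshold,
        "max_continuous_cached_steps": args.cache_dit_max_continuous_cached_steps,
        "enable_taylorseer": args.cache_dit_enable_taylorseer,
    }
    print(cache_config)
    # {'residual_diff_threshold': 0.3, 'max_continuous_cached_steps': 3, 'enable_taylorseer': True}
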
@@ -218,16 +320,20 @@
     try:
         generation_start = time.perf_counter()
         # Generate edited image
-        outputs = omni.generate(
-            prompt=args.prompt,
-            pil_image=input_image,
-            negative_prompt=args.negative_prompt,
-            generator=generator,
-            true_cfg_scale=args.cfg_scale,
-            guidance_scale=args.guidance_scale,
-            num_inference_steps=args.num_inference_steps,
-            num_outputs_per_prompt=args.num_outputs_per_prompt,
-        )
+        generate_kwargs = {
+            "prompt": args.prompt,
+            "pil_image": input_image,
+            "negative_prompt": args.negative_prompt,
+            "generator": generator,
+            "true_cfg_scale": args.cfg_scale,
+            "guidance_scale": args.guidance_scale,
+            "num_inference_steps": args.num_inference_steps,
+            "num_outputs_per_prompt": args.num_outputs_per_prompt,
+            "layers": args.layers,
+            "resolution": args.resolution,
+        }
+
+        outputs = omni.generate(**generate_kwargs)
         generation_end = time.perf_counter()
         generation_time = generation_end - generation_start
 

@@ -239,15 +345,24 @@
         logger.info("Outputs: %s", outputs)
 
         # Extract images from OmniRequestOutput
-        first_output = outputs[0]
+        # Handle both OmniRequestOutput list and direct images list
         images = []
-        if getattr(first_output, "images", None):
-            images = first_output.images
-        elif getattr(first_output, "request_output", None):
-            req_out = first_output.request_output
-            if isinstance(req_out, list):
-                req_out = req_out[0]
-            images = getattr(req_out, "images", None) or []
+        if isinstance(outputs, list) and len(outputs) > 0:
+            first_output = outputs[0]
+            # Check if it's OmniRequestOutput with images attribute
+            if hasattr(first_output, "images") and first_output.images:
+                images = first_output.images
+            elif hasattr(first_output, "request_output") and first_output.request_output:
+                req_out = first_output.request_output
+                if isinstance(req_out, list):
+                    req_out = req_out[0]
+                if hasattr(req_out, "images"):
+                    images = req_out.images or []
+            # Check if outputs is already a list of images
+            elif isinstance(first_output, Image.Image):
+                images = outputs
+        elif isinstance(outputs, Image.Image):
+            images = [outputs]
 
         if not images:
             raise ValueError("No images found in omni.generate() output")

@@ -258,16 +373,33 @@
         suffix = output_path.suffix or ".png"
         stem = output_path.stem or "output_image_edit"
 
-        if len(images) <= 1:
-            images[0].save(output_path)
-            print(f"Saved edited image to {os.path.abspath(output_path)}")
+        # Handle layered output (each image may be a list of layers)
+        if args.num_outputs_per_prompt <= 1:
+            img = images[0]
+            # Check if this is a layered output (list of images)
+            if isinstance(img, list):
+                for sub_idx, sub_img in enumerate(img):
+                    save_path = output_path.parent / f"{stem}_{sub_idx}{suffix}"
+                    sub_img.save(save_path)
+                    print(f"Saved edited image to {os.path.abspath(save_path)}")
+            else:
+                img.save(output_path)
+                print(f"Saved edited image to {os.path.abspath(output_path)}")
         else:
             for idx, img in enumerate(images):
-                save_path = output_path.parent / f"{stem}_{idx}{suffix}"
-                img.save(save_path)
-                print(f"Saved edited image to {os.path.abspath(save_path)}")
+                # Check if this is a layered output (list of images)
+                if isinstance(img, list):
+                    for sub_idx, sub_img in enumerate(img):
+                        save_path = output_path.parent / f"{stem}_{idx}_{sub_idx}{suffix}"
+                        sub_img.save(save_path)
+                        print(f"Saved edited image to {os.path.abspath(save_path)}")
+                else:
+                    save_path = output_path.parent / f"{stem}_{idx}{suffix}"
+                    img.save(save_path)
+                    print(f"Saved edited image to {os.path.abspath(save_path)}")
     finally:
         omni.close()
 
+
 if __name__ == "__main__":
     main()
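
The new save logic treats each entry in images as either a single PIL image or a list of layers (the Qwen-Image-Layered case), and encodes the layer index in the filename. A runnable sketch of just the multi-output naming scheme shown above, with blank placeholder images standing in for model output:

    from pathlib import Path

    from PIL import Image

    def save_outputs(images, output: str = "layered.png") -> None:
        # Mirrors the multi-output branch above: flat entries are saved as
        # {stem}_{idx}{suffix}; layered entries (lists) get an extra
        # {sub_idx} component, one file per layer.
        output_path = Path(output)
        suffix = output_path.suffix or ".png"
        stem = output_path.stem or "output_image_edit"
        for idx, img in enumerate(images):
            if isinstance(img, list):
                for sub_idx, sub_img in enumerate(img):
                    sub_img.save(output_path.parent / f"{stem}_{idx}_{sub_idx}{suffix}")
            else:
                img.save(output_path.parent / f"{stem}_{idx}{suffix}")

    # One prompt decomposed into two RGBA layers, plus one flat image:
    layers = [Image.new("RGBA", (8, 8)), Image.new("RGBA", (8, 8))]
    save_outputs([layers, Image.new("RGBA", (8, 8))])
    # -> layered_0_0.png, layered_0_1.png, layered_1.png
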

vllm_omni/config/model.py

Lines changed: 24 additions & 4 deletions
@@ -87,7 +87,17 @@ def draw_hf_text_config(self):
         # we need to draw the text config from the corresponding model stage.
         if self.hf_config_name is None:
             return get_hf_text_config(self.hf_config)
-        return getattr(self.hf_config, self.hf_config_name).get_text_config()
+        try:
+            # Try to get the stage-specific config (e.g., thinker_config, talker_config)
+            stage_config = getattr(self.hf_config, self.hf_config_name)
+            return stage_config.get_text_config()
+        except AttributeError:
+            # Fallback: if the attribute doesn't exist, use the default get_hf_text_config
+            logger.warning(
+                f"Config attribute '{self.hf_config_name}' not found in hf_config, "
+                "falling back to default get_hf_text_config"
+            )
+            return get_hf_text_config(self.hf_config)
 
     def __post_init__(
         self,

@@ -173,9 +183,19 @@ def __post_init__(
         self.hf_text_config = self.draw_hf_text_config()
         self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None)
         self.encoder_config = self._get_encoder_config()
-        self.hf_image_processor_config = get_hf_image_processor_config(
-            self.model, hf_token=self.hf_token, revision=self.revision
-        )
+        # Try to load image processor config, but allow it to fail for stages that don't need it
+        try:
+            self.hf_image_processor_config = get_hf_image_processor_config(
+                self.model, hf_token=self.hf_token, revision=self.revision
+            )
+        except (OSError, ValueError, IndexError) as e:
+            # Some stages (e.g., code2wav, talker) don't need an image processor.
+            # Log a warning but allow initialization to continue.
+            logger.warning(
+                f"Failed to load image processor config for model '{self.model}': {e}. "
+                "This is expected for stages that don't require image processing."
+            )
+            self.hf_image_processor_config = None
 
         architectures = self.architectures
         registry = self.registry
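
Both hunks apply the same defensive pattern: attempt the stage-specific lookup, and on failure log a warning and fall back to a sensible default. A minimal sketch of the draw_hf_text_config fallback, using SimpleNamespace stand-ins instead of real Hugging Face config objects:

    import logging
    from types import SimpleNamespace

    logger = logging.getLogger(__name__)

    def draw_text_config(hf_config, hf_config_name):
        # Prefer the stage-specific sub-config (e.g. thinker_config);
        # fall back to the top-level config when the stage lacks it.
        try:
            return getattr(hf_config, hf_config_name).get_text_config()
        except AttributeError:
            logger.warning("Config attribute %r not found, falling back", hf_config_name)
            return hf_config  # stands in for get_hf_text_config(hf_config)

    thinker = SimpleNamespace(get_text_config=lambda: "thinker text config")
    cfg = SimpleNamespace(thinker_config=thinker)
    print(draw_text_config(cfg, "thinker_config"))  # thinker text config
    print(draw_text_config(cfg, "talker_config"))   # falls back to cfg itself
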

vllm_omni/diffusion/diffusion_engine.py

Lines changed: 12 additions & 12 deletions
@@ -67,15 +67,15 @@ def step(self, requests: list[OmniDiffusionRequest]):
                 return None
 
             postprocess_start_time = time.time()
-            images = self.post_process_func(output.output)
+            images = self.post_process_func(output.output) if self.post_process_func is not None else output.output
             postprocess_time = time.time() - postprocess_start_time
             logger.info(f"Post-processing completed in {postprocess_time:.4f} seconds")
 
             # Convert to OmniRequestOutput format
             # Ensure images is a list
             if not isinstance(images, list):
                 images = [images] if images is not None else []
-
+
             # Handle single request or multiple requests
             if len(requests) == 1:
                 # Single request: return single OmniRequestOutput

@@ -84,11 +84,11 @@ def step(self, requests: list[OmniDiffusionRequest]):
                 prompt = request.prompt
                 if isinstance(prompt, list):
                     prompt = prompt[0] if prompt else None
-
+
                 metrics = {}
                 if output.trajectory_timesteps is not None:
-                    metrics['trajectory_timesteps'] = output.trajectory_timesteps
-
+                    metrics["trajectory_timesteps"] = output.trajectory_timesteps
+
                 return OmniRequestOutput.from_diffusion(
                     request_id=request_id,
                     images=images,

@@ -101,22 +101,22 @@ def step(self, requests: list[OmniDiffusionRequest]):
             # Split images based on num_outputs_per_prompt for each request
             results = []
             image_idx = 0
-
+
             for request in requests:
                 request_id = request.request_id or ""
                 prompt = request.prompt
                 if isinstance(prompt, list):
                     prompt = prompt[0] if prompt else None
-
+
                 # Get images for this request
                 num_outputs = request.num_outputs_per_prompt
-                request_images = images[image_idx:image_idx + num_outputs] if image_idx < len(images) else []
+                request_images = images[image_idx : image_idx + num_outputs] if image_idx < len(images) else []
                 image_idx += num_outputs
-
+
                 metrics = {}
                 if output.trajectory_timesteps is not None:
-                    metrics['trajectory_timesteps'] = output.trajectory_timesteps
-
+                    metrics["trajectory_timesteps"] = output.trajectory_timesteps
+
                 results.append(
                     OmniRequestOutput.from_diffusion(
                         request_id=request_id,

@@ -126,7 +126,7 @@ def step(self, requests: list[OmniDiffusionRequest]):
                         latents=output.trajectory_latents,
                     )
                 )
-
+
             return results
         except Exception as e:
             logger.error(f"Generation failed: {e}")
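
In the multi-request branch, the engine distributes one flat images list across requests, num_outputs_per_prompt images each, in request order. A self-contained sketch of that slicing logic, with strings standing in for images and a hypothetical Req dataclass standing in for OmniDiffusionRequest:

    from dataclasses import dataclass

    @dataclass
    class Req:
        request_id: str
        num_outputs_per_prompt: int

    def split_images(images, requests):
        # Mirrors the multi-request branch of step(): consume the flat
        # list in order, one num_outputs_per_prompt-sized chunk per request.
        results, image_idx = [], 0
        for request in requests:
            num_outputs = request.num_outputs_per_prompt
            chunk = images[image_idx : image_idx + num_outputs] if image_idx < len(images) else []
            image_idx += num_outputs
            results.append((request.request_id, chunk))
        return results

    print(split_images(["img0", "img1", "img2"], [Req("a", 2), Req("b", 1)]))
    # [('a', ['img0', 'img1']), ('b', ['img2'])]
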
