From 7b499de6d04eab1180dd86ab667c6a66a816f0d6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 03:35:15 +0100 Subject: [PATCH 01/58] up --- .../modular_pipeline_utils.py | 127 +++++++++++++++++- .../qwenimage/before_denoise.py | 40 +++--- 2 files changed, 146 insertions(+), 21 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index aa421a53727b..afc4d6959a6f 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -17,7 +17,7 @@ from collections import OrderedDict from dataclasses import dataclass, field, fields from typing import Any, Dict, List, Literal, Optional, Type, Union - +import PIL.Image import torch from ..configuration_utils import ConfigMixin, FrozenDict @@ -342,6 +342,121 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" + @classmethod + def prompt(cls) -> "InputParam": + return cls(name="prompt", type_hint=str, required=True, + description="The prompt or prompts to guide image generation.") + + @classmethod + def negative_prompt(cls) -> "InputParam": + return cls(name="negative_prompt", type_hint=str, default=None, + description="The prompt or prompts not to guide the image generation.") + + @classmethod + def max_sequence_length(cls, default: int = 512) -> "InputParam": + return cls(name="max_sequence_length", type_hint=int, default=default, + description="Maximum sequence length for prompt encoding.") + + @classmethod + def height(cls, default: Optional[int] = None) -> "InputParam": + return cls(name="height", type_hint=int, default=default, + description="The height in pixels of the generated image.") + + @classmethod + def width(cls, default: Optional[int] = None) -> "InputParam": + return cls(name="width", type_hint=int, default=default, + description="The width in pixels of the generated image.") + + @classmethod + def num_inference_steps(cls, default: int = 50) -> "InputParam": + return cls(name="num_inference_steps", type_hint=int, default=default, + description="The number of denoising steps.") + + + @classmethod + def num_images_per_prompt(cls, default: int = 1) -> "InputParam": + return cls(name="num_images_per_prompt", type_hint=int, default=default, + description="The number of images to generate per prompt.") + + @classmethod + def generator(cls) -> "InputParam": + return cls(name="generator", type_hint=torch.Generator, default=None, + description="Torch generator for deterministic generation.") + + + @classmethod + def sigmas(cls) -> "InputParam": + return cls(name="sigmas", type_hint=List[float], default=None, + description="Custom sigmas for the denoising process.") + + @classmethod + def strength(cls, default: float = 0.9) -> "InputParam": + return cls(name="strength", type_hint=float, default=default, + description="Strength for img2img/inpainting.") + + @classmethod + def image(cls) -> "InputParam": + return cls(name="image", type_hint=PIL.Image.Image, required=True, + description="Input image for img2img, editing, or conditioning.") + + @classmethod + def mask_image(cls) -> "InputParam": + return cls(name="mask_image", type_hint=PIL.Image.Image, required=True, + description="Mask image for inpainting.") + + @classmethod + def control_image(cls) -> "InputParam": + return cls(name="control_image", type_hint=PIL.Image.Image, required=True, + description="Control image for ControlNet 
conditioning.") + + @classmethod + def padding_mask_crop(cls) -> "InputParam": + return cls(name="padding_mask_crop", type_hint=int, default=None, + description="Padding for mask cropping in inpainting.") + + + @classmethod + def latents(cls) -> "InputParam": + return cls(name="latents", type_hint=torch.Tensor, default=None, + description="Pre-generated noisy latents for image generation.") + + + @classmethod + def timesteps(cls) -> "InputParam": + return cls(name="timesteps", type_hint=torch.Tensor, default=None, + description="Timesteps for the denoising process.") + + + # ===================================================================== + # ControlNet + # ===================================================================== + + @classmethod + def control_guidance_start(cls, default: float = 0.0) -> "InputParam": + return cls(name="control_guidance_start", type_hint=float, default=default, + description="When to start applying ControlNet.") + + @classmethod + def control_guidance_end(cls, default: float = 1.0) -> "InputParam": + return cls(name="control_guidance_end", type_hint=float, default=default, + description="When to stop applying ControlNet.") + + @classmethod + def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": + return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, + description="Scale for ControlNet conditioning.") + + + @classmethod + def output_type(cls) -> "InputParam": + return cls(name="output_type", type_hint=str, default="pil", + description="Output format: 'pil', 'np', 'pt', or 'latent'.") + + @classmethod + def attention_kwargs(cls) -> "InputParam": + return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, + description="Additional kwargs for attention processors.") + @dataclass class OutputParam: @@ -357,6 +472,16 @@ def __repr__(self): f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>" ) + @classmethod + def images(cls) -> "OutputParam": + return cls(name="images", type_hint=List[PIL.Image.Image], + description="Generated images.") + + @classmethod + def latents(cls) -> "OutputParam": + return cls(name="latents", type_hint=torch.Tensor, + description="Denoised latents.") + def format_inputs_short(inputs): """ diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 0c66d6ea3303..6fa4a971c2c5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -134,11 +134,11 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents"), - InputParam(name="height"), - InputParam(name="width"), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="generator"), + InputParam.latents(), + InputParam.height(), + InputParam.width(), + InputParam.num_images_per_prompt(), + InputParam.generator(), InputParam( name="batch_size", required=True, @@ -225,12 +225,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents"), - InputParam(name="height"), - InputParam(name="width"), - InputParam(name="layers", default=4), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="generator"), + InputParam.latents(), + InputParam.height(), + InputParam.width(), + InputParam(name="layers", type_hint=int, 
default=4), + InputParam.num_images_per_prompt(), + InputParam.generator(), InputParam( name="batch_size", required=True, @@ -466,8 +466,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_inference_steps", default=50), - InputParam(name="sigmas"), + InputParam.num_inference_steps(), + InputParam.sigmas(), InputParam( name="latents", required=True, @@ -532,8 +532,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("num_inference_steps", default=50, type_hint=int), - InputParam("sigmas", type_hint=List[float]), + InputParam.num_inference_steps(), + InputParam.sigmas(), InputParam("image_latents", required=True, type_hint=torch.Tensor), ] @@ -590,8 +590,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_inference_steps", default=50), - InputParam(name="sigmas"), + InputParam.num_inference_steps(), + InputParam.sigmas(), InputParam( name="latents", required=True, @@ -971,9 +971,9 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam("control_guidance_start", default=0.0), - InputParam("control_guidance_end", default=1.0), - InputParam("controlnet_conditioning_scale", default=1.0), + InputParam.control_guidance_start(), + InputParam.control_guidance_end(), + InputParam.controlnet_conditioning_scale(), InputParam("control_image_latents", required=True), InputParam( "timesteps", From b29873dee72ea60e155a2a14a72e6e6ee6195b63 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 10:52:53 +0100 Subject: [PATCH 02/58] up up --- .../modular_pipeline_utils.py | 57 +++++++++++------ .../qwenimage/before_denoise.py | 6 +- .../modular_pipelines/qwenimage/decoders.py | 28 +++------ .../modular_pipelines/qwenimage/denoise.py | 16 ++--- .../modular_pipelines/qwenimage/encoders.py | 61 ++++++++++--------- .../modular_pipelines/qwenimage/inputs.py | 34 +++++------ .../qwenimage/modular_blocks_qwenimage.py | 26 ++++++-- .../modular_pipelines/z_image/denoise.py | 5 +- 8 files changed, 125 insertions(+), 108 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index afc4d6959a6f..cb179eccc7f7 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -342,6 +342,18 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" + + @classmethod + def template(cls, name: str) -> Optional["InputParam"]: + """Get template for name if exists, otherwise None.""" + if hasattr(cls, name) and callable(getattr(cls, name)): + return getattr(cls, name)() + return None + + # ====================================================== + # InputParam templates + # ====================================================== + @classmethod def prompt(cls) -> "InputParam": return cls(name="prompt", type_hint=str, required=True, @@ -383,7 +395,6 @@ def generator(cls) -> "InputParam": return cls(name="generator", type_hint=torch.Generator, default=None, description="Torch generator for deterministic generation.") - @classmethod def sigmas(cls) -> "InputParam": return cls(name="sigmas", type_hint=List[float], default=None, @@ -394,6 +405,7 @@ def strength(cls, default: float = 0.9) -> "InputParam": 
return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") + # images @classmethod def image(cls) -> "InputParam": return cls(name="image", type_hint=PIL.Image.Image, required=True, @@ -425,12 +437,24 @@ def latents(cls) -> "InputParam": def timesteps(cls) -> "InputParam": return cls(name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process.") + + @classmethod + def output_type(cls) -> "InputParam": + return cls(name="output_type", type_hint=str, default="pil", + description="Output format: 'pil', 'np', 'pt''.") - - # ===================================================================== - # ControlNet - # ===================================================================== + @classmethod + def attention_kwargs(cls) -> "InputParam": + return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, + description="Additional kwargs for attention processors.") + + @classmethod + def denoiser_input_fields(cls) -> "InputParam": + return cls(kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, + description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.") + + # ControlNet @classmethod def control_guidance_start(cls, default: float = 0.0) -> "InputParam": return cls(name="control_guidance_start", type_hint=float, default=default, @@ -446,18 +470,6 @@ def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, description="Scale for ControlNet conditioning.") - - @classmethod - def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", - description="Output format: 'pil', 'np', 'pt', or 'latent'.") - - @classmethod - def attention_kwargs(cls) -> "InputParam": - return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, - description="Additional kwargs for attention processors.") - - @dataclass class OutputParam: """Specification for an output parameter.""" @@ -472,6 +484,17 @@ def __repr__(self): f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>" ) + @classmethod + def template(cls, name: str) -> Optional["OutputParam"]: + """Get template for name if exists, otherwise None.""" + if hasattr(cls, name) and callable(getattr(cls, name)): + return getattr(cls, name)() + return None + + # ====================================================== + # OutputParam templates + # ====================================================== + @classmethod def images(cls) -> "OutputParam": return cls(name="images", type_hint=List[PIL.Image.Image], diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 6fa4a971c2c5..d61711e13a52 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -228,7 +228,7 @@ def inputs(self) -> List[InputParam]: InputParam.latents(), InputParam.height(), InputParam.width(), - InputParam(name="layers", type_hint=int, default=4), + InputParam(name="layers", type_hint=int, default=4, description="Number of layers to extract from the image"), InputParam.num_images_per_prompt(), InputParam.generator(), InputParam( @@ -598,7 +598,7 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The latents to use for the denoising process, used 
to calculate the image sequence length.", ), - InputParam(name="strength", default=0.9), + InputParam.strength(0.9), ] @property @@ -886,7 +886,7 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam(name="batch_size", required=True), - InputParam(name="layers", required=True), + InputParam(name="layers", default=4, description="Number of layers to extract from the image"), InputParam(name="height", required=True), InputParam(name="width", required=True), InputParam(name="prompt_embeds_mask"), diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 24a88ebfca3c..9c3a1c01d018 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -91,7 +91,7 @@ def inputs(self) -> List[InputParam]: InputParam("latents", required=True, type_hint=torch.Tensor), InputParam("height", required=True, type_hint=int), InputParam("width", required=True, type_hint=int), - InputParam("layers", required=True, type_hint=int), + InputParam("layers", default=4, description="Number of layers to extract from the image"), ] @torch.no_grad() @@ -141,11 +141,7 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[str]: return [ - OutputParam( - "images", - type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], - description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array", - ) + OutputParam.images() ] @torch.no_grad() @@ -198,14 +194,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("output_type", default="pil", type_hint=str), + InputParam("latents", required=True, type_hint=torch.Tensor, description="The latents to decode, can be generated in the denoise step"), + InputParam.output_type(), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]), + OutputParam.images(), ] @torch.no_grad() @@ -273,12 +269,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam("images", required=True, description="the generated image from decoders step"), - InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", - ), + InputParam.output_type(), ] @staticmethod @@ -323,12 +314,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam("images", required=True, description="the generated image from decoders step"), - InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", - ), + InputParam.output_type(), InputParam("mask_overlay_kwargs"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index eb1e5a341c68..472945b2269a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -218,7 +218,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), + InputParam.attention_kwargs(), InputParam( "latents", required=True, @@ -231,10 +231,7 @@ def 
inputs(self) -> List[InputParam]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), InputParam( "img_shapes", required=True, @@ -322,7 +319,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), + InputParam.attention_kwargs(), InputParam( "latents", required=True, @@ -335,10 +332,7 @@ def inputs(self) -> List[InputParam]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), InputParam( "img_shapes", required=True, @@ -424,7 +418,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."), + OutputParam.latents(), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 4b66dd32e521..2eca8645ef2c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -301,8 +301,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" + InputParam.template(self._image_input_name) or InputParam( + name=self._image_input_name, required=True, type_hint=torch.Tensor, description="Input image for conditioning" ), ] @@ -381,7 +381,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( + InputParam.template(self._image_input_name) or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), InputParam( @@ -484,7 +484,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( + InputParam.template(self._image_input_name) or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, @@ -564,7 +564,7 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", type_hint=str, description="The prompt to encode"), + InputParam(name="prompt", type_hint=str, description="The prompt to encode"), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -647,11 +647,9 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), - InputParam( - name="max_sequence_length", type_hint=int, description="The max sequence length to use", default=1024 - ), + InputParam.prompt(), + 
InputParam.negative_prompt(), + InputParam.max_sequence_length(1024), ] @property @@ -772,8 +770,8 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.prompt(), + InputParam.negative_prompt(), InputParam( name="resized_image", required=True, @@ -895,8 +893,8 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.prompt(), + InputParam.negative_prompt(), InputParam( name="resized_cond_image", required=True, @@ -1010,11 +1008,11 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("padding_mask_crop"), + InputParam.mask_image(), + InputParam.image(), + InputParam.height(), + InputParam.width(), + InputParam.padding_mask_crop(), ] @property @@ -1082,9 +1080,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("resized_image", required=True), - InputParam("padding_mask_crop"), + InputParam.mask_image(), + InputParam("resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step"), + InputParam.padding_mask_crop(), ] @property @@ -1140,9 +1138,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), + InputParam.image(), + InputParam.height(), + InputParam.width(), ] @property @@ -1312,7 +1310,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam(self._image_input_name, required=True), InputParam("generator")] + return [ + InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.generator(), + ] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -1383,10 +1384,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam("control_image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("generator"), + InputParam.control_image(), + InputParam.height(), + InputParam.width(), + InputParam.generator(), ] return inputs diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 4a1cf3700c57..e28493ecc369 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -129,7 +129,7 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"), InputParam(name="prompt_embeds_mask", required=True, 
kwargs_type="denoiser_input_fields"), InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"), @@ -269,17 +269,17 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="batch_size", required=True), - InputParam(name="height"), - InputParam(name="width"), + InputParam.height(), + InputParam.width(), ] for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam(name=image_latent_input_name)) + inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) for input_name in self._additional_batch_inputs: - inputs.append(InputParam(name=input_name)) + inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) return inputs @@ -398,17 +398,17 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="batch_size", required=True), - InputParam(name="height"), - InputParam(name="width"), + InputParam.height(), + InputParam.width(), ] for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam(name=image_latent_input_name)) + inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) for input_name in self._additional_batch_inputs: - inputs.append(InputParam(name=input_name)) + inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) return inputs @@ -544,15 +544,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="batch_size", required=True), ] for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam(name=image_latent_input_name)) + inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) for input_name in self._additional_batch_inputs: - inputs.append(InputParam(name=input_name)) + inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) return inputs @@ -638,9 +638,9 @@ def inputs(self) -> List[InputParam]: return [ InputParam(name="control_image_latents", required=True), InputParam(name="batch_size", required=True), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="height"), - InputParam(name="width"), + InputParam.num_images_per_prompt(), + InputParam.height(), + InputParam.width(), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 63e9f5a28372..c349c7d9f224 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -54,7 +54,23 @@ # ==================== -# 1. VAE ENCODER +# 1. TEXT ENCODER +# ==================== + +class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): + model_name = "qwenimage" + block_classes = [QwenImageTextEncoderStep()] + block_names = ["text_encoder"] + block_trigger_inputs = ["prompt"] + + @property + def description(self) -> str: + return "Text encoder step that encodes the text prompt into a text embedding. 
This is an auto pipeline block." + " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided." + " - if `prompt` is not provided, step will be skipped." + +# ==================== +# 2. VAE ENCODER # ==================== @@ -118,7 +134,7 @@ def description(self): # ==================== -# 2. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) +# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== @@ -396,7 +412,7 @@ def description(self): # ==================== -# 3. DECODE +# 4. DECODE # ==================== @@ -439,11 +455,11 @@ def description(self): # ==================== -# 4. AUTO BLOCKS & PRESETS +# 5. AUTO BLOCKS & PRESETS # ==================== AUTO_BLOCKS = InsertableDict( [ - ("text_encoder", QwenImageTextEncoderStep()), + ("text_encoder", QwenImageAutoTextEncoderStep()), ("vae_encoder", QwenImageAutoVaeEncoderStep()), ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()), ("denoise", QwenImageAutoCoreDenoiseStep()), diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py index 3d5a00a9df50..a165fb513f3c 100644 --- a/src/diffusers/modular_pipelines/z_image/denoise.py +++ b/src/diffusers/modular_pipelines/z_image/denoise.py @@ -129,10 +129,7 @@ def inputs(self) -> List[Tuple[str, Any]]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), ] guider_input_names = [] uncond_guider_input_names = [] From 43ab14845d9cbf090e0de0f1f284bdec54008954 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 10:56:54 +0100 Subject: [PATCH 03/58] update outputs --- .../modular_pipelines/qwenimage/modular_blocks_qwenimage.py | 6 ++---- .../qwenimage/modular_blocks_qwenimage_edit.py | 6 ++---- .../qwenimage/modular_blocks_qwenimage_edit_plus.py | 6 ++---- .../qwenimage/modular_blocks_qwenimage_layered.py | 6 ++---- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index f58dffd922fc..e112578c399d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -418,9 +418,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -500,5 +498,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]), + OutputParam.images(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 2683e64080bf..30fcb842d591 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -313,9 +313,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", 
type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -349,5 +347,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.images(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 99c5b109bf38..345b0cd93560 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -144,9 +144,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -196,5 +194,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.images(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 63ee36df5112..965f9e1976ad 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -142,9 +142,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.latents(), ] @@ -174,5 +172,5 @@ def description(self): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.images(), ] From 34a743e2dc36dc0ce7a86251ab3c4b74f89beb00 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 10:57:27 +0100 Subject: [PATCH 04/58] style --- .../modular_pipeline_utils.py | 191 +++++++++++------- .../qwenimage/before_denoise.py | 4 +- .../modular_pipelines/qwenimage/decoders.py | 15 +- .../modular_pipelines/qwenimage/encoders.py | 27 ++- .../qwenimage/modular_blocks_qwenimage.py | 6 +- .../modular_blocks_qwenimage_edit.py | 5 +- .../modular_blocks_qwenimage_edit_plus.py | 4 - .../modular_blocks_qwenimage_layered.py | 5 - 8 files changed, 155 insertions(+), 102 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index cb179eccc7f7..fab7c7193e5d 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -17,6 +17,7 @@ from collections import OrderedDict from dataclasses import dataclass, field, fields from typing import Any, Dict, List, Literal, Optional, Type, Union + import PIL.Image import torch @@ -342,7 +343,6 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" - @classmethod def template(cls, name: str) -> Optional["InputParam"]: """Get template for name if exists, otherwise None.""" @@ -356,119 +356,172 @@ def template(cls, name: str) -> Optional["InputParam"]: @classmethod def prompt(cls) -> "InputParam": - return cls(name="prompt", type_hint=str, required=True, - description="The 
prompt or prompts to guide image generation.") - + return cls( + name="prompt", type_hint=str, required=True, description="The prompt or prompts to guide image generation." + ) + @classmethod def negative_prompt(cls) -> "InputParam": - return cls(name="negative_prompt", type_hint=str, default=None, - description="The prompt or prompts not to guide the image generation.") - + return cls( + name="negative_prompt", + type_hint=str, + default=None, + description="The prompt or prompts not to guide the image generation.", + ) + @classmethod def max_sequence_length(cls, default: int = 512) -> "InputParam": - return cls(name="max_sequence_length", type_hint=int, default=default, - description="Maximum sequence length for prompt encoding.") - + return cls( + name="max_sequence_length", + type_hint=int, + default=default, + description="Maximum sequence length for prompt encoding.", + ) + @classmethod def height(cls, default: Optional[int] = None) -> "InputParam": - return cls(name="height", type_hint=int, default=default, - description="The height in pixels of the generated image.") - + return cls( + name="height", type_hint=int, default=default, description="The height in pixels of the generated image." + ) + @classmethod def width(cls, default: Optional[int] = None) -> "InputParam": - return cls(name="width", type_hint=int, default=default, - description="The width in pixels of the generated image.") + return cls( + name="width", type_hint=int, default=default, description="The width in pixels of the generated image." + ) @classmethod def num_inference_steps(cls, default: int = 50) -> "InputParam": - return cls(name="num_inference_steps", type_hint=int, default=default, - description="The number of denoising steps.") - - + return cls( + name="num_inference_steps", type_hint=int, default=default, description="The number of denoising steps." + ) + @classmethod def num_images_per_prompt(cls, default: int = 1) -> "InputParam": - return cls(name="num_images_per_prompt", type_hint=int, default=default, - description="The number of images to generate per prompt.") - + return cls( + name="num_images_per_prompt", + type_hint=int, + default=default, + description="The number of images to generate per prompt.", + ) + @classmethod def generator(cls) -> "InputParam": - return cls(name="generator", type_hint=torch.Generator, default=None, - description="Torch generator for deterministic generation.") - + return cls( + name="generator", + type_hint=torch.Generator, + default=None, + description="Torch generator for deterministic generation.", + ) + @classmethod def sigmas(cls) -> "InputParam": - return cls(name="sigmas", type_hint=List[float], default=None, - description="Custom sigmas for the denoising process.") - + return cls( + name="sigmas", type_hint=List[float], default=None, description="Custom sigmas for the denoising process." 
+ ) + @classmethod def strength(cls, default: float = 0.9) -> "InputParam": - return cls(name="strength", type_hint=float, default=default, - description="Strength for img2img/inpainting.") - + return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") + # images @classmethod def image(cls) -> "InputParam": - return cls(name="image", type_hint=PIL.Image.Image, required=True, - description="Input image for img2img, editing, or conditioning.") - + return cls( + name="image", + type_hint=PIL.Image.Image, + required=True, + description="Input image for img2img, editing, or conditioning.", + ) + @classmethod def mask_image(cls) -> "InputParam": - return cls(name="mask_image", type_hint=PIL.Image.Image, required=True, - description="Mask image for inpainting.") - + return cls( + name="mask_image", type_hint=PIL.Image.Image, required=True, description="Mask image for inpainting." + ) + @classmethod def control_image(cls) -> "InputParam": - return cls(name="control_image", type_hint=PIL.Image.Image, required=True, - description="Control image for ControlNet conditioning.") - + return cls( + name="control_image", + type_hint=PIL.Image.Image, + required=True, + description="Control image for ControlNet conditioning.", + ) + @classmethod def padding_mask_crop(cls) -> "InputParam": - return cls(name="padding_mask_crop", type_hint=int, default=None, - description="Padding for mask cropping in inpainting.") - + return cls( + name="padding_mask_crop", + type_hint=int, + default=None, + description="Padding for mask cropping in inpainting.", + ) @classmethod def latents(cls) -> "InputParam": - return cls(name="latents", type_hint=torch.Tensor, default=None, - description="Pre-generated noisy latents for image generation.") - - + return cls( + name="latents", + type_hint=torch.Tensor, + default=None, + description="Pre-generated noisy latents for image generation.", + ) + @classmethod def timesteps(cls) -> "InputParam": - return cls(name="timesteps", type_hint=torch.Tensor, default=None, - description="Timesteps for the denoising process.") + return cls( + name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process." + ) @classmethod def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", - description="Output format: 'pil', 'np', 'pt''.") - + return cls(name="output_type", type_hint=str, default="pil", description="Output format: 'pil', 'np', 'pt''.") + @classmethod def attention_kwargs(cls) -> "InputParam": - return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, - description="Additional kwargs for attention processors.") + return cls( + name="attention_kwargs", + type_hint=Dict[str, Any], + default=None, + description="Additional kwargs for attention processors.", + ) @classmethod def denoiser_input_fields(cls) -> "InputParam": - return cls(kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.") - + return cls( + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", + ) # ControlNet @classmethod def control_guidance_start(cls, default: float = 0.0) -> "InputParam": - return cls(name="control_guidance_start", type_hint=float, default=default, - description="When to start applying ControlNet.") - + return cls( + name="control_guidance_start", + type_hint=float, + default=default, + description="When to start applying ControlNet.", + ) + @classmethod def control_guidance_end(cls, default: float = 1.0) -> "InputParam": - return cls(name="control_guidance_end", type_hint=float, default=default, - description="When to stop applying ControlNet.") - + return cls( + name="control_guidance_end", + type_hint=float, + default=default, + description="When to stop applying ControlNet.", + ) + @classmethod def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": - return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, - description="Scale for ControlNet conditioning.") + return cls( + name="controlnet_conditioning_scale", + type_hint=float, + default=default, + description="Scale for ControlNet conditioning.", + ) + @dataclass class OutputParam: @@ -497,13 +550,11 @@ def template(cls, name: str) -> Optional["OutputParam"]: @classmethod def images(cls) -> "OutputParam": - return cls(name="images", type_hint=List[PIL.Image.Image], - description="Generated images.") - + return cls(name="images", type_hint=List[PIL.Image.Image], description="Generated images.") + @classmethod def latents(cls) -> "OutputParam": - return cls(name="latents", type_hint=torch.Tensor, - description="Denoised latents.") + return cls(name="latents", type_hint=torch.Tensor, description="Denoised latents.") def format_inputs_short(inputs): diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index d61711e13a52..cb808b1d3807 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -228,7 +228,9 @@ def inputs(self) -> List[InputParam]: InputParam.latents(), InputParam.height(), InputParam.width(), - InputParam(name="layers", type_hint=int, default=4, description="Number of layers to extract from the image"), + InputParam( + name="layers", type_hint=int, default=4, description="Number of layers to extract from the image" + ), InputParam.num_images_per_prompt(), InputParam.generator(), InputParam( diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 9c3a1c01d018..8207e99b69ae 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
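(For reference: a minimal sketch, not part of this patch, of how the `InputParam`/`OutputParam` templates and the `InputParam.template(...) or InputParam(...)` fallback defined above are meant to be consumed from a block's `inputs`/`outputs` properties. `ExampleBlock` and the import path are illustrative assumptions; the template methods themselves are the ones added above.)

from typing import List

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam, OutputParam


class ExampleBlock:
    # Plain class for illustration only; real blocks derive from the modular
    # pipeline block base classes and declare many more properties.
    _image_input_name = "control_image"

    @property
    def inputs(self) -> List[InputParam]:
        return [
            # shared parameters come pre-typed and pre-described from the templates
            InputParam.height(),
            InputParam.width(),
            InputParam.num_images_per_prompt(),
            # fall back to a plain InputParam when no template exists for the name
            InputParam.template(self._image_input_name)
            or InputParam(name=self._image_input_name, required=True),
        ]

    @property
    def outputs(self) -> List[OutputParam]:
        return [OutputParam.latents()]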
-from typing import List, Union +from typing import List -import numpy as np -import PIL import torch from ...configuration_utils import FrozenDict @@ -140,9 +138,7 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[str]: - return [ - OutputParam.images() - ] + return [OutputParam.images()] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -194,7 +190,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor, description="The latents to decode, can be generated in the denoise step"), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step", + ), InputParam.output_type(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 2eca8645ef2c..f0dd6471b168 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -301,8 +301,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="Input image for conditioning" + InputParam.template(self._image_input_name) + or InputParam( + name=self._image_input_name, + required=True, + type_hint=torch.Tensor, + description="Input image for conditioning", ), ] @@ -381,7 +385,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam( + InputParam.template(self._image_input_name) + or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), InputParam( @@ -484,7 +489,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam( + InputParam.template(self._image_input_name) + or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, @@ -564,7 +570,9 @@ def expected_configs(self) -> List[ConfigSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", type_hint=str, description="The prompt to encode"), # it is not required for qwenimage-layered, unlike other pipelines + InputParam( + name="prompt", type_hint=str, description="The prompt to encode" + ), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -1081,7 +1089,12 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.mask_image(), - InputParam("resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step"), + InputParam( + "resized_image", + required=True, + type_hint=PIL.Image.Image, + description="The resized image. 
should be generated using a resize step", + ), InputParam.padding_mask_crop(), ] @@ -1311,7 +1324,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), InputParam.generator(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index e112578c399d..d6117a12a57d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks @@ -62,6 +58,7 @@ # 1. TEXT ENCODER # ==================== + class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] @@ -74,6 +71,7 @@ def description(self) -> str: " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided." " - if `prompt` is not provided, step will be skipped." + # ==================== # 2. VAE ENCODER # ==================== diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 30fcb842d591..14d0945dbe57 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional - -import PIL.Image -import torch +from typing import Optional from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 345b0cd93560..fbe5e60f353f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 965f9e1976ad..e91a5c40b19b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -13,11 +13,6 @@ # limitations under the License. 
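(A sketch, under assumed import paths, of the `AutoPipelineBlocks` gating pattern used by `QwenImageAutoTextEncoderStep` and `QwenImageOptionalControlNetVaeEncoderStep`: the wrapped block runs only when its trigger input is present in the pipeline state, otherwise the whole step is skipped. `OptionalTextEncoderStep` is a hypothetical name. Note that the string literals in `description` are parenthesized so every bullet line is part of the returned text; written as separate statements after `return`, the extra literals would be dropped.)

from diffusers.modular_pipelines import AutoPipelineBlocks
from diffusers.modular_pipelines.qwenimage.encoders import QwenImageTextEncoderStep


class OptionalTextEncoderStep(AutoPipelineBlocks):
    model_name = "qwenimage"
    block_classes = [QwenImageTextEncoderStep()]
    block_names = ["text_encoder"]
    # the sub-block is selected when `prompt` is provided; otherwise this step is a no-op
    block_trigger_inputs = ["prompt"]

    @property
    def description(self) -> str:
        return (
            "Text encoder step that encodes the text prompt into a text embedding. "
            "This is an auto pipeline block.\n"
            " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided.\n"
            " - if `prompt` is not provided, the step will be skipped."
        )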
-from typing import List - -import PIL.Image -import torch - from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam From ff09bf1a631e38683205217e8dba4961de090319 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 11:55:03 +0100 Subject: [PATCH 05/58] add modular_auto_docstring! --- .../qwenimage/modular_blocks_qwenimage.py | 814 +++++++++++++++++- utils/modular_auto_docstring.py | 296 +++++++ 2 files changed, 1104 insertions(+), 6 deletions(-) create mode 100644 utils/modular_auto_docstring.py diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index d6117a12a57d..19feffe77eda 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -58,8 +58,59 @@ # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): + """ + class QwenImageAutoTextEncoderStep + + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + + Components: + + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] block_names = ["text_encoder"] @@ -76,8 +127,54 @@ def description(self) -> str: # 2. VAE ENCODER # ==================== - +#auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintVaeEncoderStep + + This step is used for processing image and mask inputs for inpainting tasks. It: + - Resizes the image to the target size, based on `height` and `width`. + - Processes and updates `image` and `mask_image`. + - Creates `image_latents`. + + Components: + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + mask_image (`Image`): + Mask image for inpainting. + + image (`Image`): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. 
+ + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + processed_image (`None`): + + processed_mask_image (`None`): + + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @@ -92,7 +189,40 @@ def description(self) -> str: ) +#auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageImg2ImgVaeEncoderStep + + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage" block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -103,7 +233,6 @@ def description(self) -> str: return "Vae encoder step that preprocess andencode the image inputs into their latent representations." -# Auto VAE encoder class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] block_names = ["inpaint", "img2img"] @@ -121,7 +250,43 @@ def description(self): # optional controlnet vae encoder +#auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): + """ + class QwenImageOptionalControlNetVaeEncoderStep + + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. + - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. + - if `control_image` is not provided, step will be skipped. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + control_image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + control_image (`Image`, *optional*): + Control image for ControlNet conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + control_image_latents (`Tensor`): + The latents representing the control image + """ block_classes = [QwenImageControlNetVaeEncoderStep] block_names = ["controlnet"] block_trigger_inputs = ["control_image"] @@ -142,7 +307,52 @@ def description(self): # assemble input steps +#auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): + """ + class QwenImageImg2ImgInputStep + + Input step that prepares the inputs for the img2img denoising step. 
It: + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage" block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] block_names = ["text_inputs", "additional_inputs"] @@ -154,7 +364,54 @@ def description(self): " - update height/width based `image_latents`, patchify `image_latents`." +#auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintInputStep + + Input step that prepares the inputs for the inpainting denoising step. It: + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -172,7 +429,49 @@ def description(self): # assemble prepare latents steps +#auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintPrepareLatentsStep + + This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: + - Add noise to the image latents to create the latents input for the denoiser. + - Create the pachified latents `mask` based on the processedmask image. + + Components: + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + + image_latents (`Tensor`): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
+ + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + + height (`None`): + + width (`None`): + + dtype (`None`): + + Outputs: + + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + + mask (`Tensor`): + The mask to use for the inpainting process. + """ model_name = "qwenimage" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -190,7 +489,66 @@ def description(self) -> str: # Qwen Image (text2image) +#auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageCoreDenoiseStep + + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -212,10 +570,81 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (inpainting) +#auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. 
+ + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageInpaintInputStep(), @@ -240,9 +669,78 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (image2image) +#auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageImg2ImgCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageImg2ImgInputStep(), @@ -267,9 +765,87 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." 
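# [Editorial sketch] The `strength` input documented above decides how much of the
# schedule is actually run for img2img/inpaint workflows. Below is the usual diffusers
# convention, shown only as an illustration -- the QwenImage-specific logic lives in
# QwenImagePrepareLatentsWithStrengthStep and the set_timesteps block, not here.
num_inference_steps = 50
strength = 0.9  # the documented default

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
steps_actually_run = num_inference_steps - t_start
print(t_start, steps_actually_run)  # 5 45 -> skip the 5 noisiest steps, run the last 45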
+ @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (text2image) with controlnet +#auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageControlNetCoreDenoiseStep + + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + control_image_latents (`None`): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -295,10 +871,95 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Qwen Image (inpainting) with controlnet +#auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageControlNetInpaintCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + control_image_latents (`None`): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageInpaintInputStep(), @@ -327,9 +988,93 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] + # Qwen Image (image2image) with controlnet +#auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageControlNetImg2ImgCoreDenoiseStep + + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + control_image_latents (`None`): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. 
+ + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageImg2ImgInputStep(), @@ -357,7 +1102,12 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # Auto denoise step for QwenImage class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): @@ -426,7 +1176,32 @@ def outputs(self): # standard decode step works for most tasks except for inpaint +#auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -437,7 +1212,34 @@ def description(self): # Inpaint decode step +#auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py new file mode 100644 index 000000000000..c6aaf8a46a56 --- /dev/null +++ b/utils/modular_auto_docstring.py @@ -0,0 +1,296 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Auto Docstring Generator for Modular Pipeline Blocks + +This script scans Python files for classes that have `# auto_docstring` comment above them +and inserts/updates the docstring from the class's `doc` property. + +Run from the root of the repo: + python utils/modular_auto_docstring.py [path] [--fix_and_overwrite] + +Examples: + # Check for auto_docstring markers (will error if found without proper docstring) + python utils/modular_auto_docstring.py + + # Check specific directory + python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/ + + # Fix and overwrite the docstrings + python utils/modular_auto_docstring.py --fix_and_overwrite + +Usage in code: + # auto_docstring + class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): + # docstring will be automatically inserted here + + @property + def doc(self): + return "Your docstring content..." +""" + +import argparse +import ast +import glob +import importlib +import os +import re +import sys + + +# All paths are set with the intent you should run this script from the root of the repo +DIFFUSERS_PATH = "src/diffusers" +REPO_PATH = "." + +# Pattern to match the auto_docstring comment +AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$") + + +def setup_diffusers_import(): + """Setup import path to use the local diffusers module.""" + src_path = os.path.join(REPO_PATH, "src") + if src_path not in sys.path: + sys.path.insert(0, src_path) + + +def get_module_from_filepath(filepath: str) -> str: + """Convert a filepath to a module name.""" + filepath = os.path.normpath(filepath) + + if filepath.startswith("src" + os.sep): + filepath = filepath[4:] + + if filepath.endswith(".py"): + filepath = filepath[:-3] + + module_name = filepath.replace(os.sep, ".") + return module_name + + +def load_module(filepath: str): + """Load a module from filepath.""" + setup_diffusers_import() + module_name = get_module_from_filepath(filepath) + + try: + module = importlib.import_module(module_name) + return module + except Exception as e: + print(f"Warning: Could not import module {module_name}: {e}") + return None + + +def get_doc_from_class(module, class_name: str) -> str: + """Get the doc property from an instantiated class.""" + if module is None: + return None + + cls = getattr(module, class_name, None) + if cls is None: + return None + + try: + instance = cls() + if hasattr(instance, "doc"): + return instance.doc + except Exception as e: + print(f"Warning: Could not instantiate {class_name}: {e}") + + return None + + +def find_auto_docstring_classes(filepath: str) -> list: + """ + Find all classes in a file that have # auto_docstring comment above them. 
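# [Editorial note] Worked examples of the two helpers defined above, assuming the
# script is run from the repo root with a POSIX path separator (outputs derived by
# hand from the code as written, not from running the tool):
#
#   >>> get_module_from_filepath("src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py")
#   'diffusers.modular_pipelines.qwenimage.modular_blocks_qwenimage'
#
#   >>> bool(AUTO_DOCSTRING_PATTERN.match("# auto_docstring"))
#   True
#   >>> bool(AUTO_DOCSTRING_PATTERN.match("#auto_docstring"))  # \s* also accepts the no-space form used in PATCH 05
#   True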
+ + Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) + """ + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Parse AST to find class locations and their docstrings + content = "".join(lines) + try: + tree = ast.parse(content) + except SyntaxError as e: + print(f"Syntax error in {filepath}: {e}") + return [] + + # Build a map of class_name -> (class_line, has_docstring, docstring_end_line) + class_info = {} + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + has_docstring = False + docstring_end_line = node.lineno # default to class line + + if node.body and isinstance(node.body[0], ast.Expr): + first_stmt = node.body[0] + if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str): + has_docstring = True + docstring_end_line = first_stmt.end_lineno or first_stmt.lineno + + class_info[node.name] = (node.lineno, has_docstring, docstring_end_line) + + # Now scan for # auto_docstring comments + classes_to_update = [] + + for i, line in enumerate(lines): + if AUTO_DOCSTRING_PATTERN.match(line): + # Found the marker, look for class definition on next non-empty, non-comment line + j = i + 1 + while j < len(lines): + next_line = lines[j].strip() + if next_line and not next_line.startswith("#"): + break + j += 1 + + if j < len(lines) and lines[j].strip().startswith("class "): + # Extract class name + match = re.match(r"class\s+(\w+)", lines[j].strip()) + if match: + class_name = match.group(1) + if class_name in class_info: + class_line, has_docstring, docstring_end_line = class_info[class_name] + classes_to_update.append(( + class_name, + class_line, + has_docstring, + docstring_end_line + )) + + return classes_to_update + + +def format_docstring(doc: str, indent: str = " ") -> str: + """Format a doc string as a properly indented docstring.""" + lines = doc.strip().split("\n") + + if len(lines) == 1: + return f'{indent}"""{lines[0]}"""\n' + else: + result = [f'{indent}"""\n'] + for line in lines: + if line.strip(): + result.append(f"{indent}{line}\n") + else: + result.append("\n") + result.append(f'{indent}"""\n') + return "".join(result) + + +def process_file(filepath: str, overwrite: bool = False) -> list: + """ + Process a file and find/insert docstrings for # auto_docstring marked classes. + + Returns list of classes that need updating. 
+ """ + classes_to_update = find_auto_docstring_classes(filepath) + + if not classes_to_update: + return [] + + if not overwrite: + # Just return the list of classes that need updating + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] + + # Load the module to get doc properties + module = load_module(filepath) + + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Process in reverse order to maintain line numbers + updated = False + for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update): + doc = get_doc_from_class(module, class_name) + + if doc is None: + print(f"Warning: Could not get doc for {class_name} in {filepath}") + continue + + # Format the new docstring with 4-space indent + new_docstring = format_docstring(doc, " ") + + if has_docstring: + # Replace existing docstring (line after class definition to docstring_end_line) + # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line + lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:] + else: + # Insert new docstring right after class definition line + # class_line is 1-indexed, so lines[class_line-1] is the class line + # Insert at position class_line (which is right after the class line) + lines = lines[:class_line] + [new_docstring] + lines[class_line:] + + updated = True + print(f"Updated docstring for {class_name} in {filepath}") + + if updated: + with open(filepath, "w", encoding="utf-8", newline="\n") as f: + f.writelines(lines) + + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] + + +def check_auto_docstrings(path: str = None, overwrite: bool = False): + """ + Check all files for # auto_docstring markers and optionally fix them. + """ + if path is None: + path = DIFFUSERS_PATH + + if os.path.isfile(path): + all_files = [path] + else: + all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True) + + all_markers = [] + + for filepath in all_files: + markers = process_file(filepath, overwrite) + all_markers.extend(markers) + + if not overwrite and len(all_markers) > 0: + message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers]) + raise ValueError( + f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n" + f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them." 
+ ) + + if overwrite and len(all_markers) > 0: + print(f"\nUpdated {len(all_markers)} docstring(s).") + elif len(all_markers) == 0: + print("No # auto_docstring markers found.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check and fix # auto_docstring markers in modular pipeline blocks", + ) + parser.add_argument( + "path", + nargs="?", + default=None, + help="File or directory to process (default: src/diffusers)" + ) + parser.add_argument( + "--fix_and_overwrite", + action="store_true", + help="Whether to fix the docstrings by inserting them from doc property.", + ) + + args = parser.parse_args() + + check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file From d20f413f78822e9513bd60c203bf0f58885b3a54 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:11:28 +0100 Subject: [PATCH 06/58] more auto docstring --- .../modular_blocks_qwenimage_edit.py | 471 +++++++++++++++++- .../modular_blocks_qwenimage_edit_plus.py | 226 ++++++++- .../modular_blocks_qwenimage_layered.py | 245 ++++++++- 3 files changed, 935 insertions(+), 7 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 14d0945dbe57..cae6236eb5aa 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -55,9 +55,62 @@ # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): - """VL encoder that takes both image and text prompts.""" + """ + class QwenImageEditVLEncoderStep + + QwenImage-Edit VL encoder step that encode the image and text prompts together. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + Outputs: + + resized_image (`List`): + The resized images + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage-edit" block_classes = [ @@ -77,7 +130,39 @@ def description(self) -> str: # Edit VAE encoder +#auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageEditVaeEncoderStep + + Vae encoder step that encode the image inputs into their latent representations. 
+ + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -92,7 +177,53 @@ def description(self) -> str: # Edit Inpaint VAE encoder +#auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintVaeEncoderStep + + This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: + - resize the image for target area (1024 * 1024) while maintaining the aspect ratio. + - process the resized image and mask image. + - create image latents. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + mask_image (`Image`): + Mask image for inpainting. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + processed_mask_image (`None`): + + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -134,7 +265,54 @@ def description(self): # assemble input steps +#auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): + """ + class QwenImageEditInputStep + + Input step that prepares the inputs for the edit denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. 
+ + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -151,7 +329,56 @@ def description(self): ) +#auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintInputStep + + Input step that prepares the inputs for the edit inpaint denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + """ model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -171,7 +398,49 @@ def description(self): # assemble prepare latents steps +#auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintPrepareLatentsStep + + This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: + - Add noise to the image latents to create the latents input for the denoiser. + - Create the patchified latents `mask` based on the processed mask image. + + Components: + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + + image_latents (`Tensor`): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + + height (`None`): + + width (`None`): + + dtype (`None`): + + Outputs: + + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + + mask (`Tensor`): + The mask to use for the inpainting process. 
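# [Editorial sketch] The `mask` and `initial_noise` outputs documented above are
# consumed later in the denoise loop. The generic latent-inpainting update used by
# diffusers flow-matching pipelines looks roughly like the snippet below; it is an
# illustration of the idea, not the literal QwenImage loop body.
import torch
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler()
scheduler.set_timesteps(num_inference_steps=4)

image_latents = torch.zeros(1, 16, 2)        # stand-in for packed reference latents
initial_noise = torch.randn_like(image_latents)
latents = torch.randn_like(image_latents)    # current denoising state
mask = torch.zeros_like(image_latents)
mask[:, :8] = 1.0                            # 1 = region being repainted

t = scheduler.timesteps[:1]                  # current timestep, shape (1,)
noised_ref = scheduler.scale_noise(image_latents, t, initial_noise)
latents = (1 - mask) * noised_ref + mask * latents  # keep the reference content where mask == 0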
+ """ model_name = "qwenimage-edit" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -186,7 +455,68 @@ def description(self) -> str: # Qwen Image Edit (image2image) core denoise step +#auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageEditCoreDenoiseStep + + Core denoising workflow for QwenImage-Edit edit (img2img) task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditInputStep(), @@ -209,9 +539,81 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Core denoising workflow for QwenImage-Edit edit (img2img) task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] + # Qwen Image Edit (inpainting) core denoise step +#auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintCoreDenoiseStep + + Core denoising workflow for QwenImage-Edit edit inpaint task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. 
+ + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditInpaintInputStep(), @@ -236,6 +638,12 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Core denoising workflow for QwenImage-Edit edit inpaint task." + @property + def outputs(self): + return [ + OutputParam.latents(), + ] + # Auto core denoise step for QwenImage Edit class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): @@ -263,7 +671,12 @@ def description(self): " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." ) - + + @property + def outputs(self): + return [ + OutputParam.latents(), + ] # ==================== # 4. DECODE @@ -271,7 +684,32 @@ def description(self): # Decode step (standard) +#auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageEditDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -282,7 +720,34 @@ def description(self): # Inpaint decode step +#auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageEditInpaintDecodeStep + + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index fbe5e60f353f..2fcd633f0d7f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -49,8 +49,64 @@ # ==================== +#auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): - """VL encoder that takes both image and text prompts. 
Uses 384x384 target area.""" + """ + class QwenImageEditPlusVLEncoderStep + + QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + Outputs: + + resized_cond_image (`List`): + The resized images + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage-edit-plus" block_classes = [ @@ -69,8 +125,40 @@ def description(self) -> str: # ==================== +#auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): - """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area.""" + """ + class QwenImageEditPlusVaeEncoderStep + + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-edit-plus" block_classes = [ @@ -94,7 +182,56 @@ def description(self) -> str: # assemble input steps +#auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): + """ + class QwenImageEditPlusInputStep + + Input step that prepares the inputs for the Edit Plus denoising step. It: + - Standardizes text embeddings batch size. + - Processes list of image latents: patchifies, concatenates along dim=1, expands batch. + - Outputs lists of image_height/image_width for RoPE calculation. + - Defaults height/width from last image in the list. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`List`): + The image heights calculated from the image latents dimension + + image_width (`List`): + The image widths calculated from the image latents dimension + """ model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), @@ -114,7 +251,67 @@ def description(self): # Qwen Image Edit Plus (image2image) core denoise step +#auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageEditPlusCoreDenoiseStep + + Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. + + Components: + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusInputStep(), @@ -149,7 +346,32 @@ def outputs(self): # ==================== +#auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): + """ + class QwenImageEditPlusDecodeStep + + Decode step that decodes the latents to images and postprocesses the generated image. + + Components: + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + Inputs: + + latents (`Tensor`): + The latents to decode, can be generated in the denoise step + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. 
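# [Editorial sketch] The `output_type` input documented above is handled by the
# postprocess sub-block. The underlying VaeImageProcessor convention is shown here
# on a dummy tensor, not via the actual QwenImage decoder:
import torch
from diffusers.image_processor import VaeImageProcessor

image_processor = VaeImageProcessor(vae_scale_factor=8)
decoded = torch.rand(1, 3, 64, 64) * 2 - 1                             # pretend VAE output in [-1, 1]
pil_images = image_processor.postprocess(decoded, output_type="pil")   # list of PIL.Image
np_images = image_processor.postprocess(decoded, output_type="np")     # numpy array in [0, 1]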
+ """ model_name = "qwenimage-edit-plus" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index e91a5c40b19b..f647f16868ab 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -49,9 +49,111 @@ # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): - """Text encoder that takes text prompt, will generate a prompt based on image if not provided.""" + """ + class QwenImageLayeredTextEncoderStep + + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + image_caption_prompt_en (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: + 1. Write the caption using natural, descriptive language without structured formats or rich text. + 2. Enrich caption details by including: + - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + 3. Maintain authenticity and accuracy: + - Avoid generalizations + - Describe all visible information in the image, while do not add information not explicitly shown in the image + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + image_caption_prompt_cn (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: + 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 + 2. 通过加入以下内容,丰富图注细节: + - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 + - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 + - 环境细节:例如天气、光照、颜色、纹理、气氛等 + - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 + 3. 保持真实性与准确性: + - 不要使用笼统的描述 + - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. 
+ + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + prompt (`str`, *optional*): + The prompt to encode + + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + + resized_image (`List`): + The resized images + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage-layered" block_classes = [ @@ -72,7 +174,42 @@ def description(self) -> str: # Edit VAE encoder +#auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageLayeredVaeEncoderStep + + Vae encoder step that encode the image inputs into their latent representations. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + + resized_image (`List`): + The resized images + + processed_image (`None`): + + image_latents (`Tensor`): + The latents representing the reference image(s). Single tensor or list depending on input. + """ model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredResizeStep(), @@ -93,7 +230,54 @@ def description(self) -> str: # assemble input steps +#auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): + """ + class QwenImageLayeredInputStep + + Input step that prepares the inputs for the layered denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + + pachifier (`QwenImageLayeredPachifier`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + image_latents (`None`, *optional*): + + Outputs: + + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + + image_height (`int`): + The image height calculated from the image latents dimension + + image_width (`int`): + The image width calculated from the image latents dimension + + height (`int`): + The height of the image output + + width (`int`): + The width of the image output + """ model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), @@ -111,7 +295,64 @@ def description(self): # Qwen Image Layered (image2image) core denoise step +#auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): + """ + class QwenImageLayeredCoreDenoiseStep + + Core denoising workflow for QwenImage-Layered img2img task. + + Components: + + pachifier (`QwenImageLayeredPachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Inputs: + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + image_latents (`None`, *optional*): + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredInputStep(), From 2a81f2ec5417efdc7773937dd7db2f675a46b66a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:15:36 +0100 Subject: [PATCH 07/58] style --- .../qwenimage/modular_blocks_qwenimage.py | 86 ++++++++++++------- .../modular_blocks_qwenimage_edit.py | 46 ++++++---- .../modular_blocks_qwenimage_edit_plus.py | 26 +++--- .../modular_blocks_qwenimage_layered.py | 47 +++++----- 4 files changed, 116 insertions(+), 89 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 19feffe77eda..d54dca5f5ad6 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -58,7 +58,8 @@ # 1. 
TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): """ class QwenImageAutoTextEncoderStep @@ -76,11 +77,8 @@ class QwenImageAutoTextEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -111,6 +109,7 @@ class QwenImageAutoTextEncoderStep negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ + model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] block_names = ["text_encoder"] @@ -127,7 +126,8 @@ def description(self) -> str: # 2. VAE ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageInpaintVaeEncoderStep @@ -175,6 +175,7 @@ class QwenImageInpaintVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @@ -189,7 +190,7 @@ def description(self) -> str: ) -#auto_docstring +# auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgVaeEncoderStep @@ -223,6 +224,7 @@ class QwenImageImg2ImgVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage" block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -250,13 +252,12 @@ def description(self): # optional controlnet vae encoder -#auto_docstring +# auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. - This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -287,6 +288,7 @@ class QwenImageOptionalControlNetVaeEncoderStep control_image_latents (`Tensor`): The latents representing the control image """ + block_classes = [QwenImageControlNetVaeEncoderStep] block_names = ["controlnet"] block_trigger_inputs = ["control_image"] @@ -307,7 +309,7 @@ def description(self): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgInputStep @@ -353,6 +355,7 @@ class QwenImageImg2ImgInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage" block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] block_names = ["text_inputs", "additional_inputs"] @@ -364,7 +367,7 @@ def description(self): " - update height/width based `image_latents`, patchify `image_latents`." 
-#auto_docstring +# auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): """ class QwenImageInpaintInputStep @@ -412,6 +415,7 @@ class QwenImageInpaintInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -429,7 +433,7 @@ def description(self): # assemble prepare latents steps -#auto_docstring +# auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ class QwenImageInpaintPrepareLatentsStep @@ -450,7 +454,8 @@ class QwenImageInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -472,6 +477,7 @@ class QwenImageInpaintPrepareLatentsStep mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -489,12 +495,13 @@ def description(self) -> str: # Qwen Image (text2image) -#auto_docstring +# auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -570,20 +577,22 @@ class QwenImageCoreDenoiseStep @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Qwen Image (inpainting) -#auto_docstring +# auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -675,13 +684,15 @@ def outputs(self): OutputParam.latents(), ] + # Qwen Image (image2image) -#auto_docstring +# auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -771,13 +782,15 @@ def outputs(self): OutputParam.latents(), ] + # Qwen Image (text2image) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. 
It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -871,20 +884,22 @@ class QwenImageControlNetCoreDenoiseStep @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Qwen Image (inpainting) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -996,12 +1011,13 @@ def outputs(self): # Qwen Image (image2image) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -1102,13 +1118,14 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep @property def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Auto denoise step for QwenImage class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): block_classes = [ @@ -1176,7 +1193,7 @@ def outputs(self): # standard decode step works for most tasks except for inpaint -#auto_docstring +# auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): """ class QwenImageDecodeStep @@ -1202,6 +1219,7 @@ class QwenImageDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -1212,12 +1230,13 @@ def description(self): # Inpaint decode step -#auto_docstring +# auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1240,6 +1259,7 @@ class QwenImageInpaintDecodeStep images (`List`): Generated images. 
""" + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index cae6236eb5aa..37a438ea1f54 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -55,7 +55,8 @@ # 1. TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditVLEncoderStep @@ -75,11 +76,10 @@ class QwenImageEditVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -130,7 +130,7 @@ def description(self) -> str: # Edit VAE encoder -#auto_docstring +# auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditVaeEncoderStep @@ -163,6 +163,7 @@ class QwenImageEditVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -177,7 +178,7 @@ def description(self) -> str: # Edit Inpaint VAE encoder -#auto_docstring +# auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintVaeEncoderStep @@ -224,6 +225,7 @@ class QwenImageEditInpaintVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. 
""" + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -265,7 +267,7 @@ def description(self): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): """ class QwenImageEditInputStep @@ -313,6 +315,7 @@ class QwenImageEditInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -329,7 +332,7 @@ def description(self): ) -#auto_docstring +# auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintInputStep @@ -379,6 +382,7 @@ class QwenImageEditInpaintInputStep image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -398,7 +402,7 @@ def description(self): # assemble prepare latents steps -#auto_docstring +# auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintPrepareLatentsStep @@ -419,7 +423,8 @@ class QwenImageEditInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -441,6 +446,7 @@ class QwenImageEditInpaintPrepareLatentsStep mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage-edit" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -455,7 +461,7 @@ def description(self) -> str: # Qwen Image Edit (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditCoreDenoiseStep @@ -547,7 +553,7 @@ def outputs(self): # Qwen Image Edit (inpainting) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintCoreDenoiseStep @@ -671,20 +677,21 @@ def description(self): " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." ) - + @property def outputs(self): return [ OutputParam.latents(), ] + # ==================== # 4. DECODE # ==================== # Decode step (standard) -#auto_docstring +# auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditDecodeStep @@ -710,6 +717,7 @@ class QwenImageEditDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -720,12 +728,13 @@ def description(self): # Inpaint decode step -#auto_docstring +# auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. 
+ Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -748,6 +757,7 @@ class QwenImageEditInpaintDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 2fcd633f0d7f..851b69f232e7 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -49,7 +49,7 @@ # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVLEncoderStep @@ -69,11 +69,10 @@ class QwenImageEditPlusVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -125,13 +124,13 @@ def description(self) -> str: # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. - Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based + on its own aspect ratio to 1024x1024 target area. Components: @@ -182,7 +181,7 @@ def description(self) -> str: # assemble input steps -#auto_docstring +# auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): """ class QwenImageEditPlusInputStep @@ -232,6 +231,7 @@ class QwenImageEditPlusInputStep image_width (`List`): The image widths calculated from the image latents dimension """ + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), @@ -251,7 +251,7 @@ def description(self): # Qwen Image Edit Plus (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditPlusCoreDenoiseStep @@ -312,6 +312,7 @@ class QwenImageEditPlusCoreDenoiseStep latents (`Tensor`): Denoised latents. 
""" + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusInputStep(), @@ -346,7 +347,7 @@ def outputs(self): # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditPlusDecodeStep @@ -372,6 +373,7 @@ class QwenImageEditPlusDecodeStep images (`List`): Generated images. """ + model_name = "qwenimage-edit-plus" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index f647f16868ab..56fa1345a5ce 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -49,12 +49,14 @@ # 1. TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -71,28 +73,23 @@ class QwenImageLayeredTextEncoderStep Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 
通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -102,16 +99,11 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -174,7 +166,7 @@ def description(self) -> str: # Edit VAE encoder -#auto_docstring +# auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredVaeEncoderStep @@ -210,6 +202,7 @@ class QwenImageLayeredVaeEncoderStep image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredResizeStep(), @@ -230,7 +223,7 @@ def description(self) -> str: # assemble input steps -#auto_docstring +# auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): """ class QwenImageLayeredInputStep @@ -278,6 +271,7 @@ class QwenImageLayeredInputStep width (`int`): The width of the image output """ + model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), @@ -295,7 +289,7 @@ def description(self): # Qwen Image Layered (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageLayeredCoreDenoiseStep @@ -353,6 +347,7 @@ class QwenImageLayeredCoreDenoiseStep latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredInputStep(), From f0555af1c6be0adb75404f2724a071d8b49b5506 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:15:53 +0100 Subject: [PATCH 08/58] up up up --- utils/modular_auto_docstring.py | 90 +++++++++++++++------------------ 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index c6aaf8a46a56..e2d523b2f378 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -36,7 +36,7 @@ # auto_docstring class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): # docstring will be automatically inserted here - + @property def doc(self): return "Your docstring content..." 
@@ -69,13 +69,13 @@ def setup_diffusers_import(): def get_module_from_filepath(filepath: str) -> str: """Convert a filepath to a module name.""" filepath = os.path.normpath(filepath) - + if filepath.startswith("src" + os.sep): filepath = filepath[4:] - + if filepath.endswith(".py"): filepath = filepath[:-3] - + module_name = filepath.replace(os.sep, ".") return module_name @@ -84,7 +84,7 @@ def load_module(filepath: str): """Load a module from filepath.""" setup_diffusers_import() module_name = get_module_from_filepath(filepath) - + try: module = importlib.import_module(module_name) return module @@ -97,30 +97,30 @@ def get_doc_from_class(module, class_name: str) -> str: """Get the doc property from an instantiated class.""" if module is None: return None - + cls = getattr(module, class_name, None) if cls is None: return None - + try: instance = cls() if hasattr(instance, "doc"): return instance.doc except Exception as e: print(f"Warning: Could not instantiate {class_name}: {e}") - + return None def find_auto_docstring_classes(filepath: str) -> list: """ Find all classes in a file that have # auto_docstring comment above them. - + Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) """ with open(filepath, "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() - + # Parse AST to find class locations and their docstrings content = "".join(lines) try: @@ -128,25 +128,25 @@ def find_auto_docstring_classes(filepath: str) -> list: except SyntaxError as e: print(f"Syntax error in {filepath}: {e}") return [] - + # Build a map of class_name -> (class_line, has_docstring, docstring_end_line) class_info = {} for node in ast.walk(tree): if isinstance(node, ast.ClassDef): has_docstring = False docstring_end_line = node.lineno # default to class line - + if node.body and isinstance(node.body[0], ast.Expr): first_stmt = node.body[0] if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str): has_docstring = True docstring_end_line = first_stmt.end_lineno or first_stmt.lineno - + class_info[node.name] = (node.lineno, has_docstring, docstring_end_line) - + # Now scan for # auto_docstring comments classes_to_update = [] - + for i, line in enumerate(lines): if AUTO_DOCSTRING_PATTERN.match(line): # Found the marker, look for class definition on next non-empty, non-comment line @@ -156,7 +156,7 @@ def find_auto_docstring_classes(filepath: str) -> list: if next_line and not next_line.startswith("#"): break j += 1 - + if j < len(lines) and lines[j].strip().startswith("class "): # Extract class name match = re.match(r"class\s+(\w+)", lines[j].strip()) @@ -164,20 +164,15 @@ def find_auto_docstring_classes(filepath: str) -> list: class_name = match.group(1) if class_name in class_info: class_line, has_docstring, docstring_end_line = class_info[class_name] - classes_to_update.append(( - class_name, - class_line, - has_docstring, - docstring_end_line - )) - + classes_to_update.append((class_name, class_line, has_docstring, docstring_end_line)) + return classes_to_update def format_docstring(doc: str, indent: str = " ") -> str: """Format a doc string as a properly indented docstring.""" lines = doc.strip().split("\n") - + if len(lines) == 1: return f'{indent}"""{lines[0]}"""\n' else: @@ -194,36 +189,36 @@ def format_docstring(doc: str, indent: str = " ") -> str: def process_file(filepath: str, overwrite: bool = False) -> list: """ Process a file and find/insert docstrings for # auto_docstring marked classes. 
- + Returns list of classes that need updating. """ classes_to_update = find_auto_docstring_classes(filepath) - + if not classes_to_update: return [] - + if not overwrite: # Just return the list of classes that need updating return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] - + # Load the module to get doc properties module = load_module(filepath) - + with open(filepath, "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() - + # Process in reverse order to maintain line numbers updated = False for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update): doc = get_doc_from_class(module, class_name) - + if doc is None: print(f"Warning: Could not get doc for {class_name} in {filepath}") continue - + # Format the new docstring with 4-space indent new_docstring = format_docstring(doc, " ") - + if has_docstring: # Replace existing docstring (line after class definition to docstring_end_line) # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line @@ -233,14 +228,14 @@ def process_file(filepath: str, overwrite: bool = False) -> list: # class_line is 1-indexed, so lines[class_line-1] is the class line # Insert at position class_line (which is right after the class line) lines = lines[:class_line] + [new_docstring] + lines[class_line:] - + updated = True print(f"Updated docstring for {class_name} in {filepath}") - + if updated: with open(filepath, "w", encoding="utf-8", newline="\n") as f: f.writelines(lines) - + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] @@ -250,25 +245,25 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): """ if path is None: path = DIFFUSERS_PATH - + if os.path.isfile(path): all_files = [path] else: all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True) - + all_markers = [] - + for filepath in all_files: markers = process_file(filepath, overwrite) all_markers.extend(markers) - + if not overwrite and len(all_markers) > 0: message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers]) raise ValueError( f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n" f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them." 
) - + if overwrite and len(all_markers) > 0: print(f"\nUpdated {len(all_markers)} docstring(s).") elif len(all_markers) == 0: @@ -279,18 +274,13 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): parser = argparse.ArgumentParser( description="Check and fix # auto_docstring markers in modular pipeline blocks", ) - parser.add_argument( - "path", - nargs="?", - default=None, - help="File or directory to process (default: src/diffusers)" - ) + parser.add_argument("path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)") parser.add_argument( "--fix_and_overwrite", action="store_true", help="Whether to fix the docstrings by inserting them from doc property.", ) - + args = parser.parse_args() - - check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file + + check_auto_docstrings(args.path, args.fix_and_overwrite) From 507953f4156349d4d96cc6a8e0e7aa8eeefcf47e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:19:14 +0100 Subject: [PATCH 09/58] more more --- .../qwenimage/modular_blocks_qwenimage.py | 168 +++++++++++++++--- .../modular_blocks_qwenimage_edit.py | 118 +++++++++++- .../modular_blocks_qwenimage_edit_plus.py | 102 ++++++++++- .../modular_blocks_qwenimage_layered.py | 165 +++++++++++++++-- 4 files changed, 503 insertions(+), 50 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index d54dca5f5ad6..7f18de4f99dd 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -77,8 +77,11 @@ class QwenImageAutoTextEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -257,7 +260,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -454,8 +458,7 @@ class QwenImageInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -500,8 +503,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. 
It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -591,8 +593,7 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -691,8 +692,7 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -789,8 +789,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -898,8 +897,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -1016,8 +1014,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -1235,8 +1232,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: @@ -1298,8 +1294,140 @@ def description(self): ] ) - +# auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageAutoBlocks + + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. 
+ - for image-to-image generation, you need to provide `image` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - to run the controlnet workflow, you need to provide `control_image` + - for text-to-image generation, all you need to provide is `prompt` + + Components: + + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + controlnet (`QwenImageControlNetModel`) [subfolder=] + + control_image_processor (`VaeImageProcessor`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + mask_image (`Image`, *optional*): + Mask image for inpainting. + + image (`Image`, *optional*): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + control_image (`Image`, *optional*): + Control image for ControlNet conditioning. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + prompt_embeds (`None`): + + prompt_embeds_mask (`None`): + + negative_prompt_embeds (`None`, *optional*): + + negative_prompt_embeds_mask (`None`, *optional*): + + latents (`Tensor`): + Pre-generated noisy latents for image generation. + + num_inference_steps (`int`): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + image_latents (`None`, *optional*): + + processed_mask_image (`None`, *optional*): + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + control_image_latents (`None`, *optional*): + + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. 
+ + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage" block_classes = AUTO_BLOCKS.values() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 37a438ea1f54..91efe9dda2bf 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -76,10 +76,11 @@ class QwenImageEditVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) @@ -423,8 +424,7 @@ class QwenImageEditInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -733,8 +733,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: @@ -802,8 +801,109 @@ def outputs(self): ] ) - +# auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageEditAutoBlocks + + Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
+ - for edit (img2img) generation, you need to provide `image` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + mask_image (`Image`, *optional*): + Mask image for inpainting. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + height (`int`): + The height in pixels of the generated image. + + width (`int`): + The width in pixels of the generated image. + + image_latents (`None`): + + processed_mask_image (`None`, *optional*): + + latents (`Tensor`): + Pre-generated noisy latents for image generation. + + num_inference_steps (`int`): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + mask_overlay_kwargs (`None`, *optional*): + + Outputs: + + images (`List`): + Generated images. 
+ """ model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 851b69f232e7..3a780daf9602 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -69,10 +69,11 @@ class QwenImageEditPlusVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -129,8 +130,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based - on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. Components: @@ -396,8 +397,95 @@ def description(self): ] ) - +# auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageEditPlusAutoBlocks + + Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. + - `image` is required input (can be single image or list of images). + - Each image is resized independently based on its own aspect ratio. + - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + pachifier (`QwenImagePachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) + + prompt_template_encode_start_idx (default: 64) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + prompt (`str`): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. + """ model_name = "qwenimage-edit-plus" block_classes = EDIT_PLUS_AUTO_BLOCKS.values() block_names = EDIT_PLUS_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 56fa1345a5ce..7cb5cd7a1ca3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -55,8 +55,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: @@ -73,23 +72,28 @@ class QwenImageLayeredTextEncoderStep Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -99,11 +103,16 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -390,8 +399,136 @@ def outputs(self): ] ) - +# auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): + """ + class QwenImageLayeredAutoBlocks + + Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. + + Components: + + image_resize_processor (`VaeImageProcessor`) [subfolder=] + + text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + + processor (`Qwen2VLProcessor`) [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + image_processor (`VaeImageProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + pachifier (`QwenImageLayeredPachifier`) [subfolder=] + + scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + + transformer (`QwenImageTransformer2DModel`) [subfolder=] + + Configs: + + image_caption_prompt_en (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: + 1. 
Write the caption using natural, descriptive language without structured formats or rich text. + 2. Enrich caption details by including: + - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + 3. Maintain authenticity and accuracy: + - Avoid generalizations + - Describe all visible information in the image, while do not add information not explicitly shown in the image + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + image_caption_prompt_cn (default: <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: + 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 + 2. 通过加入以下内容,丰富图注细节: + - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 + - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 + - 环境细节:例如天气、光照、颜色、纹理、气氛等 + - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 + 3. 保持真实性与准确性: + - 不要使用笼统的描述 + - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + image (`Image`): + Input image for img2img, editing, or conditioning. + + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + prompt (`str`, *optional*): + The prompt to encode + + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + + **denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt''. + + Outputs: + + images (`List`): + Generated images. 
+ """ model_name = "qwenimage-layered" block_classes = LAYERED_AUTO_BLOCKS.values() block_names = LAYERED_AUTO_BLOCKS.keys() From 1c90ce33f2445b29c1967976a1734db97f5eaa3a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:21:26 +0100 Subject: [PATCH 10/58] up --- .../qwenimage/modular_blocks_qwenimage.py | 47 +++++++------ .../modular_blocks_qwenimage_edit.py | 29 ++++---- .../modular_blocks_qwenimage_edit_plus.py | 24 +++---- .../modular_blocks_qwenimage_layered.py | 69 +++++++------------ 4 files changed, 79 insertions(+), 90 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 7f18de4f99dd..85b77c2a6b93 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -77,11 +77,8 @@ class QwenImageAutoTextEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -260,8 +257,7 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. - This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -458,7 +454,8 @@ class QwenImageInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -503,7 +500,8 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -593,7 +591,8 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -692,7 +691,8 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) 
for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -789,7 +789,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -897,7 +898,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -1014,7 +1016,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -1232,7 +1235,8 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1294,6 +1298,7 @@ def description(self): ] ) + # auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): """ @@ -1301,7 +1306,7 @@ class QwenImageAutoBlocks Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` - to run the controlnet workflow, you need to provide `control_image` - for text-to-image generation, all you need to provide is `prompt` @@ -1332,11 +1337,8 @@ class QwenImageAutoBlocks Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -1428,6 +1430,7 @@ class QwenImageAutoBlocks images (`List`): Generated images. 
""" + model_name = "qwenimage" block_classes = AUTO_BLOCKS.values() @@ -1438,7 +1441,7 @@ def description(self): return ( "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n" + "- to run the controlnet workflow, you need to provide `control_image`\n" + "- for text-to-image generation, all you need to provide is `prompt`" ) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 91efe9dda2bf..3fcbc8853f48 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -76,11 +76,10 @@ class QwenImageEditVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -424,7 +423,8 @@ class QwenImageEditInpaintPrepareLatentsStep The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -733,7 +733,8 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -801,6 +802,7 @@ def outputs(self): ] ) + # auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ @@ -808,7 +810,8 @@ class QwenImageEditAutoBlocks Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: @@ -835,11 +838,10 @@ class QwenImageEditAutoBlocks Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -904,6 +906,7 @@ class QwenImageEditAutoBlocks images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 3a780daf9602..0364e394d29d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -69,11 +69,10 @@ class QwenImageEditPlusVLEncoderStep Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -130,8 +129,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. - Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based + on its own aspect ratio to 1024x1024 target area. 
Components: @@ -397,6 +396,7 @@ def description(self): ] ) + # auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): """ @@ -430,11 +430,10 @@ class QwenImageEditPlusAutoBlocks Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -486,6 +485,7 @@ class QwenImageEditPlusAutoBlocks images (`List`): Generated images. """ + model_name = "qwenimage-edit-plus" block_classes = EDIT_PLUS_AUTO_BLOCKS.values() block_names = EDIT_PLUS_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7cb5cd7a1ca3..5602fc9b93e5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -55,7 +55,8 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -72,28 +73,23 @@ class QwenImageLayeredTextEncoderStep Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -103,16 +99,11 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -399,6 +390,7 @@ def outputs(self): ] ) + # auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): """ @@ -431,28 +423,23 @@ class QwenImageLayeredAutoBlocks Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -462,16 +449,11 @@ class QwenImageLayeredAutoBlocks 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -529,6 +511,7 @@ class QwenImageLayeredAutoBlocks images (`List`): Generated images. 
""" + model_name = "qwenimage-layered" block_classes = LAYERED_AUTO_BLOCKS.values() block_names = LAYERED_AUTO_BLOCKS.keys() From aea0d046f6eb759dca55a11bd9c55f89db39b3e4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 17 Jan 2026 09:36:58 +0100 Subject: [PATCH 11/58] address feedbacks --- .../modular_pipeline_utils.py | 4 +- .../qwenimage/modular_blocks_qwenimage.py | 408 ++++-------------- .../modular_blocks_qwenimage_edit.py | 256 +++-------- .../modular_blocks_qwenimage_edit_plus.py | 147 ++----- .../modular_blocks_qwenimage_layered.py | 190 +++----- utils/modular_auto_docstring.py | 16 +- 6 files changed, 271 insertions(+), 750 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index fab7c7193e5d..368fbbcbd138 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -711,7 +711,7 @@ def wrap_text(text, indent, max_length): formatted_params.append(param_str) - return "\n\n".join(formatted_params) + return "\n".join(formatted_params) def format_input_params(input_params, indent_level=4, max_line_length=115): @@ -781,7 +781,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty loading_field_values = [] for field_name in component.loading_fields(): field_value = getattr(component, field_name) - if field_value is not None: + if field_value: loading_field_values.append(f"{field_name}={field_value}") # Add loading field information if available diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 85b77c2a6b93..3bd4ae56832a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -62,50 +62,44 @@ # auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): """ - class QwenImageAutoTextEncoderStep - - Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. 
Outputs: - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -130,48 +124,36 @@ def description(self) -> str: # auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for inpainting tasks. It: + This step is used for processing image and mask inputs for inpainting tasks. It: - Resizes the image to the target size, based on `height` and `width`. - Processes and updates `image` and `mask_image`. - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - mask_image (`Image`): Mask image for inpainting. - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -193,34 +175,26 @@ def description(self) -> str: # auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgVaeEncoderStep - - Vae encoder step that preprocess andencode the image inputs into their latent representations. + Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -255,36 +229,30 @@ def description(self): # auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ - class QwenImageOptionalControlNetVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. 
Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) Inputs: - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - control_image_latents (`Tensor`): The latents representing the control image """ @@ -312,46 +280,32 @@ def description(self): # auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgInputStep - - Input step that prepares the inputs for the img2img denoising step. It: + Input step that prepares the inputs for the img2img denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -370,48 +324,33 @@ def description(self): # auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageInpaintInputStep - - Input step that prepares the inputs for the inpainting denoising step. It: + Input step that prepares the inputs for the inpainting denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -436,44 +375,32 @@ def description(self): # auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. 
It: + This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. """ @@ -498,60 +425,43 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -589,67 +499,47 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. 
Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -689,65 +579,46 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. 
""" @@ -787,74 +658,53 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - control_image_latents (`None`): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -896,81 +746,57 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
- prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1014,79 +840,56 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. 
- controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1196,26 +999,21 @@ def outputs(self): # auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): """ - class QwenImageDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -1233,29 +1031,22 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ @@ -1302,131 +1093,102 @@ def description(self): # auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageAutoBlocks - - Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`. 
- to run the controlnet workflow, you need to provide `control_image` - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Image`, *optional*): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. 
- control_image_latents (`None`, *optional*): - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 3fcbc8853f48..627cfce6ee7b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -59,55 +59,46 @@ # auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVLEncoderStep - - QwenImage-Edit VL encoder step that encode the image and text prompts together. + QwenImage-Edit VL encoder step that encode the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -133,33 +124,26 @@ def description(self) -> str: # auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. + Vae encoder step that encode the image inputs into their latent representations. 
Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -181,47 +165,36 @@ def description(self) -> str: # auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: + This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: - resize the image for target area (1024 * 1024) while maintaining the aspect ratio. - process the resized image and mask image. - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - mask_image (`Image`): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -270,48 +243,34 @@ def description(self): # auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInputStep - - Input step that prepares the inputs for the edit denoising step. It: + Input step that prepares the inputs for the edit denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. 
- image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -335,50 +294,35 @@ def description(self): # auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintInputStep - - Input step that prepares the inputs for the edit inpaint denoising step. It: + Input step that prepares the inputs for the edit inpaint denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -405,44 +349,32 @@ def description(self): # auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: + This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the patchified latents `mask` based on the processed mask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. 
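The "add noise to the image latents" part of the step above follows the flow-matching interpolation implemented by the FlowMatchEulerDiscreteScheduler named in the Components list; a hand-written sketch of that interpolation is given below. The real block delegates to the scheduler rather than computing this inline, and the tensor shapes here are placeholders, not values taken from this diff.

    import torch

    def noise_image_latents(image_latents: torch.Tensor, noise: torch.Tensor, sigma: float) -> torch.Tensor:
        # flow-matching interpolation between the clean image latents and pure noise;
        # sigma corresponds to the timestep selected via the `strength` input
        return (1.0 - sigma) * image_latents + sigma * noise

    # placeholder shapes, for illustration only
    image_latents = torch.randn(1, 4096, 64)
    noise = torch.randn(1, 4096, 64)
    latents = noise_image_latents(image_latents, noise, sigma=0.9)  # strength defaults to 0.9 in these blocks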
""" @@ -464,61 +396,44 @@ def description(self) -> str: # auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit (img2img) task. + Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -556,66 +471,47 @@ def outputs(self): # auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit inpaint task. + Core denoising workflow for QwenImage-Edit edit inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. 
- **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -694,26 +590,21 @@ def outputs(self): # auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -731,29 +622,22 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ @@ -806,103 +690,81 @@ def outputs(self): # auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditAutoBlocks - - Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. + Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide - `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - mask_image (`Image`, *optional*): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`): The height in pixels of the generated image. - width (`int`): The width in pixels of the generated image. - image_latents (`None`): - processed_mask_image (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
- output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 0364e394d29d..cc07fc1e6a75 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -52,57 +52,48 @@ # auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVLEncoderStep - - QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. + QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_cond_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -127,34 +118,27 @@ def description(self) -> str: # auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVaeEncoderStep - - VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based - on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
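For reference, resizing to a 1024x1024 "target area" while keeping the aspect ratio can be sketched as below. This is illustrative only: the actual computation lives in the image_resize_processor used by these blocks, and the rounding to a multiple of 32 is an assumption, not taken from this diff.

    import math

    def resize_to_target_area(width: int, height: int, target_area: int = 1024 * 1024, multiple: int = 32):
        # scale both sides by the same factor so the aspect ratio is preserved
        scale = math.sqrt(target_area / (width * height))
        # snap to a friendly multiple (assumed here)
        new_width = max(multiple, int(width * scale) // multiple * multiple)
        new_height = max(multiple, int(height * scale) // multiple * multiple)
        return new_width, new_height

    print(resize_to_target_area(1920, 1080))  # -> (1344, 768): same aspect ratio, roughly one megapixel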
Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -184,9 +168,7 @@ def description(self) -> str: # auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusInputStep - - Input step that prepares the inputs for the Edit Plus denoising step. It: + Input step that prepares the inputs for the Edit Plus denoising step. It: - Standardizes text embeddings batch size. - Processes list of image latents: patchifies, concatenates along dim=1, expands batch. - Outputs lists of image_height/image_width for RoPE calculation. @@ -194,40 +176,28 @@ class QwenImageEditPlusInputStep Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`List`): The image heights calculated from the image latents dimension - image_width (`List`): The image widths calculated from the image latents dimension """ @@ -254,61 +224,44 @@ def description(self): # auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. + Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. 
- num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -350,26 +303,21 @@ def outputs(self): # auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusDecodeStep - - Decode step that decodes the latents to images and postprocesses the generated image. + Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -400,88 +348,73 @@ def description(self): # auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditPlusAutoBlocks - - Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. + Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. - `image` is required input (can be single image or list of images). - Each image is resized independently based on its own aspect ratio. - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 5602fc9b93e5..7cbc174871b5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,43 +53,45 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredTextEncoderStep - - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -99,50 +101,44 @@ class QwenImageLayeredTextEncoderStep 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -169,36 +165,28 @@ def description(self) -> str: # auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. 
+ Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -226,48 +214,34 @@ def description(self) -> str: # auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): """ - class QwenImageLayeredInputStep - - Input step that prepares the inputs for the layered denoising step. It: + Input step that prepares the inputs for the layered denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension - height (`int`): The height of the image output - width (`int`): The width of the image output """ @@ -292,58 +266,42 @@ def description(self): # auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageLayeredCoreDenoiseStep - - Core denoising workflow for QwenImage-Layered img2img task. + Core denoising workflow for QwenImage-Layered img2img task. Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - generator (`Generator`, *optional*): Torch generator for deterministic generation. 
- num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -394,52 +352,55 @@ def outputs(self): # auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageLayeredAutoBlocks - - Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. + Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. 
Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -449,65 +410,54 @@ class QwenImageLayeredAutoBlocks 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. 
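A minimal usage sketch for the layered auto blocks documented above. The input names (image, prompt, layers, resolution) come from this docstring; the loading calls, the output selection, and the repository id are assumptions about the modular-pipeline API rather than facts from this diff.

    import torch
    from PIL import Image
    from diffusers.modular_pipelines.qwenimage.modular_blocks_qwenimage_layered import QwenImageLayeredAutoBlocks

    blocks = QwenImageLayeredAutoBlocks()
    pipe = blocks.init_pipeline("Qwen/Qwen-Image-Layered")  # method name and repo id are assumptions
    pipe.load_components(torch_dtype=torch.bfloat16)        # assumed loading helper

    images = pipe(
        image=Image.open("input.png"),   # required input
        layers=4,                        # number of layers to extract (default 4)
        resolution=640,                  # target area, 640 or 1024
        num_inference_steps=50,
        output="images",                 # selecting the output by name is an assumption
    )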
""" diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index e2d523b2f378..01d984a58430 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -169,6 +169,17 @@ def find_auto_docstring_classes(filepath: str) -> list: return classes_to_update +def strip_class_name_line(doc: str, class_name: str) -> str: + """Remove the 'class ClassName' line from the doc if present.""" + lines = doc.strip().split("\n") + if lines and lines[0].strip() == f"class {class_name}": + # Remove the class line and any blank line following it + lines = lines[1:] + while lines and not lines[0].strip(): + lines = lines[1:] + return "\n".join(lines) + + def format_docstring(doc: str, indent: str = " ") -> str: """Format a doc string as a properly indented docstring.""" lines = doc.strip().split("\n") @@ -216,6 +227,9 @@ def process_file(filepath: str, overwrite: bool = False) -> list: print(f"Warning: Could not get doc for {class_name} in {filepath}") continue + # Remove the "class ClassName" line since it's redundant in a docstring + doc = strip_class_name_line(doc, class_name) + # Format the new docstring with 4-space indent new_docstring = format_docstring(doc, " ") @@ -283,4 +297,4 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): args = parser.parse_args() - check_auto_docstrings(args.path, args.fix_and_overwrite) + check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file From 25c968a38f991b020d12604eedb4efda1d016dee Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 17 Jan 2026 09:57:56 +0100 Subject: [PATCH 12/58] add TODO in the description for empty docstring --- .../modular_pipeline_utils.py | 2 + .../modular_pipelines/qwenimage/encoders.py | 3 +- .../qwenimage/modular_blocks_qwenimage.py | 97 +++++++++++++++---- .../modular_blocks_qwenimage_edit.py | 59 ++++++++--- .../modular_blocks_qwenimage_edit_plus.py | 29 ++++-- .../modular_blocks_qwenimage_layered.py | 78 +++++++-------- utils/modular_auto_docstring.py | 2 +- 7 files changed, 184 insertions(+), 86 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 368fbbcbd138..45556c538ab8 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -708,6 +708,8 @@ def wrap_text(text, indent, max_length): desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description) wrapped_desc = wrap_text(desc, desc_indent, max_line_length) param_str += f"\n{desc_indent}{wrapped_desc}" + else: + param_str += f"\n{desc_indent}TODO: Add description." 
formatted_params.append(param_str) diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index f0dd6471b168..8d7b1905423d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -1324,7 +1324,8 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.template(self._image_input_name) + or InputParam(name=self._image_input_name, required=True, description="The image tensor to encode"), InputParam.generator(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 3bd4ae56832a..645c01f66ee5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -75,11 +75,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -151,7 +148,9 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Outputs: processed_image (`None`): + TODO: Add description. processed_mask_image (`None`): + TODO: Add description. mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): @@ -195,6 +194,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Outputs: processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -290,14 +290,19 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -334,15 +339,21 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. 
image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -389,14 +400,18 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. height (`None`): + TODO: Add description. width (`None`): + TODO: Add description. dtype (`None`): + TODO: Add description. Outputs: initial_noise (`Tensor`): @@ -425,7 +440,8 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: @@ -441,9 +457,13 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. height (`int`, *optional*): @@ -499,7 +519,8 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: @@ -515,15 +536,21 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -579,7 +606,8 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. 
Components: @@ -595,14 +623,19 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -658,7 +691,8 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: @@ -676,10 +710,15 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -746,7 +785,8 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: @@ -764,16 +804,23 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -840,7 +887,8 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. 
Components: @@ -858,15 +906,21 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -1031,7 +1085,8 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1045,6 +1100,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): @@ -1126,11 +1182,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -1160,9 +1213,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -1174,10 +1231,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. control_image_latents (`None`, *optional*): + TODO: Add description. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): @@ -1187,6 +1247,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. 
Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 627cfce6ee7b..0bfbb921c9c4 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -74,11 +74,10 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -144,6 +143,7 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -192,7 +192,9 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. processed_mask_image (`None`): + TODO: Add description. mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): @@ -255,14 +257,19 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -306,15 +313,21 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. 
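The Outputs sections below are assembled the same way from OutputParam specs, and the decoders.py changes later in this series use the template's note argument to qualify a shared description per block instead of rewriting it. A hedged sketch of the intended result, assuming the OUTPUT_PARAM_TEMPLATES entry for latents introduced in this patch:

from diffusers.modular_pipelines.modular_pipeline_utils import OutputParam

# As used in the decoders.py hunks below; the base template describes the value
# ("Denoised latents.") and the note records how this particular block shapes it.
spec = OutputParam.template("latents", note="unpacked to B, C, 1, H, W")
print(spec.description)
# Denoised latents. (unpacked to B, C, 1, H, W)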
Outputs: batch_size (`int`): @@ -363,14 +376,18 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. height (`None`): + TODO: Add description. width (`None`): + TODO: Add description. dtype (`None`): + TODO: Add description. Outputs: initial_noise (`Tensor`): @@ -412,14 +429,19 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -487,15 +509,21 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -622,7 +650,8 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -636,6 +665,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): @@ -692,7 +722,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
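Which of the two paths runs is decided by the image inputs you pass, as the bullets below spell out. A hypothetical preparation sketch for the edit-inpaint case (file names and values are placeholders; the pipeline invocation itself is omitted because its call signature is not shown in this patch):

from PIL import Image

image = Image.open("reference.png").convert("RGB")   # image to edit
mask_image = Image.open("mask.png").convert("L")     # white pixels mark the region to repaint

inputs = {
    "prompt": "replace the sky with a sunset",
    "image": image,
    "mask_image": mask_image,          # providing a mask selects the edit-inpaint path
    "padding_mask_crop": 32,           # optional: crop margin around the masked region
}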
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: @@ -719,11 +750,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -747,7 +777,9 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): width (`int`): The width in pixels of the generated image. image_latents (`None`): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -763,6 +795,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index cc07fc1e6a75..8dab6fbcf95d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -67,11 +67,10 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -139,6 +138,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. 
image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -182,14 +182,19 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -240,14 +245,19 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -376,11 +386,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7cbc174871b5..544b1abfc3ed 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,7 +53,8 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -70,28 +71,23 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. 
Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -101,16 +97,11 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -187,6 +178,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -226,10 +218,15 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. image_latents (`None`, *optional*): + TODO: Add description. 
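One detail worth keeping in mind for the batch_size output listed below: it counts prompts, and the effective model batch is batch_size * num_images_per_prompt, which is also how the batch_size template added later in modular_pipeline_utils.py documents it. A trivial illustration with made-up numbers:

batch_size = 2               # two prompts
num_images_per_prompt = 4    # four samples per prompt
model_batch = batch_size * num_images_per_prompt
print(model_batch)           # 8 model inputs per denoising step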
Outputs: batch_size (`int`): @@ -282,10 +279,15 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. layers (`int`, *optional*, defaults to 4): @@ -379,28 +381,23 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -410,16 +407,11 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 
保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index 01d984a58430..7bb2c87e81da 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -297,4 +297,4 @@ def check_auto_docstrings(path: str = None, overwrite: bool = False): args = parser.parse_args() - check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file + check_auto_docstrings(args.path, args.fix_and_overwrite) From de03d7f1005777cc3bfdf9107bb8b775311fce8d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 18 Jan 2026 00:35:01 +0100 Subject: [PATCH 13/58] refactor based on dhruv's feedback: remove the class method --- .../modular_pipeline_utils.py | 343 ++++++++---------- 1 file changed, 147 insertions(+), 196 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 45556c538ab8..f8dde1fbd096 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -324,6 +324,133 @@ class ConfigSpec: description: Optional[str] = None +# ====================================================== +# InputParam and OutputParam templates +# ====================================================== + +INPUT_PARAM_TEMPLATES = { + "prompt": { + "type_hint": str, + "required": True, + "description": "The prompt or prompts to guide image generation.", + }, + "negative_prompt": { + "type_hint": str, + "default": None, + "description": "The prompt or prompts not to guide the image generation.", + }, + "max_sequence_length": { + "type_hint": int, + "default": 512, + "description": "Maximum sequence length for prompt encoding.", + }, + "height": { + "type_hint": int, + "description": "The height in pixels of the generated image.", + }, + "width": { + "type_hint": int, + "description": "The width in pixels of the generated image.", + }, + "num_inference_steps": { + "type_hint": int, + "default": 50, + "description": "The number of denoising steps.", + }, + "num_images_per_prompt": { + "type_hint": int, + "default": 1, + "description": "The number of images to generate per prompt.", + }, + "generator": { + "type_hint": torch.Generator, + "default": None, + "description": "Torch generator for deterministic generation.", + }, + "sigmas": { + "type_hint": List[float], + "default": None, + "description": "Custom sigmas for the denoising process.", + }, + "strength": { + "type_hint": float, + "default": 0.9, + "description": "Strength for img2img/inpainting.", + }, + "image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Input image for img2img, editing, or conditioning.", + }, + "mask_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Mask image for 
inpainting.", + }, + "control_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Control image for ControlNet conditioning.", + }, + "padding_mask_crop": { + "type_hint": int, + "default": None, + "description": "Padding for mask cropping in inpainting.", + }, + "latents": { + "type_hint": torch.Tensor, + "default": None, + "description": "Pre-generated noisy latents for image generation.", + }, + "timesteps": { + "type_hint": torch.Tensor, + "default": None, + "description": "Timesteps for the denoising process.", + }, + "output_type": { + "type_hint": str, + "default": "pil", + "description": "Output format: 'pil', 'np', 'pt'.", + }, + "attention_kwargs": { + "type_hint": Dict[str, Any], + "default": None, + "description": "Additional kwargs for attention processors.", + }, + "denoiser_input_fields": { + "kwargs_type": "denoiser_input_fields", + "type_hint": torch.Tensor, + "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + }, + "control_guidance_start": { + "type_hint": float, + "default": 0.0, + "description": "When to start applying ControlNet.", + }, + "control_guidance_end": { + "type_hint": float, + "default": 1.0, + "description": "When to stop applying ControlNet.", + }, + "controlnet_conditioning_scale": { + "type_hint": float, + "default": 1.0, + "description": "Scale for ControlNet conditioning.", + }, +} + +OUTPUT_PARAM_TEMPLATES = { + "images": { + "type_hint": List[PIL.Image.Image], + "description": "Generated images.", + }, + "latents": { + "type_hint": torch.Tensor, + "description": "Denoised latents.", + }, +} + + # YiYi Notes: both inputs and intermediate_inputs are InputParam objects # however some fields are not relevant for intermediate_inputs # e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed @@ -344,190 +471,22 @@ def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @classmethod - def template(cls, name: str) -> Optional["InputParam"]: - """Get template for name if exists, otherwise None.""" - if hasattr(cls, name) and callable(getattr(cls, name)): - return getattr(cls, name)() - return None - - # ====================================================== - # InputParam templates - # ====================================================== - - @classmethod - def prompt(cls) -> "InputParam": - return cls( - name="prompt", type_hint=str, required=True, description="The prompt or prompts to guide image generation." - ) - - @classmethod - def negative_prompt(cls) -> "InputParam": - return cls( - name="negative_prompt", - type_hint=str, - default=None, - description="The prompt or prompts not to guide the image generation.", - ) - - @classmethod - def max_sequence_length(cls, default: int = 512) -> "InputParam": - return cls( - name="max_sequence_length", - type_hint=int, - default=default, - description="Maximum sequence length for prompt encoding.", - ) - - @classmethod - def height(cls, default: Optional[int] = None) -> "InputParam": - return cls( - name="height", type_hint=int, default=default, description="The height in pixels of the generated image." - ) - - @classmethod - def width(cls, default: Optional[int] = None) -> "InputParam": - return cls( - name="width", type_hint=int, default=default, description="The width in pixels of the generated image." 
- ) - - @classmethod - def num_inference_steps(cls, default: int = 50) -> "InputParam": - return cls( - name="num_inference_steps", type_hint=int, default=default, description="The number of denoising steps." - ) - - @classmethod - def num_images_per_prompt(cls, default: int = 1) -> "InputParam": - return cls( - name="num_images_per_prompt", - type_hint=int, - default=default, - description="The number of images to generate per prompt.", - ) - - @classmethod - def generator(cls) -> "InputParam": - return cls( - name="generator", - type_hint=torch.Generator, - default=None, - description="Torch generator for deterministic generation.", - ) - - @classmethod - def sigmas(cls) -> "InputParam": - return cls( - name="sigmas", type_hint=List[float], default=None, description="Custom sigmas for the denoising process." - ) - - @classmethod - def strength(cls, default: float = 0.9) -> "InputParam": - return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") - - # images - @classmethod - def image(cls) -> "InputParam": - return cls( - name="image", - type_hint=PIL.Image.Image, - required=True, - description="Input image for img2img, editing, or conditioning.", - ) - - @classmethod - def mask_image(cls) -> "InputParam": - return cls( - name="mask_image", type_hint=PIL.Image.Image, required=True, description="Mask image for inpainting." - ) - - @classmethod - def control_image(cls) -> "InputParam": - return cls( - name="control_image", - type_hint=PIL.Image.Image, - required=True, - description="Control image for ControlNet conditioning.", - ) - - @classmethod - def padding_mask_crop(cls) -> "InputParam": - return cls( - name="padding_mask_crop", - type_hint=int, - default=None, - description="Padding for mask cropping in inpainting.", - ) - - @classmethod - def latents(cls) -> "InputParam": - return cls( - name="latents", - type_hint=torch.Tensor, - default=None, - description="Pre-generated noisy latents for image generation.", - ) - - @classmethod - def timesteps(cls) -> "InputParam": - return cls( - name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process." - ) - - @classmethod - def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", description="Output format: 'pil', 'np', 'pt''.") - - @classmethod - def attention_kwargs(cls) -> "InputParam": - return cls( - name="attention_kwargs", - type_hint=Dict[str, Any], - default=None, - description="Additional kwargs for attention processors.", - ) - - @classmethod - def denoiser_input_fields(cls) -> "InputParam": - return cls( - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", - ) - - # ControlNet - @classmethod - def control_guidance_start(cls, default: float = 0.0) -> "InputParam": - return cls( - name="control_guidance_start", - type_hint=float, - default=default, - description="When to start applying ControlNet.", - ) - - @classmethod - def control_guidance_end(cls, default: float = 1.0) -> "InputParam": - return cls( - name="control_guidance_end", - type_hint=float, - default=default, - description="When to stop applying ControlNet.", - ) - - @classmethod - def controlnet_conditioning_scale(cls, default: float = 1.0) -> "InputParam": - return cls( - name="controlnet_conditioning_scale", - type_hint=float, - default=default, - description="Scale for ControlNet conditioning.", - ) + def template(cls, name: str, **overrides) -> "InputParam": + """Get template for name if exists, otherwise return basic InputParam with just the name.""" + if name in INPUT_PARAM_TEMPLATES: + kwargs = {"name": name, **INPUT_PARAM_TEMPLATES[name]} + # Override with user-provided values + for key, value in overrides.items(): + kwargs[key] = value + return cls(**kwargs) + return cls(name=name, **overrides) @dataclass class OutputParam: """Specification for an output parameter.""" - name: str + name: str = None type_hint: Any = None description: str = "" kwargs_type: str = None # YiYi notes: remove this feature (maybe) @@ -538,23 +497,15 @@ def __repr__(self): ) @classmethod - def template(cls, name: str) -> Optional["OutputParam"]: - """Get template for name if exists, otherwise None.""" - if hasattr(cls, name) and callable(getattr(cls, name)): - return getattr(cls, name)() - return None - - # ====================================================== - # OutputParam templates - # ====================================================== - - @classmethod - def images(cls) -> "OutputParam": - return cls(name="images", type_hint=List[PIL.Image.Image], description="Generated images.") - - @classmethod - def latents(cls) -> "OutputParam": - return cls(name="latents", type_hint=torch.Tensor, description="Denoised latents.") + def template(cls, name: str, **overrides) -> "OutputParam": + """Get template for name if exists, otherwise return basic OutputParam with just the name.""" + if name in OUTPUT_PARAM_TEMPLATES: + kwargs = {"name": name, **OUTPUT_PARAM_TEMPLATES[name]} + # Override with user-provided values + for key, value in overrides.items(): + kwargs[key] = value + return cls(**kwargs) + return cls(name=name, **overrides) def format_inputs_short(inputs): @@ -890,4 +841,4 @@ def make_doc_string( output += "\n\n" output += format_output_params(outputs, indent_level=2) - return output + return output \ No newline at end of file From 002c3e8239b267e17b3849d1e53fde78890f0ad1 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 03:24:34 +0100 Subject: [PATCH 14/58] add template method --- .../modular_pipeline_utils.py | 163 ++++++++++++------ 1 file changed, 112 insertions(+), 51 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index f8dde1fbd096..a65aa43b2a3b 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -336,7 +336,6 @@ class ConfigSpec: }, "negative_prompt": { "type_hint": str, - "default": None, "description": "The prompt or prompts not to guide the image generation.", }, "max_sequence_length": { @@ -364,12 +363,10 @@ class ConfigSpec: }, "generator": 
{ "type_hint": torch.Generator, - "default": None, "description": "Torch generator for deterministic generation.", }, "sigmas": { "type_hint": List[float], - "default": None, "description": "Custom sigmas for the denoising process.", }, "strength": { @@ -378,33 +375,16 @@ class ConfigSpec: "description": "Strength for img2img/inpainting.", }, "image": { - "type_hint": PIL.Image.Image, + "type_hint": Union[PIL.Image.Image, List[PIL.Image.Image]], "required": True, - "description": "Input image for img2img, editing, or conditioning.", - }, - "mask_image": { - "type_hint": PIL.Image.Image, - "required": True, - "description": "Mask image for inpainting.", - }, - "control_image": { - "type_hint": PIL.Image.Image, - "required": True, - "description": "Control image for ControlNet conditioning.", - }, - "padding_mask_crop": { - "type_hint": int, - "default": None, - "description": "Padding for mask cropping in inpainting.", + "description": "Reference image(s) for denoising. Can be a single image or list of images.", }, "latents": { "type_hint": torch.Tensor, - "default": None, "description": "Pre-generated noisy latents for image generation.", }, "timesteps": { "type_hint": torch.Tensor, - "default": None, "description": "Timesteps for the denoising process.", }, "output_type": { @@ -414,14 +394,28 @@ class ConfigSpec: }, "attention_kwargs": { "type_hint": Dict[str, Any], - "default": None, "description": "Additional kwargs for attention processors.", }, "denoiser_input_fields": { "kwargs_type": "denoiser_input_fields", - "type_hint": torch.Tensor, "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", }, + # inpainting + "mask_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Mask image for inpainting.", + }, + "padding_mask_crop": { + "type_hint": int, + "description": "Padding for mask cropping in inpainting.", + }, + # controlnet + "control_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Control image for ControlNet conditioning.", + }, "control_guidance_start": { "type_hint": float, "default": 0.0, @@ -437,6 +431,45 @@ class ConfigSpec: "default": 1.0, "description": "Scale for ControlNet conditioning.", }, + "layers": { + "type_hint": int, + "default": 4, + "description": "Number of layers to extract from the image", + }, + # common intermediate inputs + "prompt_embeds":{ + "type_hint": torch.Tensor, + "required": True, + "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.", + }, + "prompt_embeds_mask": { + "type_hint": torch.Tensor, + "required": True, + "description": "mask for the text embeddings. Can be generated from text_encoder step.", + }, + "negative_prompt_embeds": { + "type_hint": torch.Tensor, + "description": "negative text embeddings used to guide the image generation. Can be generated from text_encoder step.", + }, + "negative_prompt_embeds_mask": { + "type_hint": torch.Tensor, + "description": "mask for the negative text embeddings. Can be generated from text_encoder step.", + }, + "image_latents": { + "type_hint": torch.Tensor, + "required": True, + "description": "image latents used to guide the image generation. Can be generated from vae_encoder step.", + }, + "batch_size": { + "type_hint": int, + "default": 1, + "description": "Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", + }, + "dtype": { + "type_hint": torch.dtype, + "default": torch.float32, + "description": "The dtype of the model inputs, can be generated in input step.", + }, } OUTPUT_PARAM_TEMPLATES = { @@ -448,15 +481,34 @@ class ConfigSpec: "type_hint": torch.Tensor, "description": "Denoised latents.", }, + # intermediate outputs + "prompt_embeds": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The prompt embeddings.", + }, + "prompt_embeds_mask": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The encoder attention mask.", + }, + "negative_prompt_embeds": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The negative prompt embeddings.", + }, + "negative_prompt_embeds_mask": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The negative prompt embeddings mask.", + }, + "image_latents": { + "type_hint": torch.Tensor, + "description": "The latent representation of the input image.", + }, } -# YiYi Notes: both inputs and intermediate_inputs are InputParam objects -# however some fields are not relevant for intermediate_inputs -# e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed -# default is not used for intermediate_inputs, we only use default from inputs, so it is ignored if it is set for intermediate_inputs -# -> should we use different class for inputs and intermediate_inputs? -@dataclass class InputParam: """Specification for an input parameter.""" @@ -465,31 +517,37 @@ class InputParam: default: Any = None required: bool = False description: str = "" - kwargs_type: str = None # YiYi Notes: remove this feature (maybe) + kwargs_type: str = None + + def __post_init__(self): + if self.required and self.default is not None: + raise ValueError(f"InputParam '{self.name}' cannot be both required and have a default value") def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @classmethod - def template(cls, name: str, **overrides) -> "InputParam": - """Get template for name if exists, otherwise return basic InputParam with just the name.""" - if name in INPUT_PARAM_TEMPLATES: - kwargs = {"name": name, **INPUT_PARAM_TEMPLATES[name]} - # Override with user-provided values - for key, value in overrides.items(): - kwargs[key] = value - return cls(**kwargs) - return cls(name=name, **overrides) + def template(cls, name: str, note: str = None, **overrides) -> "InputParam": + """Get template for name if exists, otherwise raise ValueError.""" + if name not in INPUT_PARAM_TEMPLATES: + raise ValueError(f"InputParam template for {name} not found") + + template_kwargs = INPUT_PARAM_TEMPLATES[name].copy() + + if note and "description" in template_kwargs: + template_kwargs["description"] = f"{template_kwargs['description']} ({note})" + + template_kwargs.update(overrides) + return cls(name=name, **template_kwargs) -@dataclass class OutputParam: """Specification for an output parameter.""" name: str = None type_hint: Any = None description: str = "" - kwargs_type: str = None # YiYi notes: remove this feature (maybe) + kwargs_type: str = None def __repr__(self): return ( @@ -497,15 +555,18 @@ def __repr__(self): ) @classmethod - def template(cls, name: str, **overrides) -> "OutputParam": - """Get template for name if exists, otherwise return basic OutputParam with just the 
name.""" - if name in OUTPUT_PARAM_TEMPLATES: - kwargs = {"name": name, **OUTPUT_PARAM_TEMPLATES[name]} - # Override with user-provided values - for key, value in overrides.items(): - kwargs[key] = value - return cls(**kwargs) - return cls(name=name, **overrides) + def template(cls, name: str, note: str = None, **overrides) -> "OutputParam": + """Get template for name if exists, otherwise raise ValueError.""" + if name not in OUTPUT_PARAM_TEMPLATES: + raise ValueError(f"OutputParam template for {name} not found") + + template_kwargs = OUTPUT_PARAM_TEMPLATES[name].copy() + + if note and "description" in template_kwargs: + template_kwargs["description"] = f"{template_kwargs['description']} ({note})" + + template_kwargs.update(overrides) + return cls(name=name, **template_kwargs) def format_inputs_short(inputs): From 1f2dbc9dd2bf4d256039120f6d6ccaf49f1c09c7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 04:10:17 +0100 Subject: [PATCH 15/58] up --- .../qwenimage/before_denoise.py | 187 +++---- .../modular_pipelines/qwenimage/decoders.py | 71 +-- .../modular_pipelines/qwenimage/denoise.py | 125 +---- .../modular_pipelines/qwenimage/encoders.py | 509 ++++++++---------- .../modular_pipelines/qwenimage/inputs.py | 282 +++++++--- .../qwenimage/modular_blocks_qwenimage.py | 61 ++- .../modular_blocks_qwenimage_edit.py | 39 +- .../modular_blocks_qwenimage_edit_plus.py | 30 +- .../modular_blocks_qwenimage_layered.py | 73 ++- 9 files changed, 677 insertions(+), 700 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index cb808b1d3807..b87c3555aad3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -134,28 +134,20 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.latents(), - InputParam.height(), - InputParam.width(), - InputParam.num_images_per_prompt(), - InputParam.generator(), - InputParam( - name="batch_size", - required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", - ), - InputParam( - name="dtype", - required=True, - type_hint=torch.dtype, - description="The dtype of the model inputs, can be generated in input step.", - ), + InputParam.template("latents"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("num_images_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ + OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -225,31 +217,21 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.latents(), - InputParam.height(), - InputParam.width(), - InputParam( - name="layers", type_hint=int, default=4, description="Number of layers to extract from the image" - ), - InputParam.num_images_per_prompt(), - InputParam.generator(), - InputParam( - name="batch_size", - required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.", - ), - InputParam( - name="dtype", - required=True, - type_hint=torch.dtype, - description="The dtype of the model inputs, can be generated in input step.", - ), + InputParam.template("latents"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("layers"), + InputParam.template("num_images_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ + OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -325,18 +307,8 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The initial random noised, can be generated in prepare latent step.", ), - InputParam( - name="image_latents", - required=True, - type_hint=torch.Tensor, - description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.", - ), - InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. 
Can be generated in set_timesteps step.", - ), + InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), + InputParam.template("timesteps", required=True, note="can be generated in set_timesteps step."), ] @property @@ -347,6 +319,11 @@ def intermediate_outputs(self) -> List[OutputParam]: type_hint=torch.Tensor, description="The initial random noised used for inpainting denoising.", ), + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The scalednoisy latents to use for inpainting/image-to-image denoising.", + ), ] @staticmethod @@ -406,9 +383,9 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The processed mask to use for the inpainting process.", ), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="dtype", required=True), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("dtype"), ] @property @@ -468,14 +445,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_inference_steps(), - InputParam.sigmas(), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process, used to calculate the image sequence length.", - ), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + InputParam.template("latents", required=True, description="The initial random noised latents for the denoising process, used to calculate the image sequence length. Can be generated in prepare latents step."), ] @property @@ -484,6 +456,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process" ), + OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -534,15 +507,16 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_inference_steps(), - InputParam.sigmas(), - InputParam("image_latents", required=True, type_hint=torch.Tensor), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="timesteps", type_hint=torch.Tensor), + OutputParam(name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"), + OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), ] @torch.no_grad() @@ -592,15 +566,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_inference_steps(), - InputParam.sigmas(), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process, used to calculate the image sequence length.", - ), - InputParam.strength(0.9), + InputParam.template("num_inference_steps"), + 
InputParam.template("sigmas"), + InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare latents step."), + InputParam.template("strength", default=0.9), ] @property @@ -609,7 +578,12 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="timesteps", type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", + description="The timesteps to use for the denoising process.", + ), + OutputParam( + name="num_inference_steps", + type_hint=int, + description="The number of denoising steps to perform at inference time", ), ] @@ -668,11 +642,11 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam.template("height", note="should be updated in prepare latents step."), + InputParam.template("width", note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -734,13 +708,13 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="image_height", required=True), - InputParam(name="image_width", required=True), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), + InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. Can be generated in input step."), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -813,13 +787,13 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="image_height", required=True, type_hint=List[int]), - InputParam(name="image_width", required=True, type_hint=List[int]), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam(name="image_height", required=True, type_hint=List[int], descrption="The heights of the reference images. Can be generated in input step."), + InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. 
Can be generated in input step."), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -887,12 +861,12 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="layers", default=4, description="Number of layers to extract from the image"), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam.template("layers"), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -973,16 +947,11 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.control_guidance_start(), - InputParam.control_guidance_end(), - InputParam.controlnet_conditioning_scale(), - InputParam("control_image_latents", required=True), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), + InputParam.template("control_guidance_start"), + InputParam.template("control_guidance_end"), + InputParam.template("controlnet_conditioning_scale"), + InputParam("control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step."), + InputParam.template("timesteps", required=True, note="Can be generated in set_timesteps step."), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 8207e99b69ae..499f0172888b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -47,14 +47,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", - ), + InputParam.template("height", required=True, note="should be updated in input and prepare latents step."), + InputParam.template("width", required=True, note="should be updated in input and prepare latents step."), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents", note="unpacked to B, C, 1, H, W"), ] @torch.no_grad() @@ -86,10 +87,16 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("height", required=True, type_hint=int), - InputParam("width", required=True, type_hint=int), - InputParam("layers", default=4, description="Number of layers to extract from the image"), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), + InputParam.template("height", required=True, note="should be updated in prepare latents step."), + InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("layers"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W"), ] @torch.no_grad() @@ -128,17 +135,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", - ), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), ] @property - def intermediate_outputs(self) -> List[str]: - return [OutputParam.images()] + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images", note="tensor output of the vae decoder.")] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -190,19 +192,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", - ), - InputParam.output_type(), + InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), + 
InputParam.template("output_type"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.images(), + OutputParam.template("images"), ] @torch.no_grad() @@ -269,10 +266,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image from decoders step"), - InputParam.output_type(), + InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam.template("output_type"), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images")] + @staticmethod def check_inputs(output_type): if output_type not in ["pil", "np", "pt"]: @@ -314,11 +315,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image from decoders step"), - InputParam.output_type(), - InputParam("mask_overlay_kwargs"), + InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam.template("output_type"), + InputParam("mask_overlay_kwargs", description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images")] + @staticmethod def check_inputs(output_type, mask_overlay_kwargs): if output_type not in ["pil", "np", "pt"]: diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 472945b2269a..49fde3fd6ac3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -49,12 +49,7 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", - ), + InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), ] @torch.no_grad() @@ -79,18 +74,8 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", - ), - InputParam( - "image_latents", - required=True, - type_hint=torch.Tensor, - description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.", - ), + InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), + InputParam.template("image_latents", note="Can be encoded in vae_encoder step and packed in prepare_image_latents step."), ] @torch.no_grad() @@ -134,30 +119,10 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", ), - InputParam( - "controlnet_conditioning_scale", - type_hint=float, - description="The controlnet conditioning scale value to use for the denoising process. 
Can be generated in prepare_controlnet_inputs step.", - ), - InputParam( - "controlnet_keep", - required=True, - type_hint=List[float], - description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam( - kwargs_type="denoiser_input_fields", - description=( - "All conditional model inputs for the denoiser. " - "It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens." - ), - ), + InputParam.template("controlnet_conditioning_scale", note="Can be generated in prepare_controlnet_inputs step."), + InputParam.template("controlnet_keep", note="Can be generated in prepare_controlnet_inputs step."), + InputParam.template("num_inference_steps", required=True, note="Can be updated in set_timesteps step."), + InputParam.template("denoiser_input_fields") ] @torch.no_grad() @@ -218,25 +183,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.attention_kwargs(), - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam.denoiser_input_fields(), + InputParam.template("attention_kwargs"), + InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare_latents step."), + InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", required=True, type_hint=List[Tuple[int, int]], - description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.", + description="The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.", ), ] @@ -319,20 +274,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.attention_kwargs(), - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam.denoiser_input_fields(), + InputParam.template("attention_kwargs"), + InputParam.template("latents", required=True, description="The latents to use for the denoising process. 
Can be generated in prepare_latents step."), + InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", required=True, @@ -418,7 +363,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @torch.no_grad() @@ -459,24 +404,14 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "image_latents", - required=True, - type_hint=torch.Tensor, - description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.", - ), + InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), + InputParam.template("timesteps", required=True, note="should be updated in set_timesteps step."), ] @torch.no_grad() @@ -517,18 +452,8 @@ def loop_expected_components(self) -> List[ComponentSpec]: @property def loop_inputs(self) -> List[InputParam]: return [ - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. 
Can be generated in set_timesteps step.", - ), + InputParam.template("timesteps", required=True, note="should be generated in set_timesteps step."), + InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), ] @torch.no_grad() @@ -560,6 +485,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # Qwen Image (text2image, image2image) + +# auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" @@ -584,6 +511,7 @@ def description(self) -> str: # Qwen Image (inpainting) +# auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" block_classes = [ @@ -609,6 +537,7 @@ def description(self) -> str: # Qwen Image (text2image, image2image) with controlnet +# auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" block_classes = [ @@ -634,6 +563,7 @@ def description(self) -> str: # Qwen Image (inpainting) with controlnet +# auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage" block_classes = [ @@ -667,6 +597,7 @@ def description(self) -> str: # Qwen Image Edit (image2image) +# auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage-edit" block_classes = [ @@ -690,6 +621,7 @@ def description(self) -> str: # Qwen Image Edit (inpainting) +# auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage-edit" block_classes = [ @@ -715,6 +647,7 @@ def description(self) -> str: # Qwen Image Layered (image2image) +# auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): model_name = "qwenimage-layered" block_classes = [ diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 8d7b1905423d..82a3b6811959 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -259,33 +259,30 @@ def encode_vae_image( # ==================== # 1. RESIZE # ==================== +# In QwenImage pipelines, resize is a separate step because the resized image is used in VL encoding and vae encoder blocks: +# +# image (PIL.Image.Image) +# │ +# ▼ +# resized_image ([PIL.Image.Image]) +# │ +# ├──► text_encoder ──► prompt_embeds, prompt_embeds_mask +# │ (VL encoding needs the resized image for vision-language fusion) +# │ +# └──► image_processor ──► processed_image (torch.Tensor, pixel space) +# │ +# ▼ +# vae_encoder ──► image_latents (torch.Tensor, latent space) +# +# In most of our other pipelines, resizing is done as part of the image preprocessing step. +# ==================== class QwenImageEditResizeStep(ModularPipelineBlocks): model_name = "qwenimage-edit" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - ): - """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. - - Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". 
- """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - super().__init__() @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." + return "Image Resize step that resize the image to target area while maintaining the aspect ratio." @property def expected_components(self) -> List[ComponentSpec]: @@ -300,21 +297,15 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [ - InputParam.template(self._image_input_name) - or InputParam( - name=self._image_input_name, - required=True, - type_hint=torch.Tensor, - description="Input image for conditioning", - ), - ] + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", ), ] @@ -322,7 +313,7 @@ def intermediate_outputs(self) -> List[OutputParam]: def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -338,7 +329,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): for image in images ] - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images self.set_block_state(state, block_state) return components, state @@ -346,30 +337,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageLayeredResizeStep(ModularPipelineBlocks): model_name = "qwenimage-layered" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - ): - """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. - - Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". - """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - super().__init__() - @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." + return f"Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." 
@property def expected_components(self) -> List[ComponentSpec]: @@ -385,10 +355,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) - or InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" - ), + InputParam.template("image"), InputParam( name="resolution", default=640, @@ -399,11 +366,11 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [ - OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" - ), - ] + return [OutputParam( + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", + )] @staticmethod def check_inputs(resolution: int): @@ -416,7 +383,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): self.check_inputs(resolution=block_state.resolution) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -433,45 +400,21 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): for image in images ] - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images self.set_block_state(state, block_state) return components, state class QwenImageEditPlusResizeStep(ModularPipelineBlocks): - """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus.""" model_name = "qwenimage-edit-plus" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - target_area: int = 1024 * 1024, - ): - """Create a step for resizing images to a target area. - - Each image is resized independently based on its own aspect ratio. This is suitable for Edit Plus where - multiple reference images can have different dimensions. - - Args: - input_name (str, optional): Name of the image field to read. Defaults to "image". - output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image". - target_area (int, optional): Target area in pixels. Defaults to 1024*1024. - """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - self._target_area = target_area - super().__init__() - @property def description(self) -> str: return ( - f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n" + "Resize images for QwenImage Edit Plus pipeline.\n" + "Produces two outputs: resized_image (1024x1024) for VAE encoding, " + "resized_cond_image (384x384) for VL text encoding.\n" "Each image is resized independently based on its own aspect ratio." 
) @@ -488,21 +431,21 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [ - InputParam.template(self._image_input_name) - or InputParam( - name=self._image_input_name, - required=True, - type_hint=torch.Tensor, - description="The image(s) to resize", - ), - ] + # image + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + name="resized_image", + type_hint=List[PIL.Image.Image], + description="Images resized to 1024x1024 target area for VAE encoding", + ), + OutputParam( + name="resized_cond_image", + type_hint=List[PIL.Image.Image], + description="Images resized to 384x384 target area for VL text encoding", ), ] @@ -510,7 +453,7 @@ def intermediate_outputs(self) -> List[OutputParam]: def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -520,16 +463,24 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # Resize each image independently based on its own aspect ratio resized_images = [] + resized_cond_images = [] for image in images: image_width, image_height = image.size - calculated_width, calculated_height, _ = calculate_dimensions( - self._target_area, image_width / image_height - ) + + # For VAE encoder (1024x1024 target area) + vae_width, vae_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) resized_images.append( - components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width) + components.image_resize_processor.resize(image, height=vae_height, width=vae_width) + ) + + # For VL text encoder (384x384 target area) + vl_width, vl_height, _ = calculate_dimensions(384 * 384, image_width / image_height) + resized_cond_images.append( + components.image_resize_processor.resize(image, height=vl_height, width=vl_width) ) - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images + block_state.resized_cond_image = resized_cond_images self.set_block_state(state, block_state) return components, state @@ -538,13 +489,14 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 2. GET IMAGE PROMPT # ==================== class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): - """ - Auto-caption step that generates a text prompt from the input image if none is provided. Uses the VL model to - generate a description of the image. 
- """ model_name = "qwenimage-layered" + def __init__(self): + self.image_caption_prompt_en = QWENIMAGE_LAYERED_CAPTION_PROMPT_EN + self.image_caption_prompt_cn = QWENIMAGE_LAYERED_CAPTION_PROMPT_CN + super().__init__() + @property def description(self) -> str: return ( @@ -560,19 +512,10 @@ def expected_components(self) -> List[ComponentSpec]: ComponentSpec("processor", Qwen2VLProcessor), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="image_caption_prompt_en", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_EN), - ConfigSpec(name="image_caption_prompt_cn", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_CN), - ] - @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name="prompt", type_hint=str, description="The prompt to encode" - ), # it is not required for qwenimage-layered, unlike other pipelines + InputParam.template("prompt", required=False), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -596,9 +539,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # If prompt is empty or None, generate caption from image if block_state.prompt is None or block_state.prompt == "" or block_state.prompt == " ": if block_state.use_en_prompt: - caption_prompt = components.config.image_caption_prompt_en + caption_prompt = self.image_caption_prompt_en else: - caption_prompt = components.config.image_caption_prompt_cn + caption_prompt = self.image_caption_prompt_cn model_inputs = components.processor( text=caption_prompt, @@ -627,6 +570,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_PROMPT_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_PROMPT_TEMPLATE_START_IDX + self.tokenizer_max_length = 1024 + super().__init__() + @property def description(self) -> str: return "Text Encoder step that generates text embeddings to guide the image generation." 
@@ -644,49 +593,22 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_PROMPT_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_PROMPT_TEMPLATE_START_IDX), - ConfigSpec(name="tokenizer_max_length", default=1024), - ] @property def inputs(self) -> List[InputParam]: return [ - InputParam.prompt(), - InputParam.negative_prompt(), - InputParam.max_sequence_length(1024), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), + InputParam.template("max_sequence_length", default=1024), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -715,9 +637,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.text_encoder, components.tokenizer, prompt=block_state.prompt, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - tokenizer_max_length=components.config.tokenizer_max_length, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, + tokenizer_max_length=self.tokenizer_max_length, device=device, ) @@ -732,9 +654,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.text_encoder, components.tokenizer, prompt=negative_prompt, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - tokenizer_max_length=components.config.tokenizer_max_length, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, + tokenizer_max_length=self.tokenizer_max_length, device=device, ) block_state.negative_prompt_embeds = block_state.negative_prompt_embeds[ @@ -751,6 +673,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_EDIT_PROMPT_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX + super().__init__() + @property def description(self) -> str: return "Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation." 
@@ -768,18 +695,12 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX), - ] @property def inputs(self) -> List[InputParam]: return [ - InputParam.prompt(), - InputParam.negative_prompt(), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), InputParam( name="resized_image", required=True, @@ -791,30 +712,10 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -842,8 +743,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=block_state.prompt, image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -856,8 +757,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=negative_prompt, image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -866,10 +767,15 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): - """Text encoder for QwenImage Edit Plus (VL encoding with multiple images).""" model_name = "qwenimage-edit-plus" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE + self.img_template_encode = QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX + super().__init__() + @property def description(self) -> str: return ( @@ -890,19 +796,12 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE), - ConfigSpec(name="img_template_encode", default=QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", 
default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX), - ] @property def inputs(self) -> List[InputParam]: return [ - InputParam.prompt(), - InputParam.negative_prompt(), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), InputParam( name="resized_cond_image", required=True, @@ -914,30 +813,10 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -965,9 +844,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=block_state.prompt, image=block_state.resized_cond_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + img_template_encode=self.img_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -981,9 +860,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): components.processor, prompt=negative_prompt, image=block_state.resized_cond_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + img_template_encode=self.img_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) ) @@ -1016,18 +895,26 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.mask_image(), - InputParam.image(), - InputParam.height(), - InputParam.width(), - InputParam.padding_mask_crop(), + InputParam.template("mask_image"), + InputParam.template("image"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("padding_mask_crop"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="processed_image"), - OutputParam(name="processed_mask_image"), + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ), + OutputParam( + name="processed_mask_image", + type_hint=torch.Tensor, + description="The processed mask image", + ), OutputParam( name="mask_overlay_kwargs", type_hint=Dict, @@ -1088,21 +975,29 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.mask_image(), + 
InputParam.template("mask_image"), InputParam( - "resized_image", + name="resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step", ), - InputParam.padding_mask_crop(), + InputParam.template("padding_mask_crop"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="processed_image"), - OutputParam(name="processed_mask_image"), + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image" + ), + OutputParam( + name="processed_mask_image", + type_hint=torch.Tensor, + description="The processed mask image", + ), OutputParam( name="mask_overlay_kwargs", type_hint=Dict, @@ -1151,14 +1046,18 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.image(), - InputParam.height(), - InputParam.width(), + InputParam.template("image"), + InputParam.template("height"), + InputParam.template("width"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + )] @staticmethod def check_inputs(height, width, vae_scale_factor): @@ -1209,12 +1108,21 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("resized_image", required=True), + InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + ), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + )] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1252,11 +1160,20 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam("resized_image")] + return [InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + )] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + )] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1274,7 +1191,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): processed_images.append( components.image_processor.preprocess(image=img, height=img_height, width=img_width) ) - block_state.processed_image = processed_images + if is_image_list: block_state.processed_image = processed_images else: @@ -1294,8 +1211,8 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): def __init__( self, - input_name: str = "processed_image", - output_name: str = "image_latents", + input: Optional[InputParam] = None, + output: Optional[OutputParam] = None, ): """Initialize a VAE encoder step for converting images to latent representations. @@ -1303,11 +1220,24 @@ def __init__( a single tensor, outputs a single latent tensor. Args: - input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image". 
- output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents". + input (InputParam, optional): Input parameter for the processed image. Defaults to "processed_image". + output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents". """ - self._image_input_name = input_name - self._image_latents_output_name = output_name + if input is None: + input = InputParam(name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode") + + if output is None: + output = OutputParam.template("image_latents") + + if not isinstance(input, InputParam): + raise ValueError(f"input must be InputParam but is {type(input)}") + if not isinstance(output, OutputParam): + raise ValueError(f"output must be OutputParam but is {type(output)}") + + self._input = input + self._output = output + self._image_input_name = input.name + self._image_latents_output_name = output.name super().__init__() @property @@ -1324,20 +1254,13 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) - or InputParam(name=self._image_input_name, required=True, description="The image tensor to encode"), - InputParam.generator(), + self._input, # default is "processed_image" + InputParam.template("generator"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [ - OutputParam( - self._image_latents_output_name, - type_hint=torch.Tensor, - description="The latents representing the reference image(s). Single tensor or list depending on input.", - ) - ] + return [self._output] # default is "image_latents" @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -1398,10 +1321,10 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.control_image(), - InputParam.height(), - InputParam.width(), - InputParam.generator(), + InputParam.template("control_image"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("generator"), ] return inputs @@ -1489,22 +1412,22 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 6. PERMUTE LATENTS # ==================== class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): - """Permute image latents from VAE format to Layered format.""" - model_name = "qwenimage-layered" - def __init__(self, input_name: str = "image_latents"): - self._input_name = input_name - super().__init__() - @property def description(self) -> str: - return f"Permute {self._input_name} from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." + return f"Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." 
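# --- Illustrative sketch, not part of this patch ---------------------------
# The permutation this step applies, shown on a dummy tensor: the VAE latent
# layout (B, C, 1, H, W) is rearranged to (B, 1, C, H, W) before layered
# packing. Shapes below are placeholders.
import torch

image_latents = torch.randn(1, 16, 1, 64, 64)    # (B, C, 1, H, W)
permuted = image_latents.permute(0, 2, 1, 3, 4)  # (B, 1, C, H, W)
print(permuted.shape)  # torch.Size([1, 1, 16, 64, 64])
# ----------------------------------------------------------------------------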
@property def inputs(self) -> List[InputParam]: return [ - InputParam(self._input_name, required=True), + InputParam.template("image_latents"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("image_latents", note="permuted from [B, C, 1, H, W] to [B, 1, C, H, W]"), ] @torch.no_grad() @@ -1512,8 +1435,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Permute: (B, C, 1, H, W) -> (B, 1, C, H, W) - latents = getattr(block_state, self._input_name) - setattr(block_state, self._input_name, latents.permute(0, 2, 1, 3, 4)) + latents = block_state.image_latents + block_state.image_latents = latents.permute(0, 2, 1, 3, 4) self.set_block_state(state, block_state) - return components, state + return components, state \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index e28493ecc369..bd2f79ae7c4c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple +from typing import List, Tuple, Optional import torch @@ -129,26 +129,22 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.num_images_per_prompt(), - InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"), - InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"), - InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"), - InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"), + InputParam.template("num_images_per_prompt"), + InputParam.template("prompt_embeds"), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask"), ] @property - def intermediate_outputs(self) -> List[str]: + def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - "batch_size", - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt", - ), - OutputParam( - "dtype", - type_hint=torch.dtype, - description="Data type of model tensor inputs (determined by `prompt_embeds`)", - ), + OutputParam.template("batch_size"), + OutputParam.template("dtype"), + OutputParam.template("prompt_embeds", note="batch-expanded"), + OutputParam.template("prompt_embeds_mask", note="batch-expanded"), + OutputParam.template("negative_prompt_embeds", note="batch-expanded"), + OutputParam.template("negative_prompt_embeds_mask", note="batch-expanded"), ] @staticmethod @@ -228,13 +224,28 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], - ): + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, + ): + # by default, process `image_latents` + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise 
ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -252,9 +263,9 @@ def description(self) -> str: if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." @@ -269,23 +280,19 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.num_images_per_prompt(), - InputParam(name="batch_size", required=True), - InputParam.height(), - InputParam.width(), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), + InputParam.template("height"), + InputParam.template("width"), ] - - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) + # default is `image_latents` + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=int, @@ -295,14 +302,42 @@ def intermediate_outputs(self) -> List[OutputParam]: name="image_width", type_hint=int, description="The image width calculated from the image latents dimension", - ), + ) ] + # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided + if len(self._image_latent_inputs) > 0: + outputs.append(OutputParam(name="height", type_hint=int, note="updated based on image size if not provided")) + outputs.append(OutputParam(name="width", type_hint=int, note="updated based on image size if not provided")) + + # image latent inputs are modified in place (patchified and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified and batch-expanded)", + ) + ) + + # additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " 
(batch-expanded)", + ) + ) + + return outputs + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -331,7 +366,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, image_latent_input_name, image_latent_tensor) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -356,13 +392,27 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, ): + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -381,9 +431,9 @@ def description(self) -> str: if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
@@ -398,23 +448,20 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.num_images_per_prompt(), - InputParam(name="batch_size", required=True), - InputParam.height(), - InputParam.width(), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), + InputParam.template("height"), + InputParam.template("width"), ] - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) + # default is `image_latents` + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=List[int], @@ -426,12 +473,40 @@ def intermediate_outputs(self) -> List[OutputParam]: description="The image widths calculated from the image latents dimension", ), ] + + # `height`/`width` are updated if any image latent inputs are provided + if len(self._image_latent_inputs) > 0: + outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + + # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified, concatenated, and batch-expanded)", + ) + ) + + # additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (batch-expanded)", + ) + ) + + return outputs def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -476,7 +551,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, image_latent_input_name, packed_image_latent_tensors) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -494,8 +570,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -# YiYi TODO: support define config default component from the ModularPipeline level. -# it is same as QwenImageAdditionalInputsStep, but with layered pachifier. +# same as QwenImageAdditionalInputsStep, but with layered pachifier. 
class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier.""" @@ -503,13 +578,27 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, ): + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -527,9 +616,9 @@ def description(self) -> str: if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
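# --- Illustrative sketch, not part of this patch ---------------------------
# What "batch-expanded" means in the outputs of these input steps, in rough
# terms: each per-prompt tensor is repeated num_images_per_prompt times along
# the batch dimension. Shapes and the repeat_interleave call are illustrative.
import torch

batch_size, num_images_per_prompt = 2, 3
image_latents = torch.randn(batch_size, 16, 1, 64, 64)
expanded = image_latents.repeat_interleave(num_images_per_prompt, dim=0)
print(expanded.shape)  # torch.Size([6, 16, 1, 64, 64])
# ----------------------------------------------------------------------------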
@@ -544,21 +633,18 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam.num_images_per_prompt(), - InputParam(name="batch_size", required=True), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), ] + # default is `image_latents` - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=int, @@ -569,15 +655,40 @@ def intermediate_outputs(self) -> List[OutputParam]: type_hint=int, description="The image width calculated from the image latents dimension", ), - OutputParam(name="height", type_hint=int, description="The height of the image output"), - OutputParam(name="width", type_hint=int, description="The width of the image output"), ] + if len(self._image_latent_inputs) > 0: + outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + + # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified with layered pachifier and batch-expanded)", + ) + ) + + # Add outputs for additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (batch-expanded)", + ) + ) + + return outputs + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -608,7 +719,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - setattr(block_state, image_latent_input_name, image_latent_tensor) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -636,11 +748,19 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="control_image_latents", required=True), - InputParam(name="batch_size", required=True), - InputParam.num_images_per_prompt(), - InputParam.height(), - InputParam.width(), + InputParam(name="control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step."), + InputParam.template("batch_size"), + InputParam.template("num_images_per_prompt"), + InputParam.template("height"), + InputParam.template("width"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The control image latents (patchified and batch-expanded)."), + OutputParam(name="height", type_hint=int, description="updated based on control image size if not provided"), + OutputParam(name="width", type_hint=int, description="updated based on control image size if not provided"), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 645c01f66ee5..42593a93f98a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -75,8 +75,11 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -400,8 +403,7 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -440,8 +442,7 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs - (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -478,7 +479,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -519,8 +520,7 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -563,7 +563,7 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. 
attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -606,8 +606,7 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -648,7 +647,7 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -691,8 +690,7 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs - (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: @@ -742,6 +740,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. + denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: latents (`Tensor`): @@ -785,8 +785,7 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: @@ -842,6 +841,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. + denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: latents (`Tensor`): @@ -887,8 +888,7 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img - task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: @@ -942,6 +942,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. + denoiser_input_fields (`Tensor`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
Outputs: latents (`Tensor`): @@ -1065,7 +1067,7 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): @@ -1085,8 +1087,7 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: @@ -1098,7 +1099,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. @@ -1182,8 +1183,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -1228,7 +1232,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. image_latents (`None`, *optional*): TODO: Add description. @@ -1244,8 +1248,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, + txt_seq_lens/negative_txt_seq_lens. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 0bfbb921c9c4..46e8881b9521 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -74,10 +74,11 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) @@ -376,8 +377,7 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -452,7 +452,7 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -536,7 +536,7 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -630,7 +630,7 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): @@ -650,8 +650,7 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: @@ -663,7 +662,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. @@ -722,8 +721,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide - `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: @@ -750,10 +748,11 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) @@ -790,10 +789,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`None`, *optional*): TODO: Add description. diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 8dab6fbcf95d..1fb967bf1322 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -67,10 +67,11 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -99,7 +100,7 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"), + QwenImageEditPlusResizeStep(), QwenImageEditPlusTextEncoderStep(), ] block_names = ["resize", "encode"] @@ -145,7 +146,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"), + QwenImageEditPlusResizeStep(), QwenImageEditPlusProcessImagesInputStep(), QwenImageVaeEncoderStep(), ] @@ -268,7 +269,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -325,7 +326,7 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): latents (`Tensor`): The latents to decode, can be generated in the denoise step output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): @@ -386,10 +387,11 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -418,10 +420,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. 
Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 544b1abfc3ed..7d6c2ea0635a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,8 +53,7 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: @@ -71,23 +70,28 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -97,11 +101,16 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 
保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -300,7 +309,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -381,23 +390,28 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -407,11 +421,16 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 
保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) @@ -444,10 +463,10 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): + denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt''. + Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): From fb15752d5538c4e4ec95d8164630cbc374002405 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 08:10:31 +0100 Subject: [PATCH 16/58] up up up --- .../modular_pipeline_utils.py | 35 +++++--- .../qwenimage/before_denoise.py | 79 +++++++++++++------ .../modular_pipelines/qwenimage/decoders.py | 63 ++++++++++++--- .../modular_pipelines/qwenimage/denoise.py | 63 +++++++++++---- .../modular_pipelines/qwenimage/encoders.py | 2 +- .../modular_pipelines/qwenimage/inputs.py | 20 ++--- .../qwenimage/modular_blocks_qwenimage.py | 24 +++--- .../modular_blocks_qwenimage_edit.py | 20 ++--- .../modular_blocks_qwenimage_edit_plus.py | 10 +-- .../modular_blocks_qwenimage_layered.py | 8 +- 10 files changed, 216 insertions(+), 108 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index a65aa43b2a3b..5ef1b98f1ba3 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -397,6 +397,7 @@ class ConfigSpec: "description": "Additional kwargs for attention processors.", }, "denoiser_input_fields": { + "name": None, "kwargs_type": "denoiser_input_fields", "description": "conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", }, @@ -509,6 +510,7 @@ class ConfigSpec: } +@dataclass class InputParam: """Specification for an input parameter.""" @@ -519,20 +521,22 @@ class InputParam: description: str = "" kwargs_type: str = None - def __post_init__(self): - if self.required and self.default is not None: - raise ValueError(f"InputParam '{self.name}' cannot be both required and have a default value") - def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @classmethod - def template(cls, name: str, note: str = None, **overrides) -> "InputParam": + def template(cls, template_name: str, note: str = None, **overrides) -> "InputParam": """Get template for name if exists, otherwise raise ValueError.""" - if name not in INPUT_PARAM_TEMPLATES: - raise ValueError(f"InputParam template for {name} not found") + if template_name not in INPUT_PARAM_TEMPLATES: + raise ValueError(f"InputParam template for {template_name} not found") - template_kwargs = INPUT_PARAM_TEMPLATES[name].copy() + template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy() + + # Determine the actual param name: + # 1. From overrides if provided + # 2. From template if present + # 3. Fall back to template_name + name = overrides.pop("name", template_kwargs.pop("name", template_name)) if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" @@ -541,6 +545,7 @@ def template(cls, name: str, note: str = None, **overrides) -> "InputParam": return cls(name=name, **template_kwargs) +@dataclass class OutputParam: """Specification for an output parameter.""" @@ -555,12 +560,18 @@ def __repr__(self): ) @classmethod - def template(cls, name: str, note: str = None, **overrides) -> "OutputParam": + def template(cls, template_name: str, note: str = None, **overrides) -> "OutputParam": """Get template for name if exists, otherwise raise ValueError.""" - if name not in OUTPUT_PARAM_TEMPLATES: - raise ValueError(f"OutputParam template for {name} not found") + if template_name not in OUTPUT_PARAM_TEMPLATES: + raise ValueError(f"OutputParam template for {template_name} not found") + + template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy() - template_kwargs = OUTPUT_PARAM_TEMPLATES[name].copy() + # Determine the actual param name: + # 1. From overrides if provided + # 2. From template if present + # 3. 
Fall back to template_name + name = overrides.pop("name", template_kwargs.pop("name", template_name)) if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index b87c3555aad3..fc795b5f5a2f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -146,8 +146,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), - OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="height", type_hint=int, description="if not set, updated to default value"), + OutputParam(name="width", type_hint=int, description="if not set, updated to default value"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -230,8 +230,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="height", type_hint=int, description="updated to default value if not provided"), - OutputParam(name="width", type_hint=int, description="updated to default value if not provided"), + OutputParam(name="height", type_hint=int, description="if not set, updated to default value"), + OutputParam(name="width", type_hint=int, description="if not set, updated to default value"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -307,8 +307,13 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The initial random noised, can be generated in prepare latent step.", ), - InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), - InputParam.template("timesteps", required=True, note="can be generated in set_timesteps step."), + InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), ] @property @@ -322,7 +327,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="latents", type_hint=torch.Tensor, - description="The scalednoisy latents to use for inpainting/image-to-image denoising.", + description="The scaled noisy latents to use for inpainting/image-to-image denoising.", ), ] @@ -383,8 +388,8 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The processed mask to use for the inpainting process.", ), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("dtype"), ] @@ -447,7 +452,12 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("num_inference_steps"), InputParam.template("sigmas"), - InputParam.template("latents", required=True, description="The initial random noised latents for the denoising process, used to calculate the image sequence length. 
Can be generated in prepare latents step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial random noised latents for the denoising process. Can be generated in prepare latents step." + ), ] @property @@ -456,7 +466,6 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process" ), - OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), ] def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -515,8 +524,11 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"), - OutputParam(name="num_inference_steps", type_hint=int, description="The number of denoising steps to perform at inference time"), + OutputParam( + name="timesteps", + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process." + ), ] @torch.no_grad() @@ -568,7 +580,12 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("num_inference_steps"), InputParam.template("sigmas"), - InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare latents step."), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare latents step." + ), InputParam.template("strength", default=0.9), ] @@ -583,7 +600,7 @@ def intermediate_outputs(self) -> List[OutputParam]: OutputParam( name="num_inference_steps", type_hint=int, - description="The number of denoising steps to perform at inference time", + description="The number of denoising steps to perform at inference time. Updated based on strength.", ), ] @@ -643,8 +660,8 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam.template("height", note="should be updated in prepare latents step."), - InputParam.template("width", note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -711,8 +728,8 @@ def inputs(self) -> List[InputParam]: InputParam.template("batch_size"), InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. 
Can be generated in input step."), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -788,10 +805,10 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=List[int], descrption="The heights of the reference images. Can be generated in input step."), + InputParam(name="image_height", required=True, type_hint=List[int], description="The heights of the reference images. Can be generated in input step."), InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. Can be generated in input step."), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -863,8 +880,8 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), InputParam.template("layers"), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), InputParam.template("negative_prompt_embeds_mask"), ] @@ -950,8 +967,18 @@ def inputs(self) -> List[InputParam]: InputParam.template("control_guidance_start"), InputParam.template("control_guidance_end"), InputParam.template("controlnet_conditioning_scale"), - InputParam("control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."), - InputParam.template("timesteps", required=True, note="Can be generated in set_timesteps step."), + InputParam( + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step." + ), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 499f0172888b..4476e1db9bad 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
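For reference, the template mechanism introduced above resolves the parameter name from the overrides first, then from the template entry, then from the template key itself, and appends any `note` to the template description. A small sketch of the intended behavior (the import path and the exact template contents are assumptions; only the resolution logic is taken from this patch):

    from diffusers.modular_pipelines.modular_pipeline_utils import InputParam, OutputParam  # illustrative path

    # the name falls back to the template key when neither the overrides nor the
    # template entry define one
    height = InputParam.template("height")

    # keyword overrides win over the template, and `note` is appended to the
    # template description in parentheses
    height_required = InputParam.template("height", required=True,
                                           note="updated in the prepare-latents step")

    # unknown keys fail loudly instead of silently creating an empty param
    try:
        InputParam.template("not_a_real_template")
    except ValueError as err:
        print(err)  # InputParam template for not_a_real_template not found

    # OutputParam.template follows the same resolution rules
    latents_out = OutputParam.template("latents")
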
-from typing import List +from typing import Any, Dict, List import torch @@ -47,15 +47,24 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("height", required=True, note="should be updated in input and prepare latents step."), - InputParam.template("width", required=True, note="should be updated in input and prepare latents step."), - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), + InputParam.template("height", required=True), + InputParam.template("width", required=True), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step." + ), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.template("latents", note="unpacked to B, C, 1, H, W"), + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The denoisedlatents unpacked to B, C, 1, H, W" + ), ] @torch.no_grad() @@ -87,9 +96,14 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step."), - InputParam.template("height", required=True, note="should be updated in prepare latents step."), - InputParam.template("width", required=True, note="should be updated in prepare latents step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step." + ), + InputParam.template("height", required=True), + InputParam.template("width", required=True), InputParam.template("layers"), ] @@ -135,7 +149,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + ), ] @property @@ -192,7 +211,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The latents to decode, can be generated in the denoise step and unpacked in the after denoise step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." 
+ ), InputParam.template("output_type"), ] @@ -266,7 +290,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam( + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step" + ), InputParam.template("output_type"), ] @@ -315,9 +344,17 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image tensor from decoders step"), + InputParam( + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step" + ), InputParam.template("output_type"), - InputParam("mask_overlay_kwargs", description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), + InputParam( + name="mask_overlay_kwargs", + type_hint=Dict[str, Any], + description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 49fde3fd6ac3..ad6a9677aca3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -49,7 +49,12 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + ), ] @torch.no_grad() @@ -74,8 +79,13 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("latents", required=True, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."), - InputParam.template("image_latents", note="Can be encoded in vae_encoder step and packed in prepare_image_latents step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + ), + InputParam.template("image_latents", note="generated in vae encoder step and updated in input step."), ] @torch.no_grad() @@ -119,10 +129,13 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", ), - InputParam.template("controlnet_conditioning_scale", note="Can be generated in prepare_controlnet_inputs step."), - InputParam.template("controlnet_keep", note="Can be generated in prepare_controlnet_inputs step."), - InputParam.template("num_inference_steps", required=True, note="Can be updated in set_timesteps step."), - InputParam.template("denoiser_input_fields") + InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."), + InputParam( + name="controlnet_keep", + required=True, + type_hint=List[float], + description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step." 
+ ), ] @torch.no_grad() @@ -184,8 +197,13 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare_latents step."), - InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare_latents step." + ), + InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -275,8 +293,13 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam.template("latents", required=True, description="The latents to use for the denoising process. Can be generated in prepare_latents step."), - InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare_latents step." + ), + InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -404,14 +427,19 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), + InputParam.template("image_latents", note="Can be generated from vae encoder step and updated in input step."), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam.template("timesteps", required=True, note="should be updated in set_timesteps step."), + InputParam( + "timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), ] @torch.no_grad() @@ -452,8 +480,13 @@ def loop_expected_components(self) -> List[ComponentSpec]: @property def loop_inputs(self) -> List[InputParam]: return [ - InputParam.template("timesteps", required=True, note="should be generated in set_timesteps step."), - InputParam.template("num_inference_steps", required=True, note="should be updated in set_timesteps step."), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + ), + InputParam.template("num_inference_steps", required=True), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 82a3b6811959..9a83f0d7178a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -1145,7 +1145,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): @property def description(self) -> str: - return "Image Preprocess step. Images can be resized first using QwenImageEditResizeStep." + return "Image Preprocess step. 
Images can be resized first. If a list of images is provided, will return a list of processed images." @property def expected_components(self) -> List[ComponentSpec]: diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index bd2f79ae7c4c..b237031b91d2 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -139,8 +139,8 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam.template("batch_size"), - OutputParam.template("dtype"), + OutputParam(name="batch_size", type_hint=int, description="The batch size of the prompt embeddings"), + OutputParam(name="dtype", type_hint=torch.dtype, description="The data type of the prompt embeddings"), OutputParam.template("prompt_embeds", note="batch-expanded"), OutputParam.template("prompt_embeds_mask", note="batch-expanded"), OutputParam.template("negative_prompt_embeds", note="batch-expanded"), @@ -307,8 +307,8 @@ def intermediate_outputs(self) -> List[OutputParam]: # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, note="updated based on image size if not provided")) - outputs.append(OutputParam(name="width", type_hint=int, note="updated based on image size if not provided")) + outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) + outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) # image latent inputs are modified in place (patchified and batch-expanded) for input_param in self._image_latent_inputs: @@ -476,8 +476,8 @@ def intermediate_outputs(self) -> List[OutputParam]: # `height`/`width` are updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) - outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) + outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) for input_param in self._image_latent_inputs: @@ -658,8 +658,8 @@ def intermediate_outputs(self) -> List[OutputParam]: ] if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="updated based on image size if not provided")) - outputs.append(OutputParam(name="width", type_hint=int, description="updated based on image size if not provided")) + outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) + outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) for input_param in self._image_latent_inputs: @@ -759,8 +759,8 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The 
control image latents (patchified and batch-expanded)."), - OutputParam(name="height", type_hint=int, description="updated based on control image size if not provided"), - OutputParam(name="width", type_hint=int, description="updated based on control image size if not provided"), + OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"), + OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 42593a93f98a..46f0b6f6ff5a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam from .before_denoise import ( QwenImageControlNetBeforeDenoiserStep, QwenImageCreateMaskLatentsStep, @@ -319,7 +319,7 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ model_name = "qwenimage" - block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] + block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep()] block_names = ["text_inputs", "additional_inputs"] @property @@ -373,7 +373,7 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): block_classes = [ QwenImageTextInputsStep(), QwenImageAdditionalInputsStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -512,7 +512,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -598,7 +598,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -682,7 +682,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -777,7 +777,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -880,7 +880,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -981,7 +981,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -1042,7 +1042,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -1279,5 +1279,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 46e8881b9521..158763ce917a 100644 --- 
a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -13,10 +13,11 @@ # limitations under the License. from typing import Optional +import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam from .before_denoise import ( QwenImageCreateMaskLatentsStep, QwenImageEditRoPEInputsStep, @@ -206,7 +207,7 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): block_classes = [ QwenImageEditResizeStep(), QwenImageEditInpaintProcessImagesInputStep(), - QwenImageVaeEncoderStep(input_name="processed_image", output_name="image_latents"), + QwenImageVaeEncoderStep(), ] block_names = ["resize", "preprocess", "encode"] @@ -286,7 +287,7 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -344,8 +345,7 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + QwenImageAdditionalInputsStep(additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -485,7 +485,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -571,7 +571,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -605,7 +605,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -698,7 +698,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -816,5 +816,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 1fb967bf1322..a16dee1c7595 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- +import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam from .before_denoise import ( QwenImageEditPlusRoPEInputsStep, QwenImagePrepareLatentsStep, @@ -211,7 +211,7 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), - QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageEditPlusAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -302,7 +302,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -446,5 +446,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7d6c2ea0635a..2471750f2e0b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam @@ -255,7 +255,7 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), - QwenImageLayeredAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageLayeredAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -342,7 +342,7 @@ def description(self): @property def outputs(self): return [ - OutputParam.latents(), + OutputParam.template("latents"), ] @@ -484,5 +484,5 @@ def description(self): @property def outputs(self): return [ - OutputParam.images(), + OutputParam.template("images"), ] From 8d45ff5bf60a804a5eaf05933f028e2ddf9772f6 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:22:04 +0100 Subject: [PATCH 17/58] apply auto docstring --- .../modular_pipeline_utils.py | 4 +- .../qwenimage/before_denoise.py | 312 ++++++++++++- .../modular_pipelines/qwenimage/decoders.py | 112 +++++ .../modular_pipelines/qwenimage/denoise.py | 295 +++++++++++- .../modular_pipelines/qwenimage/encoders.py | 323 +++++++++++++- .../modular_pipelines/qwenimage/inputs.py | 181 +++++++- .../qwenimage/modular_blocks_qwenimage.py | 421 ++++++++---------- .../modular_blocks_qwenimage_edit.py | 273 ++++++------ .../modular_blocks_qwenimage_edit_plus.py | 150 +++---- .../modular_blocks_qwenimage_layered.py | 216 +++------ 10 files changed, 1616 insertions(+), 671 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 5ef1b98f1ba3..6f1010daf219 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -898,12 +898,12 @@ def make_doc_string( # Add components section if provided if expected_components and len(expected_components) > 0: - components_str = format_components(expected_components, indent_level=2) + 
components_str = format_components(expected_components, indent_level=2, add_empty_lines=False) output += components_str + "\n\n" # Add configs section if provided if expected_configs and len(expected_configs) > 0: - configs_str = format_configs(expected_configs, indent_level=2) + configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False) output += configs_str + "\n\n" # Add inputs section diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index fc795b5f5a2f..0b8cd0f4b2d2 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -117,8 +117,39 @@ def get_timesteps(scheduler, num_inference_steps, strength): # 1. PREPARE LATENTS # ==================== - +# auto_docstring class QwenImagePrepareLatentsStep(ModularPipelineBlocks): + """ + Prepare initial random noise for the generation process + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. + + Outputs: + height (`int`): + if not set, updated to default value + width (`int`): + if not set, updated to default value + latents (`Tensor`): + The initial latents to use for the denoising process + """ model_name = "qwenimage" @property @@ -201,7 +232,41 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): + """ + Prepare initial random noise (B, layers+1, C, H, W) for the generation process + + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. 
+ + Outputs: + height (`int`): + if not set, updated to default value + width (`int`): + if not set, updated to default value + latents (`Tensor`): + The initial latents to use for the denoising process + """ model_name = "qwenimage-layered" @property @@ -285,7 +350,29 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): + """ + Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should already be patchified. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + latents (`Tensor`): + The initial random noise, can be generated in the prepare latents step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and updated in input step.) + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + Outputs: + initial_noise (`Tensor`): + The initial random noise used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. + """ model_name = "qwenimage" @property @@ -366,7 +453,28 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): + """ + Step that creates mask latents from preprocessed mask_image by interpolating to latent space. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. + + Outputs: + mask (`Tensor`): + The mask to use for the inpainting process. + """ model_name = "qwenimage" @property @@ -433,8 +541,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 2. SET TIMESTEPS # ==================== - +# auto_docstring class QwenImageSetTimestepsStep(ModularPipelineBlocks): + """ + Step that sets the scheduler's timesteps for text-to-image generation. Should be run after the prepare latents step. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`): + The initial random noise latents for the denoising process. Can be generated in the prepare latents step. + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process + """ model_name = "qwenimage" @property @@ -500,7 +626,27 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): + """ + Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps.
+ sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and packed in input step.) + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. + """ model_name = "qwenimage-layered" @property @@ -562,7 +708,30 @@ def __call__(self, components, state: PipelineState) -> PipelineState: return components, state +# auto_docstring class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): + """ + Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare latents step. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`): + The latents to use for the denoising process. Can be generated in prepare latents step. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. + num_inference_steps (`int`): + The number of denoising steps to perform at inference time. Updated based on strength. + """ model_name = "qwenimage" @property @@ -646,8 +815,32 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - ## RoPE inputs for denoiser - +# auto_docstring class QwenImageRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the images latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ model_name = "qwenimage" @property @@ -715,7 +908,36 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + image_height (`int`): + The height of the reference image. Can be generated in input step. + image_width (`int`): + The width of the reference image. Can be generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. 
+ prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the images latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ model_name = "qwenimage" @property @@ -790,7 +1012,38 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus. + Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. + Should be placed after prepare_latents step. + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + image_height (`List`): + The heights of the reference images. Can be generated in input step. + image_width (`List`): + The widths of the reference images. Can be generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the image latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ model_name = "qwenimage-edit-plus" @property @@ -866,7 +1119,36 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. 
+ + Outputs: + img_shapes (`List`): + The shapes of the image latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + additional_t_cond (`Tensor`): + The additional t cond, used for RoPE calculation + """ model_name = "qwenimage-layered" @property @@ -948,7 +1230,31 @@ def __call__(self, components, state: PipelineState) -> PipelineState: ## ControlNet inputs for denoiser + +# auto_docstring class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): + """ + Step that prepares inputs for controlnet. Insert before the Denoise Step, after the set_timesteps step. + + Components: + controlnet (`QwenImageControlNetModel`) + + Inputs: + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + Outputs: + controlnet_keep (`List`): + The controlnet keep values + """ model_name = "qwenimage" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 4476e1db9bad..650bf34da7a3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -29,7 +29,27 @@ # after denoising loop (unpack latents) + +#auto_docstring class QwenImageAfterDenoiseStep(ModularPipelineBlocks): + """ + Step that unpacks the latents from a 3D tensor (batch_size, sequence_length, channels) into a 5D tensor (batch_size, channels, 1, height, width) + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + latents (`Tensor`): + The latents to decode, can be generated in the denoise step. + + Outputs: + latents (`Tensor`): + The denoised latents unpacked to B, C, 1, H, W + """ model_name = "qwenimage" @property @@ -80,7 +100,28 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +#auto_docstring class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): + """ + Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising. + + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + + Outputs: + latents (`Tensor`): + Denoised latents.
(unpacked to B, C, layers+1, H, W) + """ model_name = "qwenimage-layered" @property @@ -131,7 +172,23 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # decode step + +#auto_docstring class QwenImageDecoderStep(ModularPipelineBlocks): + """ + Step that decodes the latents to images + + Components: + vae (`AutoencoderKLQwenImage`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + + Outputs: + images (`List`): + Generated images. (tensor output of the vae decoder.) + """ model_name = "qwenimage" @property @@ -189,7 +246,25 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +#auto_docstring class QwenImageLayeredDecoderStep(ModularPipelineBlocks): + """ + Decode unpacked latents (B, C, layers+1, H, W) into layer images. + + Components: + vae (`AutoencoderKLQwenImage`) + image_processor (`VaeImageProcessor`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. + """ model_name = "qwenimage-layered" @property @@ -269,7 +344,25 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # postprocess the decoded images + +#auto_docstring class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): + """ + postprocess the generated image + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + images (`Tensor`): + the generated image tensor from the decoder step + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. + """ model_name = "qwenimage" @property @@ -323,7 +416,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +#auto_docstring class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): + """ + postprocess the generated image, optionally applying the mask overlay to the original image. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + images (`Tensor`): + the generated image tensor from the decoder step + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. Generated in InpaintProcessImagesInputStep. + + Outputs: + images (`List`): + Generated images. + """ model_name = "qwenimage" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index ad6a9677aca3..ff6e411d7632 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -85,7 +85,7 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."
), - InputParam.template("image_latents", note="generated in vae encoder step and updated in input step."), + InputParam.template("image_latents"), ] @torch.no_grad() @@ -197,13 +197,6 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step." - ), - InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -293,13 +286,6 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam.template("attention_kwargs"), - InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step." - ), - InputParam.template("num_inference_steps"), InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", @@ -427,19 +413,19 @@ def inputs(self) -> List[InputParam]: type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam.template("image_latents", note="Can be generated from vae encoder step and updated in input step."), + InputParam.template("image_latents"), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." - ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents"), ] @torch.no_grad() @@ -521,6 +507,38 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports text2image and image2image tasks for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. 
+ """ model_name = "qwenimage" block_classes = [ @@ -546,6 +564,45 @@ def description(self) -> str: # Qwen Image (inpainting) # auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -572,6 +629,46 @@ def description(self) -> str: # Qwen Image (text2image, image2image) with controlnet # auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopBeforeDenoiserControlNet` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports text2img/img2img tasks with controlnet for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + controlnet (`QwenImageControlNetModel`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + control_image_latents (`Tensor`): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) + controlnet_keep (`List`): + The controlnet keep values. Can be generated in prepare_controlnet_inputs step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. 
+ **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -598,6 +695,53 @@ def description(self) -> str: # Qwen Image (inpainting) with controlnet # auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopBeforeDenoiserControlNet` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks with controlnet for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) + controlnet (`QwenImageControlNetModel`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + control_image_latents (`Tensor`): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) + controlnet_keep (`List`): + The controlnet keep values. Can be generated in prepare_controlnet_inputs step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -632,6 +776,40 @@ def description(self) -> str: # Qwen Image Edit (image2image) # auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports QwenImage Edit. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. 
Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -656,6 +834,45 @@ def description(self) -> str: # Qwen Image Edit (inpainting) # auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks for QwenImage Edit. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -682,6 +899,40 @@ def description(self) -> str: # Qwen Image Layered (image2image) # auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method + At each iteration, it runs blocks defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports QwenImage Layered. + + Components: + guider (`ClassifierFreeGuidance`) + transformer (`QwenImageTransformer2DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
+ num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ model_name = "qwenimage-layered" block_classes = [ QwenImageEditLoopBeforeDenoiser, diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 9a83f0d7178a..083ee507ccbb 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -276,7 +276,23 @@ def encode_vae_image( # # In most of our other pipelines, resizing is done as part of the image preprocessing step. # ==================== + +# auto_docstring class QwenImageEditResizeStep(ModularPipelineBlocks): + """ + Image Resize step that resize the image to target area while maintaining the aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + resized_image (`List`): + The resized images + """ model_name = "qwenimage-edit" @@ -334,7 +350,24 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageLayeredResizeStep(ModularPipelineBlocks): + """ + Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + Outputs: + resized_image (`List`): + The resized images + """ model_name = "qwenimage-layered" @property @@ -405,7 +438,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditPlusResizeStep(ModularPipelineBlocks): + """ + Resize images for QwenImage Edit Plus pipeline. + Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text encoding. + Each image is resized independently based on its own aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + resized_image (`List`): + Images resized to 1024x1024 target area for VAE encoding + resized_cond_image (`List`): + Images resized to 384x384 target area for VL text encoding + """ model_name = "qwenimage-edit-plus" @@ -488,7 +540,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== # 2. 
GET IMAGE PROMPT # ==================== + +# auto_docstring class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): + """ + Auto-caption step that generates a text prompt from the input image if none is provided. + Uses the VL model (text_encoder) to generate a description of the image. + If prompt is already provided, this step passes through unchanged. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) + processor (`Qwen2VLProcessor`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + resized_image (`Image`): + The image to generate caption from, should be resized use the resize step + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + Outputs: + prompt (`str`): + The prompt or prompts to guide image generation. If not provided, updated using image caption + """ model_name = "qwenimage-layered" @@ -530,6 +605,16 @@ def inputs(self) -> List[InputParam]: ), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="prompt", + type_hint=str, + description="The prompt or prompts to guide image generation. If not provided, updated using image caption", + ), + ] + @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) @@ -567,7 +652,35 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # ==================== # 3. TEXT ENCODER # ==================== + +# auto_docstring class QwenImageTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step that generates text embeddings to guide the image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use + tokenizer (`Qwen2Tokenizer`): The tokenizer to use + guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ model_name = "qwenimage" def __init__(self): @@ -670,7 +783,34 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) + processor (`Qwen2VLProcessor`) + guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + resized_image (`Image`): + The image prompt to encode, should be resized using resize step + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. 
+ """ model_name = "qwenimage" def __init__(self): @@ -766,7 +906,34 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text embeddings for guiding image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) + processor (`Qwen2VLProcessor`) + guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + resized_cond_image (`Tensor`): + The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ model_name = "qwenimage-edit-plus" @@ -874,7 +1041,35 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== # 4. IMAGE PREPROCESS # ==================== + +# auto_docstring class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + mask_image (`Image`): + Mask image for inpainting. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + Outputs: + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + """ model_name = "qwenimage" @property @@ -954,7 +1149,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + mask_image (`Image`): + Mask image for inpainting. + resized_image (`Image`): + The resized image. should be generated using a resize step + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + Outputs: + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + """ model_name = "qwenimage-edit" @property @@ -1025,7 +1243,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. 
will resize the image to the given height and width. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + + Outputs: + processed_image (`Tensor`): + The processed image + """ model_name = "qwenimage" @property @@ -1087,7 +1324,22 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. Images needs to be resized first. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + resized_image (`List`): + The resized image. should be generated using a resize step + + Outputs: + processed_image (`Tensor`): + The processed image + """ model_name = "qwenimage-edit" @property @@ -1140,7 +1392,22 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state +# auto_docstring class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + resized_image (`List`): + The resized image. should be generated using a resize step + + Outputs: + processed_image (`Tensor`): + The processed image + """ model_name = "qwenimage-edit-plus" @property @@ -1204,8 +1471,26 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # ==================== # 5. VAE ENCODER # ==================== + +# auto_docstring class QwenImageVaeEncoderStep(ModularPipelineBlocks): - """VAE encoder that handles both single images and lists of images with varied resolutions.""" + """ + VAE Encoder step that converts processed_image into latent representations image_latents. + Handles both single images and lists of images with varied resolutions. + + Components: + vae (`AutoencoderKLQwenImage`) + + Inputs: + processed_image (`Tensor`): + The image tensor to encode + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. + """ model_name = "qwenimage" @@ -1297,7 +1582,30 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): + """ + VAE Encoder step that converts `control_image` into latent representations control_image_latents. + + Components: + vae (`AutoencoderKLQwenImage`) + controlnet (`QwenImageControlNetModel`) + control_image_processor (`VaeImageProcessor`) + + Inputs: + control_image (`Image`): + Control image for ControlNet conditioning. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + control_image_latents (`Tensor`): + The latents representing the control image + """ model_name = "qwenimage" @property @@ -1411,7 +1719,20 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # ==================== # 6. 
PERMUTE LATENTS # ==================== + +# auto_docstring class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): + """ + Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing. + + Inputs: + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. (permuted from [B, C, 1, H, W] to [B, 1, C, H, W]) + """ model_name = "qwenimage-layered" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index b237031b91d2..0e03242e5e49 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -109,7 +109,42 @@ def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: in return height, width +# auto_docstring class QwenImageTextInputsStep(ModularPipelineBlocks): + """ + Text input processing step that standardizes text embeddings for the pipeline. + This step: + 1. Determines `batch_size` and `dtype` based on `prompt_embeds` + 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt) + + This block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps. + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) + """ model_name = "qwenimage" @property @@ -217,8 +252,47 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage: update height/width, expand batch, patchify.""" + """ + Input processing step that: + 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size + 2. For additional batch inputs: Expands batch dimensions to match final batch size + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be + generated in input step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + """ model_name = "qwenimage" @@ -385,8 +459,48 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage Edit Plus: handles list of latents with different sizes.""" + """ + Input processing step for Edit Plus that: + 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch + 2. For additional batch inputs: Expands batch dimensions to match final batch size + Height/width defaults to last image in the list. + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`List`): + The image heights calculated from the image latents dimension + image_width (`List`): + The image widths calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified, + concatenated, and batch-expanded) + """ model_name = "qwenimage-edit-plus" @@ -571,8 +685,44 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # same as QwenImageAdditionalInputsStep, but with layered pachifier. + +# auto_docstring class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier.""" + """ + Input processing step for Layered that: + 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size + 2. For additional batch inputs: Expands batch dimensions to match final batch size + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. 
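A note on what "patchified and batch-expanded" means in the input-step docstrings above and below: the sketch that follows is not the pipeline's own code (that lives in the pachifier and input-step blocks), only a minimal torch illustration of the documented shape changes, assuming a 2x2 patch size and an 8x VAE scale factor.

import torch

def patchify(latents: torch.Tensor, patch: int = 2) -> torch.Tensor:
    # (B, C, H, W) -> (B, (H // patch) * (W // patch), C * patch * patch)
    b, c, h, w = latents.shape
    latents = latents.view(b, c, h // patch, patch, w // patch, patch)
    latents = latents.permute(0, 2, 4, 1, 3, 5)
    return latents.reshape(b, (h // patch) * (w // patch), c * patch * patch)

batch_size, num_images_per_prompt = 2, 3
vae_scale_factor = 8  # assumption for illustration
image_latents = torch.randn(batch_size, 16, 64, 64)

# image_height / image_width are recovered from the latent dimensions,
# and height / width fall back to them when not provided
image_height = image_latents.shape[-2] * vae_scale_factor   # 512
image_width = image_latents.shape[-1] * vae_scale_factor    # 512

packed = patchify(image_latents)                                  # (2, 1024, 64)
packed = packed.repeat_interleave(num_images_per_prompt, dim=0)   # (6, 1024, 64)

The same kind of expansion to batch_size * num_images_per_prompt is what "batch-expanded" refers to for prompt_embeds, prompt_embeds_mask, and processed_mask_image.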
+ + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered + pachifier and batch-expanded) + """ model_name = "qwenimage-layered" @@ -738,7 +888,32 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state +# auto_docstring class QwenImageControlNetInputsStep(ModularPipelineBlocks): + """ + prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps. + + Inputs: + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be + generated in input step. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + + Outputs: + control_image_latents (`Tensor`): + The control image latents (patchified and batch-expanded). + height (`int`): + if not provided, updated to control image height + width (`int`): + if not provided, updated to control image width + """ model_name = "qwenimage" @property diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 46f0b6f6ff5a..b50e41bb5079 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -65,26 +65,10 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: prompt (`str`, *optional*): The prompt or prompts to guide image generation. @@ -95,13 +79,13 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Outputs: prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. 
prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage" @@ -130,16 +114,14 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: mask_image (`Image`): Mask image for inpainting. - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -150,14 +132,14 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - TODO: Add description. - processed_mask_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage" @@ -180,14 +162,12 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -196,10 +176,10 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage" @@ -238,11 +218,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): - if `control_image` is not provided, step will be skipped. Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) Inputs: @@ -286,36 +263,50 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): Input step that prepares the inputs for the img2img denoising step. It: Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. 
+ prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) """ model_name = "qwenimage" @@ -335,38 +326,54 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): Input step that prepares the inputs for the inpainting denoising step. It: Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + processed_mask_image (`Tensor`): + The processed mask image (batch-expanded) """ model_name = "qwenimage" @@ -394,30 +401,31 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - TODO: Add description. - width (`None`): - TODO: Add description. - dtype (`None`): - TODO: Add description. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. Outputs: initial_noise (`Tensor`): The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. mask (`Tensor`): The mask to use for the inpainting process. """ @@ -445,26 +453,22 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. 
+ prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. height (`int`, *optional*): @@ -479,7 +483,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -523,34 +527,30 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -563,7 +563,7 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -609,32 +609,28 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. 
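Several of the img2img/inpaint core-denoise docstrings above and below pair `strength` with `num_inference_steps`. Here is a rough sketch of the usual convention (keep the final `strength` fraction of the schedule); it illustrates the idea only and is not the exact logic of this patch's set_timesteps block.

import torch

num_inference_steps = 50
strength = 0.9

timesteps = torch.linspace(1000.0, 0.0, num_inference_steps)  # placeholder schedule
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)

timesteps = timesteps[t_start:]       # 45 of the 50 steps are actually run
num_inference_steps = len(timesteps)

With strength=1.0 the full schedule is kept (generation starts from pure noise); with smaller values denoising starts closer to the input image latents.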
Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -647,7 +643,7 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -693,30 +689,25 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. - control_image_latents (`None`): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -735,12 +726,9 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -788,38 +776,33 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. - control_image_latents (`None`): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -836,12 +819,9 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
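The ControlNet window parameters documented here (`control_guidance_start`, `control_guidance_end`, `controlnet_conditioning_scale`) are usually reduced to a per-step scale. A rough sketch of that bookkeeping, not the block's actual implementation:

controlnet_conditioning_scale = 1.0
control_guidance_start, control_guidance_end = 0.0, 0.8
num_steps = 50

# scale applied to the ControlNet residuals at each denoising step:
# full strength inside the [start, end] fraction of the schedule, zero outside
step_scales = []
for i in range(num_steps):
    progress = i / num_steps
    inside = control_guidance_start <= progress <= control_guidance_end
    step_scales.append(controlnet_conditioning_scale if inside else 0.0)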
Outputs: @@ -891,36 +871,31 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - control_image_latents (`None`): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -937,12 +912,9 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -1058,20 +1030,18 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) 
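As a companion to the decode-step docstrings above and below, this is a generic sketch of what "decode the latents and postprocess" means when output_type="pil". The [-1, 1] output range and the conversion are assumptions for illustration, not the QwenImage VAE's exact normalisation.

import torch
from PIL import Image

def postprocess(decoded: torch.Tensor) -> list:
    # decoded: (B, 3, H, W), assumed in [-1, 1] -> list of PIL images
    decoded = (decoded / 2 + 0.5).clamp(0, 1)
    arrays = (decoded.permute(0, 2, 3, 1).float().cpu().numpy() * 255).round().astype("uint8")
    return [Image.fromarray(a) for a in arrays]

fake_decoder_output = torch.rand(1, 3, 64, 64) * 2 - 1   # stand-in for the vae decoder output
images = postprocess(fake_decoder_output)                # output_type="pil"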
""" model_name = "qwenimage" @@ -1090,22 +1060,20 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage" @@ -1157,42 +1125,18 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: prompt (`str`, *optional*): The prompt or prompts to guide image generation. @@ -1202,8 +1146,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Maximum sequence length for prompt encoding. mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Image`, *optional*): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -1216,14 +1160,14 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Control image for ControlNet conditioning. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. 
Can be generated from text_encoder step. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -1232,29 +1176,26 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_image_latents (`None`, *optional*): - TODO: Add description. + control_image_latents (`Tensor`, *optional*): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): - All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, - txt_seq_lens/negative_txt_seq_lens. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 158763ce917a..0c1fa00842e5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -63,29 +63,14 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit VL encoder step that encode the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. 
negative_prompt (`str`, *optional*): @@ -95,13 +80,13 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage-edit" @@ -128,26 +113,23 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: resized_image (`List`): The resized images - processed_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage-edit" @@ -173,16 +155,13 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. mask_image (`Image`): Mask image for inpainting. padding_mask_crop (`int`, *optional*): @@ -193,14 +172,14 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): Outputs: resized_image (`List`): The resized images - processed_image (`None`): - TODO: Add description. - processed_mask_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage-edit" @@ -252,36 +231,50 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. 
+ negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) """ model_name = "qwenimage-edit" @@ -308,38 +301,54 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + processed_mask_image (`Tensor`): + The processed mask image (batch-expanded) """ model_name = "qwenimage-edit" @@ -368,30 +377,31 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the patchified latents `mask` based on the processed mask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from + vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - TODO: Add description. - width (`None`): - TODO: Add description. - dtype (`None`): - TODO: Add description. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. Outputs: initial_noise (`Tensor`): The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. mask (`Tensor`): The mask to use for the inpainting process. """ @@ -416,32 +426,28 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. 
Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -452,7 +458,7 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -496,34 +502,30 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -536,7 +538,7 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
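For the inpaint denoise steps documented here, the role of `mask` and the packed `image_latents` inside the loop can be sketched as the usual "re-impose the unmasked content" blend. This is an illustration of the idea only; the shapes and noise level are invented for the example.

import torch

seq_len, channels = 1024, 64
latents = torch.randn(1, seq_len, channels)        # current denoised latents
image_latents = torch.randn(1, seq_len, channels)  # packed latents of the input image
mask = (torch.rand(1, seq_len, 1) > 0.5).float()   # 1 = regenerate, 0 = keep

# hypothetical: input-image latents re-noised to the next timestep's level
noisy_image_latents = image_latents + 0.1 * torch.randn_like(image_latents)

latents = mask * latents + (1.0 - mask) * noisy_image_latents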
Outputs: @@ -621,20 +623,18 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage-edit" @@ -653,22 +653,20 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage-edit" @@ -724,41 +722,20 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): @@ -775,10 +752,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): The height in pixels of the generated image. width (`int`): The width in pixels of the generated image. - image_latents (`None`): - TODO: Add description. - processed_mask_image (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -789,12 +766,12 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Strength for img2img/inpainting. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`None`, *optional*): - TODO: Add description. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index a16dee1c7595..726c000f4b38 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -55,47 +55,32 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: + resized_image (`List`): + Images resized to 1024x1024 target area for VAE encoding resized_cond_image (`List`): - The resized images + Images resized to 384x384 target area for VL text encoding prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage-edit-plus" @@ -122,26 +107,25 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
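"Resized independently based on its own aspect ratio to a 1024x1024 target area" can be made concrete with a small helper. The snap-to-multiple-of-32 below is an assumption (chosen so the result stays friendly to an 8x VAE downsample followed by 2x2 patchification), not necessarily the resize processor's exact rounding.

import math

def resize_to_target_area(width: int, height: int, target_area: int = 1024 * 1024, multiple: int = 32):
    # scale both sides by the same factor so the output area lands near target_area
    scale = math.sqrt(target_area / (width * height))
    new_width = max(multiple, round(width * scale / multiple) * multiple)
    new_height = max(multiple, round(height * scale / multiple) * multiple)
    return new_width, new_height

print(resize_to_target_area(1920, 1080))  # (1376, 768): aspect ratio kept, area close to one megapixel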
Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: resized_image (`List`): - The resized images - processed_image (`None`): - TODO: Add description. + Images resized to 1024x1024 target area for VAE encoding + resized_cond_image (`List`): + Images resized to 384x384 target area for VL text encoding + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. """ model_name = "qwenimage-edit-plus" @@ -176,36 +160,50 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): - Defaults height/width from last image in the list. Components: - pachifier (`QwenImagePachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`List`): The image heights calculated from the image latents dimension image_width (`List`): The image widths calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
(patchified, + concatenated, and batch-expanded) """ model_name = "qwenimage-edit-plus" @@ -233,32 +231,28 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - TODO: Add description. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -269,7 +263,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -317,20 +311,18 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The latents to decode, can be generated in the denoise step + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. Outputs: images (`List`): - Generated images. + Generated images. (tensor output of the vae decoder.) """ model_name = "qwenimage-edit-plus" @@ -365,41 +357,19 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) - - prompt_template_encode_start_idx (default: 64) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. negative_prompt (`str`, *optional*): @@ -420,7 +390,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 2471750f2e0b..37a06e9af254 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -56,73 +56,19 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - Configs: - - image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: - 1. Write the caption using natural, descriptive language without structured formats or rich text. - 2. Enrich caption details by including: - - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks - 3. 
Maintain authenticity and accuracy: - - Avoid generalizations - - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: - 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 - 2. 通过加入以下内容,丰富图注细节: - - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 - - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 - - 环境细节:例如天气、光照、颜色、纹理、气氛等 - - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 - 3. 保持真实性与准确性: - - 不要使用笼统的描述 - - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 prompt (`str`, *optional*): - The prompt to encode + The prompt or prompts to guide image generation. use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template negative_prompt (`str`, *optional*): @@ -133,14 +79,16 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Outputs: resized_image (`List`): The resized images + prompt (`str`): + The prompt or prompts to guide image generation. If not provided, updated using image caption prompt_embeds (`Tensor`): - The prompt embeddings + The prompt embeddings. prompt_embeds_mask (`Tensor`): - The encoder attention mask + The encoder attention mask. negative_prompt_embeds (`Tensor`): - The negative prompt embeddings + The negative prompt embeddings. negative_prompt_embeds_mask (`Tensor`): - The negative prompt embeddings mask + The negative prompt embeddings mask. """ model_name = "qwenimage-layered" @@ -168,16 +116,13 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 generator (`Generator`, *optional*): @@ -186,10 +131,10 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Outputs: resized_image (`List`): The resized images - processed_image (`None`): - TODO: Add description. + processed_image (`Tensor`): + The processed image image_latents (`Tensor`): - The latents representing the reference image(s). Single tensor or list depending on input. + The latent representation of the input image. 
""" model_name = "qwenimage-layered" @@ -220,36 +165,46 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImageLayeredPachifier`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. - image_latents (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: batch_size (`int`): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + The batch size of the prompt embeddings dtype (`dtype`): - Data type of model tensor inputs (determined by `prompt_embeds`) + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) image_height (`int`): The image height calculated from the image latents dimension image_width (`int`): The image width calculated from the image latents dimension height (`int`): - The height of the image output + if not provided, updated to image height width (`int`): - The width of the image output + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered + pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -275,28 +230,24 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Layered img2img task. Components: - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - TODO: Add description. - prompt_embeds_mask (`None`): - TODO: Add description. - negative_prompt_embeds (`None`, *optional*): - TODO: Add description. - negative_prompt_embeds_mask (`None`, *optional*): - TODO: Add description. - image_latents (`None`, *optional*): - TODO: Add description. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. 
+ negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. layers (`int`, *optional*, defaults to 4): @@ -309,7 +260,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: @@ -366,83 +317,24 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) - Configs: - - image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: - 1. Write the caption using natural, descriptive language without structured formats or rich text. - 2. Enrich caption details by including: - - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks - 3. Maintain authenticity and accuracy: - - Avoid generalizations - - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: - 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 - 2. 通过加入以下内容,丰富图注细节: - - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 - - 对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等 - - 环境细节:例如天气、光照、颜色、纹理、气氛等 - - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调 - 3. 
保持真实性与准确性: - - 不要使用笼统的描述 - - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) - - prompt_template_encode_start_idx (default: 34) - - tokenizer_max_length (default: 1024) - Inputs: - image (`Image`): - Input image for img2img, editing, or conditioning. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 prompt (`str`, *optional*): - The prompt to encode + The prompt or prompts to guide image generation. use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template negative_prompt (`str`, *optional*): @@ -463,7 +355,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Custom sigmas for the denoising process. attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - denoiser_input_fields (`Tensor`, *optional*): + **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. From f056af1fbb24b79c6cc5360ea782abacd63c34fd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:27:40 +0100 Subject: [PATCH 18/58] make style --- .../modular_pipeline_utils.py | 18 +- .../qwenimage/before_denoise.py | 133 ++++++++----- .../modular_pipelines/qwenimage/decoders.py | 93 +++++---- .../modular_pipelines/qwenimage/denoise.py | 123 ++++++------ .../modular_pipelines/qwenimage/encoders.py | 177 ++++++++++-------- .../modular_pipelines/qwenimage/inputs.py | 91 ++++++--- .../qwenimage/modular_blocks_qwenimage.py | 136 +++++++------- .../modular_blocks_qwenimage_edit.py | 81 ++++---- .../modular_blocks_qwenimage_edit_plus.py | 37 ++-- .../modular_blocks_qwenimage_layered.py | 40 ++-- 10 files changed, 497 insertions(+), 432 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 6f1010daf219..a57212988e28 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -438,7 +438,7 @@ class ConfigSpec: "description": "Number of layers to extract from the image", }, # common intermediate inputs - "prompt_embeds":{ + "prompt_embeds": { "type_hint": torch.Tensor, "required": True, "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.", @@ -531,16 +531,16 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "InputPa raise ValueError(f"InputParam template for {template_name} not found") template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy() - + # Determine the actual param name: # 1. From overrides if provided # 2. From template if present # 3. 
Fall back to template_name name = overrides.pop("name", template_kwargs.pop("name", template_name)) - + if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" - + template_kwargs.update(overrides) return cls(name=name, **template_kwargs) @@ -564,18 +564,18 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "OutputP """Get template for name if exists, otherwise raise ValueError.""" if template_name not in OUTPUT_PARAM_TEMPLATES: raise ValueError(f"OutputParam template for {template_name} not found") - + template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy() - + # Determine the actual param name: # 1. From overrides if provided # 2. From template if present # 3. Fall back to template_name name = overrides.pop("name", template_kwargs.pop("name", template_name)) - + if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" - + template_kwargs.update(overrides) return cls(name=name, **template_kwargs) @@ -913,4 +913,4 @@ def make_doc_string( output += "\n\n" output += format_output_params(outputs, indent_level=2) - return output \ No newline at end of file + return output diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 0b8cd0f4b2d2..418d927f4faa 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -117,6 +117,7 @@ def get_timesteps(scheduler, num_inference_steps, strength): # 1. PREPARE LATENTS # ==================== + # auto_docstring class QwenImagePrepareLatentsStep(ModularPipelineBlocks): """ @@ -137,8 +138,8 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): generator (`Generator`, *optional*): Torch generator for deterministic generation. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. dtype (`dtype`, *optional*, defaults to torch.float32): The dtype of the model inputs, can be generated in input step. @@ -150,6 +151,7 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): latents (`Tensor`): The initial latents to use for the denoising process """ + model_name = "qwenimage" @property @@ -254,8 +256,8 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): generator (`Generator`, *optional*): Torch generator for deterministic generation. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. dtype (`dtype`, *optional*, defaults to torch.float32): The dtype of the model inputs, can be generated in input step. 
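Note on the helpers above: the `template()` classmethods let a block look up a shared entry in INPUT_PARAM_TEMPLATES (and, via `OutputParam.template`, in OUTPUT_PARAM_TEMPLATES) by name, optionally overriding individual fields or appending a block-specific note to the shared description. A minimal usage sketch, assuming the module's import path and using only template names that appear in the hunks of this series:

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

inputs = [
    # plain lookup: name, type_hint, default and description come from the template entry
    InputParam.template("num_inference_steps"),
    # override a single field of the template (here the default value)
    InputParam.template("strength", default=0.9),
    # promote an otherwise optional template entry to a required input for this block
    InputParam.template("height", required=True),
    # append a block-specific note to the shared description
    InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."),
]

An unknown template name raises a ValueError rather than silently creating a new parameter, so mistyped names fail fast.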
@@ -267,6 +269,7 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): latents (`Tensor`): The initial latents to use for the denoising process """ + model_name = "qwenimage-layered" @property @@ -353,7 +356,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): """ - Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should alreadybe patchified. + Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, + prepare_latents. Both noise and image latents should alreadybe patchified. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -362,8 +366,8 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -373,6 +377,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): latents (`Tensor`): The scaled noisy latents to use for inpainting/image-to-image denoising. """ + model_name = "qwenimage" @property @@ -396,10 +401,10 @@ def inputs(self) -> List[InputParam]: ), InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."), InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", ), ] @@ -475,6 +480,7 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage" @property @@ -541,10 +547,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 2. SET TIMESTEPS # ==================== + # auto_docstring class QwenImageSetTimestepsStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step. + Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents + step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -561,6 +569,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): timesteps (`Tensor`): The timesteps to use for the denoising process """ + model_name = "qwenimage" @property @@ -579,10 +588,10 @@ def inputs(self) -> List[InputParam]: InputParam.template("num_inference_steps"), InputParam.template("sigmas"), InputParam( - name="latents", + name="latents", required=True, type_hint=torch.Tensor, - description="The initial random noised latents for the denoising process. Can be generated in prepare latents step." + description="The initial random noised latents for the denoising process. 
Can be generated in prepare latents step.", ), ] @@ -640,13 +649,14 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): sigmas (`List`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and packed in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and packed in input step.) Outputs: timesteps (`Tensor`): The timesteps to use for the denoising process. """ + model_name = "qwenimage-layered" @property @@ -671,9 +681,7 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="timesteps", - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process." + name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process." ), ] @@ -711,7 +719,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # auto_docstring class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare latents step. + Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after + prepare latents step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -732,6 +741,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): num_inference_steps (`int`): The number of denoising steps to perform at inference time. Updated based on strength. """ + model_name = "qwenimage" @property @@ -750,10 +760,10 @@ def inputs(self) -> List[InputParam]: InputParam.template("num_inference_steps"), InputParam.template("sigmas"), InputParam( - "latents", - required=True, + "latents", + required=True, type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare latents step." + description="The latents to use for the denoising process. Can be generated in prepare latents step.", ), InputParam.template("strength", default=0.9), ] @@ -815,6 +825,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - ## RoPE inputs for denoiser + # auto_docstring class QwenImageRoPEInputsStep(ModularPipelineBlocks): """ @@ -822,8 +833,8 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`): The height in pixels of the generated image. width (`int`): @@ -841,6 +852,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage" @property @@ -911,12 +923,13 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): """ - Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. 
Should be placed after prepare_latents step + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after + prepare_latents step Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_height (`int`): The height of the reference image. Can be generated in input step. image_width (`int`): @@ -938,6 +951,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage" @property @@ -948,8 +962,18 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), - InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. Can be generated in input step."), + InputParam( + name="image_height", + required=True, + type_hint=int, + description="The height of the reference image. Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=int, + description="The width of the reference image. Can be generated in input step.", + ), InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), @@ -1016,13 +1040,13 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): """ Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus. - Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. - Should be placed after prepare_latents step. + Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed + after prepare_latents step. Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_height (`List`): The heights of the reference images. Can be generated in input step. image_width (`List`): @@ -1044,6 +1068,7 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage-edit-plus" @property @@ -1058,8 +1083,18 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=List[int], description="The heights of the reference images. Can be generated in input step."), - InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. Can be generated in input step."), + InputParam( + name="image_height", + required=True, + type_hint=List[int], + description="The heights of the reference images. 
Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=List[int], + description="The widths of the reference images. Can be generated in input step.", + ), InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), @@ -1126,8 +1161,8 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image height (`int`): @@ -1149,6 +1184,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): additional_t_cond (`Tensor`): The additional t cond, used for RoPE calculation """ + model_name = "qwenimage-layered" @property @@ -1231,6 +1267,7 @@ def __call__(self, components, state: PipelineState) -> PipelineState: ## ControlNet inputs for denoiser + # auto_docstring class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): """ @@ -1247,7 +1284,8 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -1255,6 +1293,7 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): controlnet_keep (`List`): The controlnet keep values """ + model_name = "qwenimage" @property @@ -1274,16 +1313,16 @@ def inputs(self) -> List[InputParam]: InputParam.template("control_guidance_end"), InputParam.template("controlnet_conditioning_scale"), InputParam( - name="control_image_latents", - required=True, - type_hint=torch.Tensor, - description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step." + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.", ), InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. 
Can be generated in set_timesteps step.", ), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 650bf34da7a3..1adbf6bdd355 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -30,10 +30,12 @@ # after denoising loop (unpack latents) -#auto_docstring + +# auto_docstring class QwenImageAfterDenoiseStep(ModularPipelineBlocks): """ - Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width) + Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, + channels, 1, height, width) Components: pachifier (`QwenImagePachifier`) @@ -50,6 +52,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks): latents (`Tensor`): The denoisedlatents unpacked to B, C, 1, H, W """ + model_name = "qwenimage" @property @@ -70,10 +73,10 @@ def inputs(self) -> List[InputParam]: InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step.", ), ] @@ -81,9 +84,7 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The denoisedlatents unpacked to B, C, 1, H, W" + name="latents", type_hint=torch.Tensor, description="The denoisedlatents unpacked to B, C, 1, H, W" ), ] @@ -100,7 +101,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -#auto_docstring +# auto_docstring class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): """ Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising. @@ -122,6 +123,7 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): latents (`Tensor`): Denoised latents. (unpacked to B, C, layers+1, H, W) """ + model_name = "qwenimage-layered" @property @@ -138,10 +140,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step.", ), InputParam.template("height", required=True), InputParam.template("width", required=True), @@ -173,7 +175,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # decode step -#auto_docstring + +# auto_docstring class QwenImageDecoderStep(ModularPipelineBlocks): """ Step that decodes the latents to images @@ -183,12 +186,14 @@ class QwenImageDecoderStep(ModularPipelineBlocks): Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. Outputs: images (`List`): Generated images. (tensor output of the vae decoder.) 
""" + model_name = "qwenimage" @property @@ -207,10 +212,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", ), ] @@ -246,18 +251,18 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - return components, state -#auto_docstring +# auto_docstring class QwenImageLayeredDecoderStep(ModularPipelineBlocks): """ Decode unpacked latents (B, C, layers+1, H, W) into layer images. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -265,6 +270,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-layered" @property @@ -287,10 +293,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", ), InputParam.template("output_type"), ] @@ -345,7 +351,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # postprocess the decoded images -#auto_docstring + +# auto_docstring class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): """ postprocess the generated image @@ -363,6 +370,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage" @property @@ -384,10 +392,10 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="images", - required=True, - type_hint=torch.Tensor, - description="the generated image tensor from decoders step" + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", ), InputParam.template("output_type"), ] @@ -416,7 +424,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state -#auto_docstring +# auto_docstring class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): """ postprocess the generated image, optional apply the mask overally to the original image.. @@ -430,12 +438,14 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. 
+ The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): Generated images. """ + model_name = "qwenimage" @property @@ -457,16 +467,17 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="images", - required=True, - type_hint=torch.Tensor, - description="the generated image tensor from decoders step" + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", ), InputParam.template("output_type"), InputParam( - name="mask_overlay_kwargs", + name="mask_overlay_kwargs", type_hint=Dict[str, Any], - description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), + description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.", + ), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index ff6e411d7632..3b00fcb274df 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -50,10 +50,10 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", ), ] @@ -80,10 +80,10 @@ def description(self) -> str: def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", ), InputParam.template("image_latents"), ] @@ -131,10 +131,10 @@ def inputs(self) -> List[InputParam]: ), InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."), InputParam( - name="controlnet_keep", - required=True, - type_hint=List[float], - description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step." + name="controlnet_keep", + required=True, + type_hint=List[float], + description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.", ), ] @@ -467,10 +467,10 @@ def loop_expected_components(self) -> List[ComponentSpec]: def loop_inputs(self) -> List[InputParam]: return [ InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", ), InputParam.template("num_inference_steps", required=True), ] @@ -505,21 +505,21 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # Qwen Image (text2image, image2image) + # auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. 
- Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports text2image and image2image tasks for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -539,6 +539,7 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ @@ -551,8 +552,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): @property def description(self) -> str: return ( - "Denoise step that iteratively denoise the latents. \n" - "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "Denoise step that iteratively denoise the latents.\n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n" "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" " - `QwenImageLoopBeforeDenoiser`\n" " - `QwenImageLoopDenoiser`\n" @@ -565,9 +566,9 @@ def description(self) -> str: # auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopDenoiser` - `QwenImageLoopAfterDenoiser` @@ -575,9 +576,8 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -603,6 +603,7 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -630,9 +631,9 @@ def description(self) -> str: # auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. 
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopBeforeDenoiserControlNet` - `QwenImageLoopDenoiser` @@ -640,10 +641,8 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports text2img/img2img tasks with controlnet for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - controlnet (`QwenImageControlNetModel`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -669,6 +668,7 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -696,9 +696,9 @@ def description(self) -> str: # auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopBeforeDenoiserControlNet` - `QwenImageLoopDenoiser` @@ -707,10 +707,8 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks with controlnet for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - controlnet (`QwenImageControlNetModel`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -742,6 +740,7 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -777,18 +776,17 @@ def description(self) -> str: # auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports QwenImage Edit. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -810,6 +808,7 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. 
""" + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -835,9 +834,9 @@ def description(self) -> str: # auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` @@ -845,9 +844,8 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks for QwenImage Edit. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -873,6 +871,7 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -900,18 +899,17 @@ def description(self) -> str: # auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports QwenImage Layered. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -933,6 +931,7 @@ class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageEditLoopBeforeDenoiser, diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 083ee507ccbb..5e1821cca5c0 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -30,7 +30,7 @@ from ...utils import logging from ...utils.torch_utils import unwrap_module from ..modular_pipeline import ModularPipelineBlocks, PipelineState -from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam from .modular_pipeline import QwenImageModularPipeline from .prompt_templates import ( QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE, @@ -277,6 +277,7 @@ def encode_vae_image( # In most of our other pipelines, resizing is done as part of the image preprocessing step. 
# ==================== + # auto_docstring class QwenImageEditResizeStep(ModularPipelineBlocks): """ @@ -293,8 +294,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): resized_image (`List`): The resized images """ - model_name = "qwenimage-edit" + model_name = "qwenimage-edit" @property def description(self) -> str: @@ -319,8 +320,8 @@ def inputs(self) -> List[InputParam]: def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="resized_image", - type_hint=List[PIL.Image.Image], + name="resized_image", + type_hint=List[PIL.Image.Image], description="The resized images", ), ] @@ -353,7 +354,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageLayeredResizeStep(ModularPipelineBlocks): """ - Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio. + Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while + maintaining the aspect ratio. Components: image_resize_processor (`VaeImageProcessor`) @@ -368,11 +370,12 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): resized_image (`List`): The resized images """ + model_name = "qwenimage-layered" @property def description(self) -> str: - return f"Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." + return "Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." @property def expected_components(self) -> List[ComponentSpec]: @@ -399,11 +402,13 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="resized_image", - type_hint=List[PIL.Image.Image], - description="The resized images", - )] + return [ + OutputParam( + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", + ) + ] @staticmethod def check_inputs(resolution: int): @@ -442,8 +447,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): class QwenImageEditPlusResizeStep(ModularPipelineBlocks): """ Resize images for QwenImage Edit Plus pipeline. - Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text encoding. - Each image is resized independently based on its own aspect ratio. + Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text + encoding. Each image is resized independently based on its own aspect ratio. 
Components: image_resize_processor (`VaeImageProcessor`) @@ -484,7 +489,7 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: # image - return [InputParam.template("image")] + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -518,13 +523,11 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): resized_cond_images = [] for image in images: image_width, image_height = image.size - + # For VAE encoder (1024x1024 target area) vae_width, vae_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) - resized_images.append( - components.image_resize_processor.resize(image, height=vae_height, width=vae_width) - ) - + resized_images.append(components.image_resize_processor.resize(image, height=vae_height, width=vae_width)) + # For VL text encoder (384x384 target area) vl_width, vl_height, _ = calculate_dimensions(384 * 384, image_width / image_height) resized_cond_images.append( @@ -541,16 +544,16 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 2. GET IMAGE PROMPT # ==================== + # auto_docstring class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): """ Auto-caption step that generates a text prompt from the input image if none is provided. - Uses the VL model (text_encoder) to generate a description of the image. - If prompt is already provided, this step passes through unchanged. + Uses the VL model (text_encoder) to generate a description of the image. If prompt is already provided, this step + passes through unchanged. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) Inputs: prompt (`str`, *optional*): @@ -590,7 +593,9 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("prompt", required=False), # it is not required for qwenimage-layered, unlike other pipelines + InputParam.template( + "prompt", required=False + ), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -653,15 +658,15 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 3. TEXT ENCODER # ==================== + # auto_docstring class QwenImageTextEncoderStep(ModularPipelineBlocks): """ Text Encoder step that generates text embeddings to guide the image generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -681,6 +686,7 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. 
""" + model_name = "qwenimage" def __init__(self): @@ -706,7 +712,6 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -786,12 +791,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditTextEncoderStep(ModularPipelineBlocks): """ - Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation. + Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image + generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -811,6 +816,7 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. """ + model_name = "qwenimage" def __init__(self): @@ -835,7 +841,6 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -909,12 +914,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): """ - Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text embeddings for guiding image generation. + Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text + embeddings for guiding image generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -922,7 +927,8 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. resized_cond_image (`Tensor`): - The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step + The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using + resize step Outputs: prompt_embeds (`Tensor`): @@ -963,7 +969,6 @@ def expected_components(self) -> List[ComponentSpec]: ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -1042,10 +1047,12 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 4. IMAGE PREPROCESS # ==================== + # auto_docstring class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width. + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be + resized to the given height and width. 
Components: image_mask_processor (`InpaintProcessor`) @@ -1070,6 +1077,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay """ + model_name = "qwenimage" @property @@ -1152,7 +1160,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first. + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be + resized first. Components: image_mask_processor (`InpaintProcessor`) @@ -1173,6 +1182,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay """ + model_name = "qwenimage-edit" @property @@ -1206,11 +1216,7 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image" - ), + OutputParam(name="processed_image", type_hint=torch.Tensor, description="The processed image"), OutputParam( name="processed_mask_image", type_hint=torch.Tensor, @@ -1263,6 +1269,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage" @property @@ -1290,11 +1297,13 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @staticmethod def check_inputs(height, width, vae_scale_factor): @@ -1340,6 +1349,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage-edit" @property @@ -1361,7 +1371,7 @@ def expected_components(self) -> List[ComponentSpec]: def inputs(self) -> List[InputParam]: return [ InputParam( - name="resized_image", + name="resized_image", required=True, type_hint=List[PIL.Image.Image], description="The resized image. should be generated using a resize step", @@ -1370,11 +1380,13 @@ def inputs(self) -> List[InputParam]: @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1395,7 +1407,8 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # auto_docstring class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images. + Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of + processed images. 
Components: image_processor (`VaeImageProcessor`) @@ -1408,6 +1421,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage-edit-plus" @property @@ -1427,20 +1441,24 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam( - name="resized_image", - required=True, - type_hint=List[PIL.Image.Image], - description="The resized image. should be generated using a resize step", - )] + return [ + InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + ) + ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1472,6 +1490,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): # 5. VAE ENCODER # ==================== + # auto_docstring class QwenImageVaeEncoderStep(ModularPipelineBlocks): """ @@ -1509,7 +1528,9 @@ def __init__( output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents". """ if input is None: - input = InputParam(name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode") + input = InputParam( + name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode" + ) if output is None: output = OutputParam.template("image_latents") @@ -1539,13 +1560,13 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: return [ - self._input, # default is "processed_image" + self._input, # default is "processed_image" InputParam.template("generator"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [self._output] # default is "image_latents" + return [self._output] # default is "image_latents" @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -1588,9 +1609,8 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): VAE Encoder step that converts `control_image` into latent representations control_image_latents. Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor + (`VaeImageProcessor`) Inputs: control_image (`Image`): @@ -1606,6 +1626,7 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): control_image_latents (`Tensor`): The latents representing the control image """ + model_name = "qwenimage" @property @@ -1720,6 +1741,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # 6. PERMUTE LATENTS # ==================== + # auto_docstring class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): """ @@ -1733,11 +1755,12 @@ class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): image_latents (`Tensor`): The latent representation of the input image. 
(permuted from [B, C, 1, H, W] to [B, 1, C, H, W]) """ + model_name = "qwenimage-layered" @property def description(self) -> str: - return f"Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." + return "Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." @property def inputs(self) -> List[InputParam]: @@ -1760,4 +1783,4 @@ def __call__(self, components, state: PipelineState) -> PipelineState: block_state.image_latents = latents.permute(0, 2, 1, 3, 4) self.set_block_state(state, block_state) - return components, state \ No newline at end of file + return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 0e03242e5e49..818bbca5ed0a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple import torch @@ -117,7 +117,8 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): 1. Determines `batch_size` and `dtype` based on `prompt_embeds` 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt) - This block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps. + This block should be placed after all encoder steps to process the text embeddings before they are used in + subsequent pipeline steps. Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -145,6 +146,7 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. (batch-expanded) """ + model_name = "qwenimage" @property @@ -271,8 +273,8 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -300,7 +302,7 @@ def __init__( self, image_latent_inputs: Optional[List[InputParam]] = None, additional_batch_inputs: Optional[List[InputParam]] = None, - ): + ): # by default, process `image_latents` if image_latent_inputs is None: image_latent_inputs = [InputParam.template("image_latents")] @@ -319,7 +321,9 @@ def __init__( else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -376,13 +380,17 @@ def intermediate_outputs(self) -> List[OutputParam]: name="image_width", type_hint=int, description="The image width calculated from the image latents dimension", - ) + ), ] # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # image latent inputs are modified in place (patchified and batch-expanded) for input_param in self._image_latent_inputs: @@ -479,8 +487,8 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -526,7 +534,9 @@ def __init__( else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -587,11 +597,15 @@ def intermediate_outputs(self) -> List[OutputParam]: description="The image widths calculated from the image latents dimension", ), ] - + # `height`/`width` are updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) for input_param in self._image_latent_inputs: @@ -686,11 +700,13 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # same as QwenImageAdditionalInputsStep, but with layered pachifier. + # auto_docstring class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): """ Input processing step for Layered that: - 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size + 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch + size 2. For additional batch inputs: Expands batch dimensions to match final batch size Configured inputs: @@ -705,8 +721,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. @@ -720,8 +736,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): width (`int`): if not provided, updated to image width image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered - pachifier and batch-expanded) + image latents used to guide the image generation. Can be generated from vae_encoder step. 
(patchified + with layered pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -748,7 +764,9 @@ def __init__( else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -808,8 +826,12 @@ def intermediate_outputs(self) -> List[OutputParam]: ] if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) for input_param in self._image_latent_inputs: @@ -895,10 +917,11 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): Inputs: control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. height (`int`, *optional*): @@ -914,6 +937,7 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): width (`int`): if not provided, updated to control image width """ + model_name = "qwenimage" @property @@ -923,17 +947,26 @@ def description(self) -> str: @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."), + InputParam( + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step.", + ), InputParam.template("batch_size"), InputParam.template("num_images_per_prompt"), InputParam.template("height"), InputParam.template("width"), ] - + @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The control image latents (patchified and batch-expanded)."), + OutputParam( + name="control_image_latents", + type_hint=torch.Tensor, + description="The control image latents (patchified and batch-expanded).", + ), OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"), OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index b50e41bb5079..5837799d3431 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -13,9 +13,10 @@ # limitations under the License. import torch + from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam from .before_denoise import ( QwenImageControlNetBeforeDenoiserStep, QwenImageCreateMaskLatentsStep, @@ -65,9 +66,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: prompt (`str`, *optional*): @@ -114,8 +114,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) + image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`) Inputs: mask_image (`Image`): @@ -162,8 +161,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -218,9 +216,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): - if `control_image` is not provided, step will be skipped. 
Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor + (`VaeImageProcessor`) Inputs: control_image (`Image`, *optional*): @@ -380,7 +377,9 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): block_classes = [ QwenImageTextInputsStep(), QwenImageAdditionalInputsStep( - additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] + additional_batch_inputs=[ + InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image") + ] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -401,15 +400,14 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -450,13 +448,12 @@ def description(self) -> str: # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -524,13 +521,12 @@ def outputs(self): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -606,13 +602,12 @@ def outputs(self): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) 
for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -686,14 +681,12 @@ def outputs(self): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -707,7 +700,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): negative_prompt_embeds_mask (`Tensor`, *optional*): mask for the negative text embeddings. Can be generated from text_encoder step. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -773,14 +767,12 @@ def outputs(self): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -802,7 +794,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): processed_mask_image (`Tensor`, *optional*): The processed mask image control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. 
generator (`Generator`, *optional*): @@ -868,14 +861,12 @@ def outputs(self): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -895,7 +886,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -1030,12 +1022,12 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -1057,19 +1049,21 @@ def description(self): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) + vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. 
Outputs: images (`List`): @@ -1125,17 +1119,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) controlnet (`QwenImageControlNetModel`) + control_image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: prompt (`str`, *optional*): @@ -1185,7 +1173,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. control_image_latents (`Tensor`, *optional*): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): @@ -1195,7 +1184,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 0c1fa00842e5..e1e5c4335481 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -13,11 +13,12 @@ # limitations under the License. from typing import Optional + import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam from .before_denoise import ( QwenImageCreateMaskLatentsStep, QwenImageEditRoPEInputsStep, @@ -63,10 +64,8 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit VL encoder step that encode the image and text prompts together. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -113,9 +112,8 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -155,9 +153,8 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -354,7 +351,10 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep(additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")] + QwenImageAdditionalInputsStep( + additional_batch_inputs=[ + InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image") + ] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -377,15 +377,14 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): - Create the patchified latents `mask` based on the processed mask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`) Inputs: latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -426,10 +425,8 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -502,10 +499,8 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit inpaint task. 
Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -623,12 +618,12 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -650,19 +645,21 @@ def description(self): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) + vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): @@ -719,19 +716,14 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: image (`Union[Image, List]`): @@ -771,7 +763,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 726c000f4b38..37656cef5d76 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( QwenImageEditPlusRoPEInputsStep, QwenImagePrepareLatentsStep, @@ -55,10 +54,8 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -107,9 +104,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -231,10 +227,8 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -311,12 +305,12 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -357,14 +351,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 37a06e9af254..fdfeab048835 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam @@ -53,14 +52,12 @@ # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -116,9 +113,8 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) Inputs: image (`Union[Image, List]`): @@ -203,8 +199,8 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): width (`int`): if not provided, updated to image width image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered - pachifier and batch-expanded) + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified + with layered pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -230,10 +226,8 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Layered img2img task. Components: - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -317,16 +311,10 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. 
Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) + image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: image (`Union[Image, List]`): From 94525200fdbc55f1f2ed1c6ef64cba8cd990da21 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:35:39 +0100 Subject: [PATCH 19/58] rmove space in make docstring --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index a57212988e28..5468cf54d0fc 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -893,7 +893,7 @@ def make_doc_string( # Add description if description: desc_lines = description.strip().split("\n") - aligned_desc = "\n".join(" " + line for line in desc_lines) + aligned_desc = "\n".join(" " + line.rstrip() for line in desc_lines) output += aligned_desc + "\n\n" # Add components section if provided From 7e9d2b954e734d382a138d69743025eab9f7aeba Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sun, 18 Jan 2026 22:44:44 -1000 Subject: [PATCH 20/58] Apply suggestions from code review --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 2 +- src/diffusers/modular_pipelines/qwenimage/before_denoise.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 5468cf54d0fc..8116f26d39a3 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -549,7 +549,7 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "InputPa class OutputParam: """Specification for an output parameter.""" - name: str = None + name: str type_hint: Any = None description: str = "" kwargs_type: str = None diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 418d927f4faa..aae6eb50d935 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -674,7 +674,7 @@ def inputs(self) -> List[InputParam]: return [ InputParam.template("num_inference_steps"), InputParam.template("sigmas"), - InputParam.template("image_latents", note="Can be generated from vae encoder and packed in input step."), + InputParam.template("image_latents"), ] @property From b7127ce7a72ddffadaf70c334effb24cf0422649 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:54:40 +0100 Subject: [PATCH 21/58] revert change in z --- 
src/diffusers/modular_pipelines/z_image/denoise.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py index a165fb513f3c..5f76a8459fde 100644 --- a/src/diffusers/modular_pipelines/z_image/denoise.py +++ b/src/diffusers/modular_pipelines/z_image/denoise.py @@ -129,7 +129,10 @@ def inputs(self) -> List[Tuple[str, Any]]: type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam.denoiser_input_fields(), + InputParam( + kwargs_type="denoiser_input_fields", + description="The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + ), ] guider_input_names = [] uncond_guider_input_names = [] From 1f9576a2ca97c6bacef9f79b570c7b859b663b13 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:56:14 +0100 Subject: [PATCH 22/58] fix --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 2 +- src/diffusers/modular_pipelines/qwenimage/before_denoise.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 8116f26d39a3..f3b12d716160 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -549,7 +549,7 @@ def template(cls, template_name: str, note: str = None, **overrides) -> "InputPa class OutputParam: """Specification for an output parameter.""" - name: str + name: str type_hint: Any = None description: str = "" kwargs_type: str = None diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index aae6eb50d935..3c9d29260d12 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -649,8 +649,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): sigmas (`List`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be - generated from vae encoder and packed in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. 
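The hunks above swap hand-written `InputParam(...)` entries for the shared templates. A minimal sketch of the pattern follows; the import path is assumed for this patch series and the note text is illustrative:

```python
from typing import List

# Import path assumed; `InputParam` lives in modular_pipeline_utils in this patch series.
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam


def example_inputs() -> List[InputParam]:
    return [
        # Shared templates for common parameters; a `note` is appended to the
        # generated description in parentheses.
        InputParam.template("num_inference_steps"),
        InputParam.template("sigmas"),
        InputParam.template("image_latents", note="Can be generated from the vae encoder step."),
        # Free-form grouped inputs are declared via `kwargs_type` instead of a name.
        InputParam(
            kwargs_type="denoiser_input_fields",
            description="The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
        ),
    ]
```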
Outputs: timesteps (`Tensor`): From 23d06423abf84f70414d2c42908fdd03485a7cf3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 19 Jan 2026 09:23:31 +0000 Subject: [PATCH 23/58] Apply style fixes --- .../pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py | 1 - src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py | 1 - .../pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py | 1 - .../stable_diffusion/pipeline_stable_diffusion_latent_upscale.py | 1 - 4 files changed, 4 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index 94c4c394465b..2ea7307fec32 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -84,7 +84,6 @@ >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL >>> from diffusers.utils import load_image - >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") >>> feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas") >>> controlnet = ControlNetModel.from_pretrained( diff --git a/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py b/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py index d259f7ee7865..b41d9772a7cc 100644 --- a/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py +++ b/src/diffusers/pipelines/hidream_image/pipeline_hidream_image.py @@ -53,7 +53,6 @@ >>> from transformers import AutoTokenizer, LlamaForCausalLM >>> from diffusers import HiDreamImagePipeline - >>> tokenizer_4 = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct") >>> text_encoder_4 = LlamaForCausalLM.from_pretrained( ... "meta-llama/Meta-Llama-3.1-8B-Instruct", diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py index df5b3f5c10a5..5a6b8d5e9f37 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py @@ -85,7 +85,6 @@ >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetPAGImg2ImgPipeline, AutoencoderKL >>> from diffusers.utils import load_image - >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas") >>> controlnet = ControlNetModel.from_pretrained( diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 66d5ffa6b849..a1d0407caf5e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -459,7 +459,6 @@ def __call__( >>> from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline >>> import torch - >>> pipeline = StableDiffusionPipeline.from_pretrained( ... "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 ... ) From 412e51c85651874fa01f6cd044c15bf08e7ddbc8 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sat, 24 Jan 2026 14:04:24 +0530 Subject: [PATCH 24/58] include auto-docstring check in the modular ci. 
(#13004) --- .github/workflows/pr_modular_tests.yml | 20 +++++++++++++++++++- Makefile | 4 ++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr_modular_tests.yml b/.github/workflows/pr_modular_tests.yml index 3bdfb4ca99c6..89b502d364ec 100644 --- a/.github/workflows/pr_modular_tests.yml +++ b/.github/workflows/pr_modular_tests.yml @@ -75,9 +75,27 @@ jobs: if: ${{ failure() }} run: | echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY + check_auto_docs: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.10" + - name: Install dependencies + run: | + pip install --upgrade pip + pip install .[quality] + - name: Check auto docs + run: make modular-autodoctrings + - name: Check if failure + if: ${{ failure() }} + run: | + echo "Auto docstring checks failed. Please run `python utils/modular_auto_docstring.py --fix_and_overwrite`." >> $GITHUB_STEP_SUMMARY run_fast_tests: - needs: [check_code_quality, check_repository_consistency] + needs: [check_code_quality, check_repository_consistency, check_auto_docs] name: Fast PyTorch Modular Pipeline CPU tests runs-on: diff --git a/Makefile b/Makefile index 9af2e8b1a5c9..b90ff82ab268 100644 --- a/Makefile +++ b/Makefile @@ -70,6 +70,10 @@ fix-copies: python utils/check_copies.py --fix_and_overwrite python utils/check_dummies.py --fix_and_overwrite +# Auto docstrings in modular blocks +modular-autodoctrings: + python utils/modular_auto_docstring.py + # Run tests for the library test: From 6a549f5f5532c874f9fc5f8a958a1aec53c8442c Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 25 Jan 2026 11:40:52 +0100 Subject: [PATCH 25/58] initial support: workflow --- .../modular_pipelines/modular_pipeline.py | 316 +++++++++--------- .../modular_pipeline_utils.py | 69 +++- .../qwenimage/modular_blocks_qwenimage.py | 15 + 3 files changed, 247 insertions(+), 153 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index d857fd040955..f9bf257c3606 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -42,6 +42,8 @@ format_components, format_configs, make_doc_string, + combine_inputs, + combine_outputs, ) @@ -242,6 +244,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin): config_name = "modular_config.json" model_name = None + _workflow_map = None @classmethod def _get_signature_keys(cls, obj): @@ -297,6 +300,36 @@ def _get_outputs(self): def outputs(self) -> List[OutputParam]: return self._get_outputs() + # currentlyonly ConditionalPipelineBlocks and SequentialPipelineBlocks support `get_execution_blocks` + def get_execution_blocks(self, **kwargs): + """ + Get the block(s) that would execute given the inputs. + Must be implemented by subclasses that support conditional block selection. + + Args: + **kwargs: Input names and values. Only trigger inputs affect block selection. + """ + raise NotImplementedError(f"`get_execution_blocks` is not implemented for {self.__class__.__name__}") + + # currently only SequentialPipelineBlocks support workflows + @property + def workflow_names(self): + """ + Returns a list of available workflow names. + Must be implemented by subclasses that define `_workflow_map`. 
+ """ + raise NotImplementedError(f"`workflow_names` is not implemented for {self.__class__.__name__}") + + def get_workflow(self, workflow_name: str): + """ + Get the execution blocks for a specific workflow. + Must be implemented by subclasses that define `_workflow_map`. + + Args: + workflow_name: Name of the workflow to retrieve. + """ + raise NotImplementedError(f"`get_workflow` is not implemented for {self.__class__.__name__}") + @classmethod def from_pretrained( cls, @@ -434,72 +467,6 @@ def set_block_state(self, state: PipelineState, block_state: BlockState): if current_value is not param: # Using identity comparison to check if object was modified state.set(param_name, param, input_param.kwargs_type) - @staticmethod - def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> List[InputParam]: - """ - Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if - current default value is None and new default value is not None. Warns if multiple non-None default values - exist for the same input. - - Args: - named_input_lists: List of tuples containing (block_name, input_param_list) pairs - - Returns: - List[InputParam]: Combined list of unique InputParam objects - """ - combined_dict = {} # name -> InputParam - value_sources = {} # name -> block_name - - for block_name, inputs in named_input_lists: - for input_param in inputs: - if input_param.name is None and input_param.kwargs_type is not None: - input_name = "*_" + input_param.kwargs_type - else: - input_name = input_param.name - if input_name in combined_dict: - current_param = combined_dict[input_name] - if ( - current_param.default is not None - and input_param.default is not None - and current_param.default != input_param.default - ): - warnings.warn( - f"Multiple different default values found for input '{input_name}': " - f"{current_param.default} (from block '{value_sources[input_name]}') and " - f"{input_param.default} (from block '{block_name}'). Using {current_param.default}." - ) - if current_param.default is None and input_param.default is not None: - combined_dict[input_name] = input_param - value_sources[input_name] = block_name - else: - combined_dict[input_name] = input_param - value_sources[input_name] = block_name - - return list(combined_dict.values()) - - @staticmethod - def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> List[OutputParam]: - """ - Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first - occurrence of each output name. - - Args: - named_output_lists: List of tuples containing (block_name, output_param_list) pairs - - Returns: - List[OutputParam]: Combined list of unique OutputParam objects - """ - combined_dict = {} # name -> OutputParam - - for block_name, outputs in named_output_lists: - for output_param in outputs: - if (output_param.name not in combined_dict) or ( - combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None - ): - combined_dict[output_param.name] = output_param - - return list(combined_dict.values()) - @property def input_names(self) -> List[str]: return [input_param.name for input_param in self.inputs if input_param.name is not None] @@ -531,7 +498,8 @@ def doc(self): class ConditionalPipelineBlocks(ModularPipelineBlocks): """ A Pipeline Blocks that conditionally selects a block to run based on the inputs. 
Subclasses must implement the - `select_block` method to define the logic for selecting the block. + `select_block` method to define the logic for selecting the block. Currently, we only support selection logic + based on the presence or absence of inputs (i.e., whether they are `None` or not) This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the library implements for all the pipeline blocks (such as loading or saving etc.) @@ -539,15 +507,20 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks): > [!WARNING] > This is an experimental feature and is likely to change in the future. Attributes: - block_classes: List of block classes to be used - block_names: List of prefixes for each block - block_trigger_inputs: List of input names that select_block() uses to determine which block to run + block_classes: List of block classes to be used. Must have the same length as `block_names`. + block_names: List of names for each block. Must have the same length as `block_classes`. + block_trigger_inputs: List of input names that `select_block()` uses to determine which block to run. + For `ConditionalPipelineBlocks`, this does not need to correspond to `block_names` and `block_classes`. + For `AutoPipelineBlocks`, this must have the same length as `block_names` and `block_classes`, + where each element specifies the trigger input for the corresponding block. + default_block_name: Name of the default block to run when no trigger inputs match. + If None, this block can be skipped entirely when no trigger inputs are provided. """ block_classes = [] block_names = [] block_trigger_inputs = [] - default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided + default_block_name = None def __init__(self): sub_blocks = InsertableDict() @@ -611,7 +584,7 @@ def required_inputs(self) -> List[str]: @property def inputs(self) -> List[Tuple[str, Any]]: named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()] - combined_inputs = self.combine_inputs(*named_inputs) + combined_inputs = combine_inputs(*named_inputs) # mark Required inputs only if that input is required by all the blocks for input_param in combined_inputs: if input_param.name in self.required_inputs: @@ -623,15 +596,16 @@ def inputs(self) -> List[Tuple[str, Any]]: @property def intermediate_outputs(self) -> List[str]: named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()] - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) return combined_outputs @property def outputs(self) -> List[str]: named_outputs = [(name, block.outputs) for name, block in self.sub_blocks.items()] - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) return combined_outputs + # used for `__repr__` def _get_trigger_inputs(self) -> set: """ Returns a set of all unique trigger input values found in this block and nested blocks. @@ -660,11 +634,6 @@ def fn_recursive_get_trigger(blocks): return all_triggers - @property - def trigger_inputs(self): - """All trigger inputs including from nested blocks.""" - return self._get_trigger_inputs() - def select_block(self, **kwargs) -> Optional[str]: """ Select the block to run based on the trigger inputs. 
Subclasses must implement this method to define the logic @@ -704,6 +673,39 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: logger.error(error_msg) raise + def get_execution_blocks(self, **kwargs) -> Optional["ModularPipelineBlocks"]: + """ + Get the block(s) that would execute given the inputs. + + Recursively resolves nested ConditionalPipelineBlocks until reaching either: + - A leaf block (no sub_blocks) → returns single `ModularPipelineBlocks` + - A `SequentialPipelineBlocks` → delegates to its `get_execution_blocks()` which returns + a `SequentialPipelineBlocks` containing the resolved execution blocks + + Args: + **kwargs: Input names and values. Only trigger inputs affect block selection. + + Returns: + - `ModularPipelineBlocks`: A leaf block or resolved `SequentialPipelineBlocks` + - `None`: If this block would be skipped (no trigger matched and no default) + """ + trigger_kwargs = {name: kwargs.get(name) for name in self.block_trigger_inputs if name is not None} + block_name = self.select_block(**trigger_kwargs) + + if block_name is None: + block_name = self.default_block_name + + if block_name is None: + return None + + block = self.sub_blocks[block_name] + + # Recursively resolve until we hit a leaf block or a SequentialPipelineBlocks + if block.sub_blocks: + return block.get_execution_blocks(**kwargs) + + return block + def __repr__(self): class_name = self.__class__.__name__ base_class = self.__class__.__bases__[0].__name__ @@ -711,11 +713,11 @@ def __repr__(self): f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n" ) - if self.trigger_inputs: + if self._get_trigger_inputs(): header += "\n" header += " " + "=" * 100 + "\n" header += " This pipeline contains blocks that are selected at runtime based on inputs.\n" - header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n" + header += f" Trigger Inputs: {sorted(self._get_trigger_inputs())}\n" header += " " + "=" * 100 + "\n\n" # Format description with proper indentation @@ -783,23 +785,51 @@ def doc(self): class AutoPipelineBlocks(ConditionalPipelineBlocks): """ A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs. + + This is a specialized version of `ConditionalPipelineBlocks` where: + - Each block has one corresponding trigger input (1:1 mapping) + - Block selection is automatic: the first block whose trigger input is present gets selected + - `block_trigger_inputs` must have the same length as `block_names` and `block_classes` + - Use `None` in `block_trigger_inputs` to specify the default block, i.e the block that will run if no trigger inputs are present + + Attributes: + block_classes: List of block classes to be used. Must have the same length as `block_names` and `block_trigger_inputs`. + block_names: List of names for each block. Must have the same length as `block_classes` and `block_trigger_inputs`. + block_trigger_inputs: List of input names where each element specifies the trigger input for the corresponding block. + Use `None` to mark the default block. 
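For contrast with the automatic selection shown in the example below, here is a minimal sketch of subclassing `ConditionalPipelineBlocks` directly with a hand-written `select_block`. The inner block classes are trivial placeholders, not classes from this patch, and the selection rule sticks to presence/absence of inputs as the docstring requires:

```python
from typing import Optional

# Import path assumed for this patch series.
from diffusers.modular_pipelines.modular_pipeline import ConditionalPipelineBlocks, ModularPipelineBlocks


# Trivial stand-ins; real blocks would define inputs, outputs and __call__.
class InpaintBlock(ModularPipelineBlocks):
    model_name = "example"


class Img2ImgBlock(ModularPipelineBlocks):
    model_name = "example"


class Text2ImgBlock(ModularPipelineBlocks):
    model_name = "example"


class ExampleConditionalBlock(ConditionalPipelineBlocks):
    block_classes = [InpaintBlock, Img2ImgBlock, Text2ImgBlock]
    block_names = ["inpaint", "img2img", "text2img"]
    block_trigger_inputs = ["mask_image", "image"]
    default_block_name = "text2img"

    def select_block(self, **kwargs) -> Optional[str]:
        # Only the declared trigger inputs are passed in; the decision is based
        # on whether they are None or not.
        if kwargs.get("mask_image") is not None:
            return "inpaint"
        if kwargs.get("image") is not None:
            return "img2img"
        # Returning None falls back to `default_block_name` ("text2img" here).
        return None
```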
+ + Example: +```python + class MyAutoBlock(AutoPipelineBlocks): + block_classes = [InpaintEncoderBlock, ImageEncoderBlock, TextEncoderBlock] + block_names = ["inpaint", "img2img", "text2img"] + block_trigger_inputs = ["mask_image", "image", None] # text2img is the default +``` + + With this definition: + - As long as `mask_image` is provided, "inpaint" block runs (regardless of `image` being provided or not) + - If `mask_image` is not provided but `image` is provided, "img2img" block runs + - Otherwise, "text2img" block runs (default, trigger is `None`) """ def __init__(self): super().__init__() + if self.default_block_name is not None: + raise ValueError( + f"In {self.__class__.__name__}, do not set `default_block_name` for AutoPipelineBlocks. " + f"Use `None` in `block_trigger_inputs` to specify the default block." + ) + if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)): raise ValueError( f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same." ) - @property - def default_block_name(self) -> Optional[str]: - """Derive default_block_name from block_trigger_inputs (None entry).""" if None in self.block_trigger_inputs: idx = self.block_trigger_inputs.index(None) - return self.block_names[idx] - return None + self.default_block_name = self.block_names[idx] + def select_block(self, **kwargs) -> Optional[str]: """Select block based on which trigger input is present (not None).""" @@ -853,6 +883,26 @@ def expected_configs(self): expected_configs.append(config) return expected_configs + + @property + def workflow_names(self): + if self._workflow_map is None: + raise NotImplementedError(f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}") + + return list(self._workflow_map.keys()) + + def get_workflow(self, workflow_name: str): + if self._workflow_map is None: + raise NotImplementedError(f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}") + + if workflow_name not in self._workflow_map: + raise ValueError(f"Workflow {workflow_name} not found in {self.__class__.__name__}") + + trigger_inputs = self._workflow_map[workflow_name] + workflow_blocks = self.get_execution_blocks(**trigger_inputs) + + return workflow_blocks + @classmethod def from_blocks_dict( cls, blocks_dict: Dict[str, Any], description: Optional[str] = None @@ -948,7 +998,7 @@ def intermediate_outputs(self) -> List[str]: # filter out them here so they do not end up as intermediate_outputs if name not in inp_names: named_outputs.append((name, block.intermediate_outputs)) - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) return combined_outputs # YiYi TODO: I think we can remove the outputs property @@ -972,6 +1022,7 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: raise return pipeline, state + # used for `trigger_inputs` property def _get_trigger_inputs(self): """ Returns a set of all unique trigger input values found in the blocks. @@ -995,89 +1046,50 @@ def fn_recursive_get_trigger(blocks): return fn_recursive_get_trigger(self.sub_blocks) - @property - def trigger_inputs(self): - return self._get_trigger_inputs() - - def _traverse_trigger_blocks(self, active_inputs): + def get_execution_blocks(self, **kwargs) -> "SequentialPipelineBlocks": """ - Traverse blocks and select which ones would run given the active inputs. 
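A usage sketch for the workflow helpers added above, using the `_workflow_map` this patch series defines on `QwenImageAutoBlocks` (import path assumed; any `SequentialPipelineBlocks` subclass that sets `_workflow_map` behaves the same way):

```python
# Import path assumed for this patch series.
from diffusers.modular_pipelines.qwenimage import QwenImageAutoBlocks

blocks = QwenImageAutoBlocks()

# Workflow names come straight from the class-level `_workflow_map`.
print(blocks.workflow_names)
# e.g. ['text2image', 'image2image', 'inpainting', 'controlnet_text2image', ...]

# `get_workflow` looks up the workflow's trigger inputs and returns a
# SequentialPipelineBlocks containing only the blocks that would execute.
inpainting_blocks = blocks.get_workflow("inpainting")

# Equivalent manual call: pass the trigger inputs yourself (only presence matters).
same_blocks = blocks.get_execution_blocks(prompt=True, mask_image=True, image=True)
```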
+ Get the blocks that would execute given the specified inputs. Args: - active_inputs: Dict of input names to values that are "present" + **kwargs: Input names and values. Only trigger inputs affect block selection. Returns: - OrderedDict of block_name -> block that would execute + SequentialPipelineBlocks containing only the blocks that would execute """ - + # Copy kwargs so we can add outputs as we traverse + active_inputs = dict(kwargs) + def fn_recursive_traverse(block, block_name, active_inputs): result_blocks = OrderedDict() # ConditionalPipelineBlocks (includes AutoPipelineBlocks) if isinstance(block, ConditionalPipelineBlocks): - trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs} - selected_block_name = block.select_block(**trigger_kwargs) - - if selected_block_name is None: - selected_block_name = block.default_block_name - - if selected_block_name is None: + block = block.get_execution_blocks(**active_inputs) + if block is None: return result_blocks - selected_block = block.sub_blocks[selected_block_name] - - if selected_block.sub_blocks: - result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs)) - else: - result_blocks[block_name] = selected_block - if hasattr(selected_block, "outputs"): - for out in selected_block.outputs: - active_inputs[out.name] = True - - return result_blocks - - # SequentialPipelineBlocks or LoopSequentialPipelineBlocks - if block.sub_blocks: + # Has sub_blocks (SequentialPipelineBlocks/ConditionalPipelineBlocks) + if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks): for sub_block_name, sub_block in block.sub_blocks.items(): - blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs) - blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()} - result_blocks.update(blocks_to_update) + nested_blocks = fn_recursive_traverse(sub_block, sub_block_name, active_inputs) + nested_blocks = {f"{block_name}.{k}": v for k, v in nested_blocks.items()} + result_blocks.update(nested_blocks) else: + # Leaf block: single ModularPipelineBlocks or LoopSequentialPipelineBlocks result_blocks[block_name] = block - if hasattr(block, "outputs"): - for out in block.outputs: + # Add outputs to active_inputs so subsequent blocks can use them as triggers + if hasattr(block, "intermediate_outputs"): + for out in block.intermediate_outputs: active_inputs[out.name] = True return result_blocks all_blocks = OrderedDict() for block_name, block in self.sub_blocks.items(): - blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs) - all_blocks.update(blocks_to_update) - return all_blocks - - def get_execution_blocks(self, **kwargs): - """ - Get the blocks that would execute given the specified inputs. - - Args: - **kwargs: Input names and values. Only trigger inputs affect block selection. - Pass any inputs that would be non-None at runtime. 
- - Returns: - SequentialPipelineBlocks containing only the blocks that would execute - - Example: - # Get blocks for inpainting workflow blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, - image=image) - - # Get blocks for text2image workflow blocks = pipeline.get_execution_blocks(prompt="a cat") - """ - # Filter out None values - active_inputs = {k: v for k, v in kwargs.items() if v is not None} - - blocks_triggered = self._traverse_trigger_blocks(active_inputs) - return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered) + nested_blocks = fn_recursive_traverse(block, block_name, active_inputs) + all_blocks.update(nested_blocks) + + return SequentialPipelineBlocks.from_blocks_dict(all_blocks) def __repr__(self): class_name = self.__class__.__name__ @@ -1086,13 +1098,13 @@ def __repr__(self): f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n" ) - if self.trigger_inputs: + if self._get_trigger_inputs(): header += "\n" header += " " + "=" * 100 + "\n" header += " This pipeline contains blocks that are selected at runtime based on inputs.\n" - header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n" + header += f" Trigger Inputs: {[inp for inp in self._get_trigger_inputs() if inp is not None]}\n" # Get first trigger input as example - example_input = next(t for t in self.trigger_inputs if t is not None) + example_input = next(t for t in self._get_trigger_inputs() if t is not None) header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n" header += " " + "=" * 100 + "\n\n" @@ -1281,7 +1293,7 @@ def required_inputs(self) -> List[str]: @property def intermediate_outputs(self) -> List[str]: named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()] - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) for output in self.loop_intermediate_outputs: if output.name not in {output.name for output in combined_outputs}: combined_outputs.append(output) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index f3b12d716160..e075f88a0bbb 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -14,9 +14,11 @@ import inspect import re +import numpy as np +import warnings from collections import OrderedDict from dataclasses import dataclass, field, fields -from typing import Any, Dict, List, Literal, Optional, Type, Union +from typing import Any, Dict, List, Literal, Optional, Type, Union, Set, Tuple import PIL.Image import torch @@ -914,3 +916,68 @@ def make_doc_string( output += format_output_params(outputs, indent_level=2) return output + + +def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> List[InputParam]: + """ + Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if + current default value is None and new default value is not None. Warns if multiple non-None default values + exist for the same input. 
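A small sketch of the merge rule that `combine_inputs` implements, now that it lives in `modular_pipeline_utils` (import path assumed):

```python
# Import path assumed for this patch series.
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam, combine_inputs

block_a_inputs = [InputParam(name="num_inference_steps", default=None)]
block_b_inputs = [InputParam(name="num_inference_steps", default=50)]

merged = combine_inputs(("block_a", block_a_inputs), ("block_b", block_b_inputs))

# The duplicate is collapsed into one entry, and the non-None default wins since
# block_a's default was None; two conflicting non-None defaults would emit a warning.
assert len(merged) == 1
print(merged[0].default)  # 50
```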
+ + Args: + named_input_lists: List of tuples containing (block_name, input_param_list) pairs + + Returns: + List[InputParam]: Combined list of unique InputParam objects + """ + combined_dict = {} # name -> InputParam + value_sources = {} # name -> block_name + + for block_name, inputs in named_input_lists: + for input_param in inputs: + if input_param.name is None and input_param.kwargs_type is not None: + input_name = "*_" + input_param.kwargs_type + else: + input_name = input_param.name + if input_name in combined_dict: + current_param = combined_dict[input_name] + if ( + current_param.default is not None + and input_param.default is not None + and current_param.default != input_param.default + ): + warnings.warn( + f"Multiple different default values found for input '{input_name}': " + f"{current_param.default} (from block '{value_sources[input_name]}') and " + f"{input_param.default} (from block '{block_name}'). Using {current_param.default}." + ) + if current_param.default is None and input_param.default is not None: + combined_dict[input_name] = input_param + value_sources[input_name] = block_name + else: + combined_dict[input_name] = input_param + value_sources[input_name] = block_name + + return list(combined_dict.values()) + +def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> List[OutputParam]: + """ + Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first + occurrence of each output name. + + Args: + named_output_lists: List of tuples containing (block_name, output_param_list) pairs + + Returns: + List[OutputParam]: Combined list of unique OutputParam objects + """ + combined_dict = {} # name -> OutputParam + + for block_name, outputs in named_output_lists: + for output_param in outputs: + if (output_param.name not in combined_dict) or ( + combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None + ): + combined_dict[output_param.name] = output_param + + return list(combined_dict.values()) \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 5837799d3431..66f861da65f3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -1197,6 +1197,21 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): block_classes = AUTO_BLOCKS.values() block_names = AUTO_BLOCKS.keys() + # Workflow map defines the trigger conditions for each workflow. 
+ # How to define: + # - Only include required inputs and trigger inputs (inputs that determine which blocks run) + # - `True` means the workflow triggers when the input is not None (most common case) + # - Use specific values (e.g., `{"strength": 0.5}`) if your `select_block` logic depends on the value + + _workflow_map = { + "text2image": {"prompt": True}, + "image2image": {"prompt": True, "image": True}, + "inpainting": {"prompt": True, "mask_image": True, "image": True}, + "controlnet_text2image": {"prompt": True, "control_image": True}, + "controlnet_image2image": {"prompt": True, "image": True, "control_image": True}, + "controlnet_inpainting": {"prompt": True, "mask_image": True, "image": True, "control_image": True}, + } + @property def description(self): return ( From 20c35da75c5db720a9082617008d9f1022445f00 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 25 Jan 2026 12:11:37 +0100 Subject: [PATCH 26/58] up up --- .../modular_pipelines/modular_pipeline.py | 141 ++++++++++-------- .../modular_pipeline_utils.py | 36 ++++- .../qwenimage/before_denoise.py | 15 +- .../qwenimage/modular_blocks_qwenimage.py | 20 ++- 4 files changed, 121 insertions(+), 91 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index f9bf257c3606..cbc6ee2470d5 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -39,11 +39,12 @@ InputParam, InsertableDict, OutputParam, + combine_inputs, + combine_outputs, format_components, format_configs, + format_workflow, make_doc_string, - combine_inputs, - combine_outputs, ) @@ -303,9 +304,9 @@ def outputs(self) -> List[OutputParam]: # currentlyonly ConditionalPipelineBlocks and SequentialPipelineBlocks support `get_execution_blocks` def get_execution_blocks(self, **kwargs): """ - Get the block(s) that would execute given the inputs. - Must be implemented by subclasses that support conditional block selection. - + Get the block(s) that would execute given the inputs. Must be implemented by subclasses that support + conditional block selection. + Args: **kwargs: Input names and values. Only trigger inputs affect block selection. """ @@ -315,16 +316,15 @@ def get_execution_blocks(self, **kwargs): @property def workflow_names(self): """ - Returns a list of available workflow names. - Must be implemented by subclasses that define `_workflow_map`. + Returns a list of available workflow names. Must be implemented by subclasses that define `_workflow_map`. """ raise NotImplementedError(f"`workflow_names` is not implemented for {self.__class__.__name__}") def get_workflow(self, workflow_name: str): """ - Get the execution blocks for a specific workflow. - Must be implemented by subclasses that define `_workflow_map`. - + Get the execution blocks for a specific workflow. Must be implemented by subclasses that define + `_workflow_map`. + Args: workflow_name: Name of the workflow to retrieve. """ @@ -498,8 +498,8 @@ def doc(self): class ConditionalPipelineBlocks(ModularPipelineBlocks): """ A Pipeline Blocks that conditionally selects a block to run based on the inputs. Subclasses must implement the - `select_block` method to define the logic for selecting the block. Currently, we only support selection logic - based on the presence or absence of inputs (i.e., whether they are `None` or not) + `select_block` method to define the logic for selecting the block. 
Currently, we only support selection logic based + on the presence or absence of inputs (i.e., whether they are `None` or not) This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the library implements for all the pipeline blocks (such as loading or saving etc.) @@ -510,9 +510,9 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks): block_classes: List of block classes to be used. Must have the same length as `block_names`. block_names: List of names for each block. Must have the same length as `block_classes`. block_trigger_inputs: List of input names that `select_block()` uses to determine which block to run. - For `ConditionalPipelineBlocks`, this does not need to correspond to `block_names` and `block_classes`. - For `AutoPipelineBlocks`, this must have the same length as `block_names` and `block_classes`, - where each element specifies the trigger input for the corresponding block. + For `ConditionalPipelineBlocks`, this does not need to correspond to `block_names` and `block_classes`. For + `AutoPipelineBlocks`, this must have the same length as `block_names` and `block_classes`, where each + element specifies the trigger input for the corresponding block. default_block_name: Name of the default block to run when no trigger inputs match. If None, this block can be skipped entirely when no trigger inputs are provided. """ @@ -676,34 +676,34 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: def get_execution_blocks(self, **kwargs) -> Optional["ModularPipelineBlocks"]: """ Get the block(s) that would execute given the inputs. - + Recursively resolves nested ConditionalPipelineBlocks until reaching either: - A leaf block (no sub_blocks) → returns single `ModularPipelineBlocks` - - A `SequentialPipelineBlocks` → delegates to its `get_execution_blocks()` which returns + - A `SequentialPipelineBlocks` → delegates to its `get_execution_blocks()` which returns a `SequentialPipelineBlocks` containing the resolved execution blocks - + Args: **kwargs: Input names and values. Only trigger inputs affect block selection. - + Returns: - `ModularPipelineBlocks`: A leaf block or resolved `SequentialPipelineBlocks` - `None`: If this block would be skipped (no trigger matched and no default) """ trigger_kwargs = {name: kwargs.get(name) for name in self.block_trigger_inputs if name is not None} block_name = self.select_block(**trigger_kwargs) - + if block_name is None: block_name = self.default_block_name - + if block_name is None: return None - + block = self.sub_blocks[block_name] - + # Recursively resolve until we hit a leaf block or a SequentialPipelineBlocks if block.sub_blocks: return block.get_execution_blocks(**kwargs) - + return block def __repr__(self): @@ -784,32 +784,37 @@ def doc(self): class AutoPipelineBlocks(ConditionalPipelineBlocks): """ - A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs. - - This is a specialized version of `ConditionalPipelineBlocks` where: - - Each block has one corresponding trigger input (1:1 mapping) - - Block selection is automatic: the first block whose trigger input is present gets selected - - `block_trigger_inputs` must have the same length as `block_names` and `block_classes` - - Use `None` in `block_trigger_inputs` to specify the default block, i.e the block that will run if no trigger inputs are present - - Attributes: - block_classes: List of block classes to be used. 
Must have the same length as `block_names` and `block_trigger_inputs`. - block_names: List of names for each block. Must have the same length as `block_classes` and `block_trigger_inputs`. - block_trigger_inputs: List of input names where each element specifies the trigger input for the corresponding block. - Use `None` to mark the default block. - - Example: -```python - class MyAutoBlock(AutoPipelineBlocks): - block_classes = [InpaintEncoderBlock, ImageEncoderBlock, TextEncoderBlock] - block_names = ["inpaint", "img2img", "text2img"] - block_trigger_inputs = ["mask_image", "image", None] # text2img is the default -``` - - With this definition: - - As long as `mask_image` is provided, "inpaint" block runs (regardless of `image` being provided or not) - - If `mask_image` is not provided but `image` is provided, "img2img" block runs - - Otherwise, "text2img" block runs (default, trigger is `None`) + A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs. + + This is a specialized version of `ConditionalPipelineBlocks` where: + - Each block has one corresponding trigger input (1:1 mapping) + - Block selection is automatic: the first block whose trigger input is present gets selected + - `block_trigger_inputs` must have the same length as `block_names` and `block_classes` + - Use `None` in `block_trigger_inputs` to specify the default block, i.e the block that will run if no trigger + inputs are present + + Attributes: + block_classes: + List of block classes to be used. Must have the same length as `block_names` and + `block_trigger_inputs`. + block_names: + List of names for each block. Must have the same length as `block_classes` and `block_trigger_inputs`. + block_trigger_inputs: + List of input names where each element specifies the trigger input for the corresponding block. Use + `None` to mark the default block. 
+ + Example: + ```python + class MyAutoBlock(AutoPipelineBlocks): + block_classes = [InpaintEncoderBlock, ImageEncoderBlock, TextEncoderBlock] + block_names = ["inpaint", "img2img", "text2img"] + block_trigger_inputs = ["mask_image", "image", None] # text2img is the default + ``` + + With this definition: + - As long as `mask_image` is provided, "inpaint" block runs (regardless of `image` being provided or not) + - If `mask_image` is not provided but `image` is provided, "img2img" block runs + - Otherwise, "text2img" block runs (default, trigger is `None`) """ def __init__(self): @@ -830,7 +835,6 @@ def __init__(self): idx = self.block_trigger_inputs.index(None) self.default_block_name = self.block_names[idx] - def select_block(self, **kwargs) -> Optional[str]: """Select block based on which trigger input is present (not None).""" for trigger_input, block_name in zip(self.block_trigger_inputs, self.block_names): @@ -883,21 +887,24 @@ def expected_configs(self): expected_configs.append(config) return expected_configs - @property def workflow_names(self): if self._workflow_map is None: - raise NotImplementedError(f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}") - + raise NotImplementedError( + f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}" + ) + return list(self._workflow_map.keys()) def get_workflow(self, workflow_name: str): if self._workflow_map is None: - raise NotImplementedError(f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}") - + raise NotImplementedError( + f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}" + ) + if workflow_name not in self._workflow_map: raise ValueError(f"Workflow {workflow_name} not found in {self.__class__.__name__}") - + trigger_inputs = self._workflow_map[workflow_name] workflow_blocks = self.get_execution_blocks(**trigger_inputs) @@ -1058,7 +1065,7 @@ def get_execution_blocks(self, **kwargs) -> "SequentialPipelineBlocks": """ # Copy kwargs so we can add outputs as we traverse active_inputs = dict(kwargs) - + def fn_recursive_traverse(block, block_name, active_inputs): result_blocks = OrderedDict() @@ -1088,7 +1095,7 @@ def fn_recursive_traverse(block, block_name, active_inputs): for block_name, block in self.sub_blocks.items(): nested_blocks = fn_recursive_traverse(block, block_name, active_inputs) all_blocks.update(nested_blocks) - + return SequentialPipelineBlocks.from_blocks_dict(all_blocks) def __repr__(self): @@ -1098,7 +1105,7 @@ def __repr__(self): f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n" ) - if self._get_trigger_inputs(): + if self._workflow_map is None and self._get_trigger_inputs(): header += "\n" header += " " + "=" * 100 + "\n" header += " This pipeline contains blocks that are selected at runtime based on inputs.\n" @@ -1108,8 +1115,13 @@ def __repr__(self): header += f" Use `get_execution_blocks()` to see selected blocks (e.g. 
`get_execution_blocks({example_input}=...)`).\n" header += " " + "=" * 100 + "\n\n" + description = self.description + if self._workflow_map is not None: + workflow_str = format_workflow(self._workflow_map) + description = f"{self.description}\n\n{workflow_str}" + # Format description with proper indentation - desc_lines = self.description.split("\n") + desc_lines = description.split("\n") desc = [] # First line with "Description:" label desc.append(f" Description: {desc_lines[0]}") @@ -1157,10 +1169,15 @@ def __repr__(self): @property def doc(self): + description = self.description + if self._workflow_map is not None: + workflow_str = format_workflow(self._workflow_map) + description = f"{self.description}\n\n{workflow_str}" + return make_doc_string( self.inputs, self.outputs, - self.description, + description=description, class_name=self.__class__.__name__, expected_components=self.expected_components, expected_configs=self.expected_configs, diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index e075f88a0bbb..6792d7db429e 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -14,11 +14,10 @@ import inspect import re -import numpy as np import warnings from collections import OrderedDict from dataclasses import dataclass, field, fields -from typing import Any, Dict, List, Literal, Optional, Type, Union, Set, Tuple +from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union import PIL.Image import torch @@ -862,6 +861,30 @@ def format_configs(configs, indent_level=4, max_line_length=115, add_empty_lines return "\n".join(formatted_configs) +def format_workflow(workflow_map): + """Format a workflow map into a readable string representation. + + Args: + workflow_map: Dictionary mapping workflow names to trigger inputs + + Returns: + A formatted string representing all workflows + """ + if workflow_map is None: + return "" + + lines = ["Supported workflows:"] + for workflow_name, trigger_inputs in workflow_map.items(): + required_inputs = [k for k, v in trigger_inputs.items() if v] + if required_inputs: + inputs_str = ", ".join(f"`{t}`" for t in required_inputs) + lines.append(f" - `{workflow_name}`: requires {inputs_str}") + else: + lines.append(f" - `{workflow_name}`: default (no additional inputs required)") + + return "\n".join(lines) + + def make_doc_string( inputs, outputs, @@ -920,9 +943,9 @@ def make_doc_string( def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> List[InputParam]: """ - Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if - current default value is None and new default value is not None. Warns if multiple non-None default values - exist for the same input. + Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if current + default value is None and new default value is not None. Warns if multiple non-None default values exist for the + same input. Args: named_input_lists: List of tuples containing (block_name, input_param_list) pairs @@ -960,6 +983,7 @@ def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> Li return list(combined_dict.values()) + def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> List[OutputParam]: """ Combines multiple lists of OutputParam objects from different blocks. 
For duplicate outputs, keeps the first @@ -980,4 +1004,4 @@ def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> ): combined_dict[output_param.name] = output_param - return list(combined_dict.values()) \ No newline at end of file + return list(combined_dict.values()) diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 338caf514b1d..80a379da6be0 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -551,8 +551,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - # auto_docstring class QwenImageSetTimestepsStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents - step. + Step that sets the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -718,8 +717,8 @@ def __call__(self, components, state: PipelineState) -> PipelineState: # auto_docstring class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after - prepare latents step. + Step that sets the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare + latents step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -846,10 +845,6 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): Outputs: img_shapes (`List`): The shapes of the images latents, used for RoPE calculation - txt_seq_lens (`List`): - The sequence lengths of the prompt embeds, used for RoPE calculation - negative_txt_seq_lens (`List`): - The sequence lengths of the negative prompt embeds, used for RoPE calculation """ model_name = "qwenimage" @@ -925,10 +920,6 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): Outputs: img_shapes (`List`): The shapes of the images latents, used for RoPE calculation - txt_seq_lens (`List`): - The sequence lengths of the prompt embeds, used for RoPE calculation - negative_txt_seq_lens (`List`): - The sequence lengths of the negative prompt embeds, used for RoPE calculation """ model_name = "qwenimage" diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 66f861da65f3..9bdc49ff914c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -1113,10 +1113,14 @@ def description(self): class QwenImageAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`. 
- - to run the controlnet workflow, you need to provide `control_image` - - for text-to-image generation, all you need to provide is `prompt` + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `prompt`, `image` + - `inpainting`: requires `prompt`, `mask_image`, `image` + - `controlnet_text2image`: requires `prompt`, `control_image` + - `controlnet_image2image`: requires `prompt`, `image`, `control_image` + - `controlnet_inpainting`: requires `prompt`, `mask_image`, `image`, `control_image` Components: text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): @@ -1214,13 +1218,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" - + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n" - + "- to run the controlnet workflow, you need to provide `control_image`\n" - + "- for text-to-image generation, all you need to provide is `prompt`" - ) + return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage." @property def outputs(self): From 3c3b56c86a93918622fba62497fe818cf0cb3c13 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Thu, 12 Feb 2026 00:57:15 +0000 Subject: [PATCH 27/58] treeat loop sequential pipeline blocks as leaf --- src/diffusers/modular_pipelines/modular_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 45ee4ac09fe5..5490d9c7544c 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -747,7 +747,7 @@ def get_execution_blocks(self, **kwargs) -> Optional["ModularPipelineBlocks"]: block = self.sub_blocks[block_name] # Recursively resolve until we hit a leaf block or a SequentialPipelineBlocks - if block.sub_blocks: + if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks): return block.get_execution_blocks(**kwargs) return block From ba41614e7505b23389997a1b4d7f1a75988bc483 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Thu, 12 Feb 2026 00:57:45 +0000 Subject: [PATCH 28/58] update qwen image docstring note --- .../modular_pipelines/qwenimage/modular_blocks_qwenimage.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 9bdc49ff914c..950438083cb8 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -1204,8 +1204,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): # Workflow map defines the trigger conditions for each workflow. 
# How to define: # - Only include required inputs and trigger inputs (inputs that determine which blocks run) - # - `True` means the workflow triggers when the input is not None (most common case) - # - Use specific values (e.g., `{"strength": 0.5}`) if your `select_block` logic depends on the value + # - currently, only supports `True` means the workflow triggers when the input is not None _workflow_map = { "text2image": {"prompt": True}, From 1f8dc96f17a6822c5bdb8aed56bc58314f07bb1e Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Thu, 12 Feb 2026 00:58:23 +0000 Subject: [PATCH 29/58] add workflow support for sdxl --- .../stable_diffusion_xl/__init__.py | 22 ---- .../stable_diffusion_xl/modular_blocks.py | 117 ++++-------------- 2 files changed, 22 insertions(+), 117 deletions(-) diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py index 59ec46dc6d36..644cc408ba37 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py @@ -23,18 +23,7 @@ else: _import_structure["encoders"] = ["StableDiffusionXLTextEncoderStep"] _import_structure["modular_blocks"] = [ - "ALL_BLOCKS", - "AUTO_BLOCKS", - "CONTROLNET_BLOCKS", - "IMAGE2IMAGE_BLOCKS", - "INPAINT_BLOCKS", - "IP_ADAPTER_BLOCKS", - "TEXT2IMAGE_BLOCKS", "StableDiffusionXLAutoBlocks", - "StableDiffusionXLAutoControlnetStep", - "StableDiffusionXLAutoDecodeStep", - "StableDiffusionXLAutoIPAdapterStep", - "StableDiffusionXLAutoVaeEncoderStep", ] _import_structure["modular_pipeline"] = ["StableDiffusionXLModularPipeline"] @@ -49,18 +38,7 @@ StableDiffusionXLTextEncoderStep, ) from .modular_blocks import ( - ALL_BLOCKS, - AUTO_BLOCKS, - CONTROLNET_BLOCKS, - IMAGE2IMAGE_BLOCKS, - INPAINT_BLOCKS, - IP_ADAPTER_BLOCKS, - TEXT2IMAGE_BLOCKS, StableDiffusionXLAutoBlocks, - StableDiffusionXLAutoControlnetStep, - StableDiffusionXLAutoDecodeStep, - StableDiffusionXLAutoIPAdapterStep, - StableDiffusionXLAutoVaeEncoderStep, ) from .modular_pipeline import StableDiffusionXLModularPipeline else: diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py index 68b5e33755b5..6cba442ca9db 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py @@ -277,6 +277,7 @@ def description(self): # ip-adapter, controlnet, text2img, img2img, inpainting +# auto_docstring class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): block_classes = [ StableDiffusionXLTextEncoderStep, @@ -293,103 +294,29 @@ class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): "decode", ] - @property - def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL.\n" - + "- for image-to-image generation, you need to provide either `image` or `image_latents`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" - + "- to run the controlnet workflow, you need to provide `control_image`\n" - + "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n" - + "- to run the ip_adapter workflow, you need to provide `ip_adapter_image`\n" - + "- for text-to-image generation, all you need to provide is `prompt`" - ) - - -# 
controlnet (input + denoise step) -class StableDiffusionXLAutoControlnetStep(SequentialPipelineBlocks): - block_classes = [ - StableDiffusionXLAutoControlNetInputStep, - StableDiffusionXLAutoControlNetDenoiseStep, - ] - block_names = ["controlnet_input", "controlnet_denoise"] + _workflow_map = { + "text2image": {"prompt": True}, + "image2image": {"image": True, "prompt": True}, + "inpainting": {"mask_image": True, "image": True, "prompt": True}, + "controlnet_text2image": {"control_image": True, "prompt": True}, + "controlnet_image2image": {"control_image": True, "image": True, "prompt": True}, + "controlnet_inpainting": {"control_image": True, "mask_image": True, "image": True, "prompt": True}, + "controlnet_union_text2image": {"control_image": True, "control_mode": True, "prompt": True}, + "controlnet_union_image2image": {"control_image": True, "control_mode": True, "image": True, "prompt": True}, + "controlnet_union_inpainting": {"control_image": True, "control_mode": True, "mask_image": True, "image": True, "prompt": True}, + "ip_adapter_text2image": {"ip_adapter_image": True, "prompt": True}, + "ip_adapter_image2image": {"ip_adapter_image": True, "image": True, "prompt": True}, + "ip_adapter_inpainting": {"ip_adapter_image": True, "mask_image": True, "image": True, "prompt": True}, + "ip_adapter_controlnet_text2image": {"ip_adapter_image": True, "control_image": True, "prompt": True}, + "ip_adapter_controlnet_image2image": {"ip_adapter_image": True, "control_image": True, "image": True, "prompt": True}, + "ip_adapter_controlnet_inpainting": {"ip_adapter_image": True, "control_image": True, "mask_image": True, "image": True, "prompt": True}, + "ip_adapter_controlnet_union_text2image": {"ip_adapter_image": True, "control_image": True, "control_mode": True, "prompt": True}, + "ip_adapter_controlnet_union_image2image": {"ip_adapter_image": True, "control_image": True, "control_mode": True, "image": True, "prompt": True}, + "ip_adapter_controlnet_union_inpainting": {"ip_adapter_image": True, "control_image": True, "control_mode": True, "mask_image": True, "image": True, "prompt": True}, + } @property def description(self): return ( - "Controlnet auto step that prepare the controlnet input and denoise the latents. " - + "It works for both controlnet and controlnet_union and supports text2img, img2img and inpainting tasks." - + " (it should be replace at 'denoise' step)" + "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL." 
) - - -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("input", StableDiffusionXLInputStep), - ("set_timesteps", StableDiffusionXLSetTimestepsStep), - ("prepare_latents", StableDiffusionXLPrepareLatentsStep), - ("prepare_add_cond", StableDiffusionXLPrepareAdditionalConditioningStep), - ("denoise", StableDiffusionXLDenoiseStep), - ("decode", StableDiffusionXLDecodeStep), - ] -) - -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("vae_encoder", StableDiffusionXLVaeEncoderStep), - ("input", StableDiffusionXLInputStep), - ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), - ("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep), - ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep), - ("denoise", StableDiffusionXLDenoiseStep), - ("decode", StableDiffusionXLDecodeStep), - ] -) - -INPAINT_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("vae_encoder", StableDiffusionXLInpaintVaeEncoderStep), - ("input", StableDiffusionXLInputStep), - ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), - ("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep), - ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep), - ("denoise", StableDiffusionXLInpaintDenoiseStep), - ("decode", StableDiffusionXLInpaintDecodeStep), - ] -) - -CONTROLNET_BLOCKS = InsertableDict( - [ - ("denoise", StableDiffusionXLAutoControlnetStep), - ] -) - - -IP_ADAPTER_BLOCKS = InsertableDict( - [ - ("ip_adapter", StableDiffusionXLAutoIPAdapterStep), - ] -) - -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("ip_adapter", StableDiffusionXLAutoIPAdapterStep), - ("vae_encoder", StableDiffusionXLAutoVaeEncoderStep), - ("denoise", StableDiffusionXLCoreDenoiseStep), - ("decode", StableDiffusionXLAutoDecodeStep), - ] -) - - -ALL_BLOCKS = { - "text2img": TEXT2IMAGE_BLOCKS, - "img2img": IMAGE2IMAGE_BLOCKS, - "inpaint": INPAINT_BLOCKS, - "controlnet": CONTROLNET_BLOCKS, - "ip_adapter": IP_ADAPTER_BLOCKS, - "auto": AUTO_BLOCKS, -} From b0b8fcfef7d9000a4767e6a9bf0ca2e2c657d317 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Thu, 12 Feb 2026 00:58:34 +0000 Subject: [PATCH 30/58] add a test suit --- ...st_modular_pipeline_stable_diffusion_xl.py | 173 ++++++++++++++++++ .../test_modular_pipelines_common.py | 35 ++++ 2 files changed, 208 insertions(+) diff --git a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py index 7b55933e4caf..4aec782960b2 100644 --- a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py +++ b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py @@ -267,6 +267,60 @@ def test_controlnet_cfg(self): assert max_diff > 1e-2, "Output with CFG must be different from normal inference" +TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_text2image": [ + ("text_encoder", 
"StableDiffusionXLTextEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_union_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_controlnet_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], +} + + class TestSDXLModularPipelineFast( SDXLModularTesterMixin, SDXLModularIPAdapterTesterMixin, @@ -291,6 +345,9 @@ class TestSDXLModularPipelineFast( batch_params = frozenset(["prompt", "negative_prompt"]) expected_image_output_shape = (1, 3, 64, 64) + expected_workflow_blocks = TEXT2IMAGE_WORKFLOWS + + def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) inputs = { @@ -313,6 +370,63 @@ def test_stable_diffusion_xl_euler(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) +IMAGE2IMAGE_WORKFLOWS = { + "image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + 
("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_union_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_controlnet_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], +} class TestSDXLImg2ImgModularPipelineFast( SDXLModularTesterMixin, @@ -338,6 +452,7 @@ class TestSDXLImg2ImgModularPipelineFast( ) batch_params = frozenset(["prompt", "negative_prompt", "image"]) expected_image_output_shape = (1, 3, 64, 64) + expected_workflow_blocks = IMAGE2IMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -366,6 +481,63 @@ def test_stable_diffusion_xl_euler(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) +INPAINTING_WORKFLOWS = { + "inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLInpaintDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "controlnet_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + 
"controlnet_union_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "ip_adapter_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLInpaintDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "ip_adapter_controlnet_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], +} class SDXLInpaintingModularPipelineFastTests( SDXLModularTesterMixin, @@ -392,6 +564,7 @@ class SDXLInpaintingModularPipelineFastTests( ) batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"]) expected_image_output_shape = (1, 3, 64, 64) + expected_workflow_blocks = INPAINTING_WORKFLOWS def get_dummy_inputs(self, device, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py index 9ee5c6c2ac80..d76815427108 100644 --- a/tests/modular_pipelines/test_modular_pipelines_common.py +++ b/tests/modular_pipelines/test_modular_pipelines_common.py @@ -100,6 +100,14 @@ def batch_params(self) -> frozenset: "See existing pipeline tests for reference." ) + @property + def expected_workflow_blocks(self) -> dict: + raise NotImplementedError( + "You need to set the attribute `expected_workflow_blocks` in the child test class. " + "`expected_workflow_blocks` is a dictionary that maps workflow names to list of block names. " + "See existing pipeline tests for reference." 
+ ) + def setup_method(self): # clean up the VRAM before each test torch.compiler.reset() @@ -341,6 +349,33 @@ def test_save_from_pretrained(self): assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3 + def test_workflow_map(self): + blocks = self.pipeline_blocks_class() + if blocks._workflow_map is None: + pytest.skip("Skipping test as _workflow_map is not set") + + assert hasattr(self, "expected_workflow_blocks") and self.expected_workflow_blocks, ( + "expected_workflow_blocks must be defined in the test class" + ) + + for workflow_name, expected_blocks in self.expected_workflow_blocks.items(): + workflow_blocks = blocks.get_workflow(workflow_name) + actual_blocks = list(workflow_blocks.sub_blocks.items()) + + # Check that the number of blocks matches + assert len(actual_blocks) == len(expected_blocks), ( + f"Workflow '{workflow_name}' has {len(actual_blocks)} blocks, " + f"expected {len(expected_blocks)}" + ) + + # Check that each block name and type matches + for i, ((actual_name, actual_block), (expected_name, expected_class_name)) in enumerate( + zip(actual_blocks, expected_blocks) + ): + assert actual_block.__class__.__name__ == expected_class_name, ( + f"Workflow '{workflow_name}': block '{actual_name}' has type " + f"{actual_block.__class__.__name__}, expected {expected_class_name}" + ) class ModularGuiderTesterMixin: def test_guider_cfg(self, expected_max_diff=1e-2): From fab1013e4d7e6a1869ad91078c2123162e225cc4 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Thu, 12 Feb 2026 05:36:04 +0000 Subject: [PATCH 31/58] add test for qwen-image --- .../modular_blocks_qwenimage_edit.py | 4 + .../qwen/test_modular_pipeline_qwenimage.py | 134 ++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index e1e5c4335481..5ed54b50e179 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -774,6 +774,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() + _workflow_map = { + "edit": {"prompt": True, "image": True}, + "edit_inpainting": {"prompt": True, "mask_image": True, "image": True}, + } @property def description(self): diff --git a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py index f4bd27b7ea47..b2bf55396ae6 100644 --- a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py +++ b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py @@ -30,6 +30,102 @@ from ..test_modular_pipelines_common import ModularGuiderTesterMixin, ModularPipelineTesterMixin +QWEN_IMAGE_TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("denoise.input", "QwenImageTextInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.denoise", "QwenImageDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "image2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + 
("vae_encoder.preprocess", "QwenImageProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_img2img_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.denoise", "QwenImageDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "inpainting": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageInpaintProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.denoise", "QwenImageInpaintDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"), + ], + "controlnet_text2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"), + ("denoise.input", "QwenImageTextInputsStep"), + ("denoise.controlnet_input", "QwenImageControlNetInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"), + ("denoise.controlnet_denoise", "QwenImageControlNetDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "controlnet_image2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.controlnet_input", "QwenImageControlNetInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_img2img_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"), + ("denoise.controlnet_denoise", "QwenImageControlNetDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "controlnet_inpainting": [ + ("text_encoder", 
"QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageInpaintProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.controlnet_input", "QwenImageControlNetInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"), + ("denoise.controlnet_denoise", "QwenImageInpaintControlNetDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"), + ], +} + class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin): pipeline_class = QwenImageModularPipeline pipeline_blocks_class = QwenImageAutoBlocks @@ -37,6 +133,7 @@ class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuider params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"]) batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"]) + expected_workflow_blocks = QWEN_IMAGE_TEXT2IMAGE_WORKFLOWS def get_dummy_inputs(self): generator = self.get_generator() @@ -55,6 +152,42 @@ def get_dummy_inputs(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=5e-4) +QWEN_IMAGE_EDIT_WORKFLOWS = { + "edit": [ + ("text_encoder.resize", "QwenImageEditResizeStep"), + ("text_encoder.encode", "QwenImageEditTextEncoderStep"), + ("vae_encoder.resize", "QwenImageEditResizeStep"), + ("vae_encoder.preprocess", "QwenImageEditProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsStep"), + ("denoise.prepare_rope_inputs", "QwenImageEditRoPEInputsStep"), + ("denoise.denoise", "QwenImageEditDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "edit_inpainting": [ + ("text_encoder.resize", "QwenImageEditResizeStep"), + ("text_encoder.encode", "QwenImageEditTextEncoderStep"), + ("vae_encoder.resize", "QwenImageEditResizeStep"), + ("vae_encoder.preprocess", "QwenImageEditInpaintProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"), 
+ ("denoise.prepare_rope_inputs", "QwenImageEditRoPEInputsStep"), + ("denoise.denoise", "QwenImageEditInpaintDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"), + ], +} class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin): pipeline_class = QwenImageEditModularPipeline @@ -63,6 +196,7 @@ class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGu params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"]) batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"]) + expected_workflow_blocks = QWEN_IMAGE_EDIT_WORKFLOWS def get_dummy_inputs(self): generator = self.get_generator() From 931d62c081681d686d326206bd0dcac3b53105c0 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Thu, 12 Feb 2026 21:22:15 +0000 Subject: [PATCH 32/58] refactor flux a bit, seperate modular_blocks into modular_blocks_flux and modular_blocks_flux_kontext + support workflow --- .../modular_pipelines/flux/__init__.py | 34 +- .../modular_pipelines/flux/encoders.py | 2 +- .../modular_pipelines/flux/modular_blocks.py | 446 ------------------ .../flux/modular_blocks_flux.py | 244 ++++++++++ .../flux/modular_blocks_flux_kontext.py | 235 +++++++++ 5 files changed, 484 insertions(+), 477 deletions(-) delete mode 100644 src/diffusers/modular_pipelines/flux/modular_blocks.py create mode 100644 src/diffusers/modular_pipelines/flux/modular_blocks_flux.py create mode 100644 src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py diff --git a/src/diffusers/modular_pipelines/flux/__init__.py b/src/diffusers/modular_pipelines/flux/__init__.py index ec00986611c8..4754ed01ce6a 100644 --- a/src/diffusers/modular_pipelines/flux/__init__.py +++ b/src/diffusers/modular_pipelines/flux/__init__.py @@ -21,21 +21,8 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = ["FluxTextEncoderStep"] - _import_structure["modular_blocks"] = [ - "ALL_BLOCKS", - "AUTO_BLOCKS", - "AUTO_BLOCKS_KONTEXT", - "FLUX_KONTEXT_BLOCKS", - "TEXT2IMAGE_BLOCKS", - "FluxAutoBeforeDenoiseStep", - "FluxAutoBlocks", - "FluxAutoDecodeStep", - "FluxAutoDenoiseStep", - "FluxKontextAutoBlocks", - "FluxKontextAutoDenoiseStep", - "FluxKontextBeforeDenoiseStep", - ] + _import_structure["modular_blocks_flux"] = ["FluxAutoBlocks"] + _import_structure["modular_blocks_flux_kontext"] = ["FluxKontextAutoBlocks"] _import_structure["modular_pipeline"] = ["FluxKontextModularPipeline", "FluxModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -45,21 +32,8 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .encoders import FluxTextEncoderStep - from .modular_blocks import ( - ALL_BLOCKS, - AUTO_BLOCKS, - AUTO_BLOCKS_KONTEXT, - FLUX_KONTEXT_BLOCKS, - TEXT2IMAGE_BLOCKS, - FluxAutoBeforeDenoiseStep, - FluxAutoBlocks, - FluxAutoDecodeStep, - FluxAutoDenoiseStep, - FluxKontextAutoBlocks, - FluxKontextAutoDenoiseStep, - FluxKontextBeforeDenoiseStep, - ) + from .modular_blocks_flux import FluxAutoBlocks + from .modular_blocks_flux_kontext import FluxKontextAutoBlocks from .modular_pipeline import FluxKontextModularPipeline, FluxModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/flux/encoders.py 
b/src/diffusers/modular_pipelines/flux/encoders.py index f0314d4771b0..06bd8fd07b3d 100644 --- a/src/diffusers/modular_pipelines/flux/encoders.py +++ b/src/diffusers/modular_pipelines/flux/encoders.py @@ -206,7 +206,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState): return components, state -class FluxVaeEncoderDynamicStep(ModularPipelineBlocks): +class FluxVaeEncoderStep(ModularPipelineBlocks): model_name = "flux" def __init__( diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py deleted file mode 100644 index bd9b2d1b40c9..000000000000 --- a/src/diffusers/modular_pipelines/flux/modular_blocks.py +++ /dev/null @@ -1,446 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict -from .before_denoise import ( - FluxImg2ImgPrepareLatentsStep, - FluxImg2ImgSetTimestepsStep, - FluxKontextRoPEInputsStep, - FluxPrepareLatentsStep, - FluxRoPEInputsStep, - FluxSetTimestepsStep, -) -from .decoders import FluxDecodeStep -from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep -from .encoders import ( - FluxKontextProcessImagesInputStep, - FluxProcessImagesInputStep, - FluxTextEncoderStep, - FluxVaeEncoderDynamicStep, -) -from .inputs import ( - FluxInputsDynamicStep, - FluxKontextInputsDynamicStep, - FluxKontextSetResolutionStep, - FluxTextInputStep, -) - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# vae encoder (run before before_denoise) -FluxImg2ImgVaeEncoderBlocks = InsertableDict( - [("preprocess", FluxProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep())] -) - - -class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): - model_name = "flux" - - block_classes = FluxImg2ImgVaeEncoderBlocks.values() - block_names = FluxImg2ImgVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that preprocess andencode the image inputs into their latent representations." - - -class FluxAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [FluxImg2ImgVaeEncoderStep] - block_names = ["img2img"] - block_trigger_inputs = ["image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block that works for img2img tasks.\n" - + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided." - + " - if `image` is not provided, step will be skipped." 
- ) - - -# Flux Kontext vae encoder (run before before_denoise) - -FluxKontextVaeEncoderBlocks = InsertableDict( - [("preprocess", FluxKontextProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep(sample_mode="argmax"))] -) - - -class FluxKontextVaeEncoderStep(SequentialPipelineBlocks): - model_name = "flux-kontext" - - block_classes = FluxKontextVaeEncoderBlocks.values() - block_names = FluxKontextVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that preprocess andencode the image inputs into their latent representations." - - -class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [FluxKontextVaeEncoderStep] - block_names = ["img2img"] - block_trigger_inputs = ["image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block that works for img2img tasks.\n" - + " - `FluxKontextVaeEncoderStep` (img2img) is used when only `image` is provided." - + " - if `image` is not provided, step will be skipped." - ) - - -# before_denoise: text2img -FluxBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ] -) - - -class FluxBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxBeforeDenoiseBlocks.values() - block_names = FluxBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation." - - -# before_denoise: img2img -FluxImg2ImgBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxImg2ImgSetTimestepsStep()), - ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ] -) - - -class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxImg2ImgBeforeDenoiseBlocks.values() - block_names = FluxImg2ImgBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs for the denoise step for img2img task." - - -# before_denoise: all task (text2img, img2img) -class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks): - model_name = "flux-kontext" - block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step.\n" - + "This is an auto pipeline block that works for text2image.\n" - + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" - + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" - ) - - -# before_denoise: FluxKontext - -FluxKontextBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxKontextRoPEInputsStep()), - ] -) - - -class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxKontextBeforeDenoiseBlocks.values() - block_names = FluxKontextBeforeDenoiseBlocks.keys() - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step\n" - "for img2img/text2img task for Flux Kontext." 
- ) - - -class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxKontextBeforeDenoiseStep, FluxBeforeDenoiseStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step.\n" - + "This is an auto pipeline block that works for text2image.\n" - + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" - + " - `FluxKontextBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" - ) - - -# denoise: text2image -class FluxAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxDenoiseStep] - block_names = ["denoise"] - block_trigger_inputs = [None] - - @property - def description(self) -> str: - return ( - "Denoise step that iteratively denoise the latents. " - "This is a auto pipeline block that works for text2image and img2img tasks." - " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks." - ) - - -# denoise: Flux Kontext - - -class FluxKontextAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxKontextDenoiseStep] - block_names = ["denoise"] - block_trigger_inputs = [None] - - @property - def description(self) -> str: - return ( - "Denoise step that iteratively denoise the latents for Flux Kontext. " - "This is a auto pipeline block that works for text2image and img2img tasks." - " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks." - ) - - -# decode: all task (text2img, img2img) -class FluxAutoDecodeStep(AutoPipelineBlocks): - block_classes = [FluxDecodeStep] - block_names = ["non-inpaint"] - block_trigger_inputs = [None] - - @property - def description(self): - return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`" - - -# inputs: text2image/img2img -FluxImg2ImgBlocks = InsertableDict( - [("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())] -) - - -class FluxImg2ImgInputStep(SequentialPipelineBlocks): - model_name = "flux" - block_classes = FluxImg2ImgBlocks.values() - block_names = FluxImg2ImgBlocks.keys() - - @property - def description(self): - return "Input step that prepares the inputs for the img2img denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - - -class FluxAutoInputStep(AutoPipelineBlocks): - block_classes = [FluxImg2ImgInputStep, FluxTextInputStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self): - return ( - "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. 
\n" - " This is an auto pipeline block that works for text2image/img2img tasks.\n" - + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" - + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n" - ) - - -# inputs: Flux Kontext - -FluxKontextBlocks = InsertableDict( - [ - ("set_resolution", FluxKontextSetResolutionStep()), - ("text_inputs", FluxTextInputStep()), - ("additional_inputs", FluxKontextInputsDynamicStep()), - ] -) - - -class FluxKontextInputStep(SequentialPipelineBlocks): - model_name = "flux-kontext" - block_classes = FluxKontextBlocks.values() - block_names = FluxKontextBlocks.keys() - - @property - def description(self): - return ( - "Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - ) - - -class FluxKontextAutoInputStep(AutoPipelineBlocks): - block_classes = [FluxKontextInputStep, FluxTextInputStep] - # block_classes = [FluxKontextInputStep] - block_names = ["img2img", "text2img"] - # block_names = ["img2img"] - block_trigger_inputs = ["image_latents", None] - # block_trigger_inputs = ["image_latents"] - - @property - def description(self): - return ( - "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" - " This is an auto pipeline block that works for text2image/img2img tasks.\n" - + " - `FluxKontextInputStep` (img2img) is used when `image_latents` is provided.\n" - + " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present." - ) - - -class FluxCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "flux" - block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep] - block_names = ["input", "before_denoise", "denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. \n" - + " - `FluxAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step supports text-to-image and image-to-image tasks for Flux:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings." - ) - - -class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "flux-kontext" - block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextAutoDenoiseStep] - block_names = ["input", "before_denoise", "denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. \n" - + " - `FluxKontextAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `FluxKontextAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `FluxKontextAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step supports text-to-image and image-to-image tasks for Flux:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings." 
- ) - - -# Auto blocks (text2image and img2img) -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxAutoVaeEncoderStep()), - ("denoise", FluxCoreDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -AUTO_BLOCKS_KONTEXT = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxKontextAutoVaeEncoderStep()), - ("denoise", FluxKontextCoreDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - - -class FluxAutoBlocks(SequentialPipelineBlocks): - model_name = "flux" - - block_classes = AUTO_BLOCKS.values() - block_names = AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for text-to-image and image-to-image using Flux.\n" - + "- for text-to-image generation, all you need to provide is `prompt`\n" - + "- for image-to-image generation, you need to provide either `image` or `image_latents`" - ) - - -class FluxKontextAutoBlocks(FluxAutoBlocks): - model_name = "flux-kontext" - - block_classes = AUTO_BLOCKS_KONTEXT.values() - block_names = AUTO_BLOCKS_KONTEXT.keys() - - -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("input", FluxTextInputStep()), - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ("denoise", FluxDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxVaeEncoderDynamicStep()), - ("input", FluxImg2ImgInputStep()), - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxImg2ImgSetTimestepsStep()), - ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ("denoise", FluxDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -FLUX_KONTEXT_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxVaeEncoderDynamicStep(sample_mode="argmax")), - ("input", FluxKontextInputStep()), - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxKontextRoPEInputsStep()), - ("denoise", FluxKontextDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "img2img": IMAGE2IMAGE_BLOCKS, - "auto": AUTO_BLOCKS, - "auto_kontext": AUTO_BLOCKS_KONTEXT, - "kontext": FLUX_KONTEXT_BLOCKS, -} diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py new file mode 100644 index 000000000000..7099f89f0b0e --- /dev/null +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py @@ -0,0 +1,244 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
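# An illustrative usage sketch, not part of the hunks above/below: it assumes `get_workflow()`
# resolves a named workflow from `_workflow_map` into a reduced set of blocks, as exercised by
# the `test_workflow_map` test added in this series, and that `FluxAutoBlocks` (defined later
# in this module) is importable from the `flux` subpackage as wired up in its `__init__.py`.
from diffusers.modular_pipelines.flux import FluxAutoBlocks

blocks = FluxAutoBlocks()
# "img2img" is declared in `_workflow_map` to trigger on `image` (plus the required `prompt`),
# so resolving it should return only the blocks needed for image-to-image generation.
img2img_blocks = blocks.get_workflow("img2img")
for name, block in img2img_blocks.sub_blocks.items():
    print(name, block.__class__.__name__)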
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict +from .before_denoise import ( + FluxImg2ImgPrepareLatentsStep, + FluxImg2ImgSetTimestepsStep, + FluxKontextRoPEInputsStep, + FluxPrepareLatentsStep, + FluxRoPEInputsStep, + FluxSetTimestepsStep, +) +from .decoders import FluxDecodeStep +from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep +from .encoders import ( + FluxKontextProcessImagesInputStep, + FluxProcessImagesInputStep, + FluxTextEncoderStep, + FluxVaeEncoderStep, +) +from .inputs import ( + FluxInputsDynamicStep, + FluxKontextInputsDynamicStep, + FluxKontextSetResolutionStep, + FluxTextInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# vae encoder (run before before_denoise) +FluxImg2ImgVaeEncoderBlocks = InsertableDict( + [("preprocess", FluxProcessImagesInputStep()), ("encode", FluxVaeEncoderStep())] +) + +# auto_docstring +class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + model_name = "flux" + + block_classes = FluxImg2ImgVaeEncoderBlocks.values() + block_names = FluxImg2ImgVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return "Vae encoder step that preprocess andencode the image inputs into their latent representations." + +# auto_docstring +class FluxAutoVaeEncoderStep(AutoPipelineBlocks): + model_name = "flux" + block_classes = [FluxImg2ImgVaeEncoderStep] + block_names = ["img2img"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "Vae encoder step that encode the image inputs into their latent representations.\n" + + "This is an auto pipeline block that works for img2img tasks.\n" + + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided." + + " - if `image` is not provided, step will be skipped." + ) + +# before_denoise: text2img +FluxBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", FluxPrepareLatentsStep()), + ("set_timesteps", FluxSetTimestepsStep()), + ("prepare_rope_inputs", FluxRoPEInputsStep()), + ] +) + +# auto_docstring +class FluxBeforeDenoiseStep(SequentialPipelineBlocks): + block_classes = FluxBeforeDenoiseBlocks.values() + block_names = FluxBeforeDenoiseBlocks.keys() + + @property + def description(self): + return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation." + + +# before_denoise: img2img +FluxImg2ImgBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", FluxPrepareLatentsStep()), + ("set_timesteps", FluxImg2ImgSetTimestepsStep()), + ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), + ("prepare_rope_inputs", FluxRoPEInputsStep()), + ] +) + +# auto_docstring +class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "flux" + block_classes = FluxImg2ImgBeforeDenoiseBlocks.values() + block_names = FluxImg2ImgBeforeDenoiseBlocks.keys() + + @property + def description(self): + return "Before denoise step that prepare the inputs for the denoise step for img2img task." 
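# A minimal sketch of the routing rule encoded by `FluxAutoBeforeDenoiseStep` below:
# `block_trigger_inputs = ["image_latents", None]` pairs each trigger input with the block
# class at the same index, and `None` marks the default branch. The helper here is purely
# illustrative of that rule; it is not the actual `AutoPipelineBlocks` implementation.
def _select_before_denoise_branch(image_latents=None) -> str:
    # The img2img branch triggers whenever `image_latents` is present in the pipeline state...
    if image_latents is not None:
        return "img2img"  # -> FluxImg2ImgBeforeDenoiseStep
    # ...otherwise the default (None) trigger selects the text2image branch.
    return "text2image"  # -> FluxBeforeDenoiseStep

assert _select_before_denoise_branch() == "text2image"
assert _select_before_denoise_branch(image_latents=object()) == "img2img"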
+ + +# before_denoise: all task (text2img, img2img) +# auto_docstring +class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks): + model_name = "flux" + block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep] + block_names = ["img2img", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs for the denoise step.\n" + + "This is an auto pipeline block that works for text2image.\n" + + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" + + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" + ) + + + +# inputs: text2image/img2img +FluxImg2ImgBlocks = InsertableDict( + [("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())] +) + +# auto_docstring +class FluxImg2ImgInputStep(SequentialPipelineBlocks): + model_name = "flux" + block_classes = FluxImg2ImgBlocks.values() + block_names = FluxImg2ImgBlocks.keys() + + @property + def description(self): + return "Input step that prepares the inputs for the img2img denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" + " - update height/width based `image_latents`, patchify `image_latents`." + + +# auto_docstring +class FluxAutoInputStep(AutoPipelineBlocks): + model_name = "flux" + + block_classes = [FluxImg2ImgInputStep, FluxTextInputStep] + block_names = ["img2img", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" + " This is an auto pipeline block that works for text2image/img2img tasks.\n" + + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" + + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n" + ) + + +# auto_docstring +class FluxCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "flux" + block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxDenoiseStep] + block_names = ["input", "before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process. \n" + + " - `FluxAutoInputStep` (input) standardizes the inputs for the denoising step.\n" + + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" + + " - `FluxDenoiseStep` (denoise) iteratively denoises the latents.\n" + + "This step supports text-to-image and image-to-image tasks for Flux:\n" + + " - for image-to-image generation, you need to provide `image_latents`\n" + + " - for text-to-image generation, all you need to provide is prompt embeddings." + ) + + +# Auto blocks (text2image and img2img) +AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", FluxTextEncoderStep()), + ("vae_encoder", FluxAutoVaeEncoderStep()), + ("denoise", FluxCoreDenoiseStep()), + ("decode", FluxDecodeStep()), + ] +) + +# auto_docstring +class FluxAutoBlocks(SequentialPipelineBlocks): + model_name = "flux" + + block_classes = AUTO_BLOCKS.values() + block_names = AUTO_BLOCKS.keys() + + _workflow_map = { + "text2image": {"prompt": True}, + "img2img": {"image": True, "prompt": True}, + } + + @property + def description(self): + return ( + "Auto Modular pipeline for text-to-image and image-to-image using Flux." 
+ ) + + +# TEXT2IMAGE_BLOCKS = InsertableDict( +# [ +# ("text_encoder", FluxTextEncoderStep()), +# ("input", FluxTextInputStep()), +# ("prepare_latents", FluxPrepareLatentsStep()), +# ("set_timesteps", FluxSetTimestepsStep()), +# ("prepare_rope_inputs", FluxRoPEInputsStep()), +# ("denoise", FluxDenoiseStep()), +# ("decode", FluxDecodeStep()), +# ] +# ) + +# IMAGE2IMAGE_BLOCKS = InsertableDict( +# [ +# ("text_encoder", FluxTextEncoderStep()), +# ("vae_encoder", FluxVaeEncoderStep()), +# ("input", FluxImg2ImgInputStep()), +# ("prepare_latents", FluxPrepareLatentsStep()), +# ("set_timesteps", FluxImg2ImgSetTimestepsStep()), +# ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), +# ("prepare_rope_inputs", FluxRoPEInputsStep()), +# ("denoise", FluxDenoiseStep()), +# ("decode", FluxDecodeStep()), +# ] +# ) diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py new file mode 100644 index 000000000000..3b12d8c53eea --- /dev/null +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py @@ -0,0 +1,235 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict +from .before_denoise import ( + FluxImg2ImgPrepareLatentsStep, + FluxImg2ImgSetTimestepsStep, + FluxKontextRoPEInputsStep, + FluxPrepareLatentsStep, + FluxRoPEInputsStep, + FluxSetTimestepsStep, +) +from .decoders import FluxDecodeStep +from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep +from .encoders import ( + FluxKontextProcessImagesInputStep, + FluxProcessImagesInputStep, + FluxTextEncoderStep, + FluxVaeEncoderStep, +) +from .inputs import ( + FluxInputsDynamicStep, + FluxKontextInputsDynamicStep, + FluxKontextSetResolutionStep, + FluxTextInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Flux Kontext vae encoder (run before before_denoise) + +FluxKontextVaeEncoderBlocks = InsertableDict( + [("preprocess", FluxKontextProcessImagesInputStep()), ("encode", FluxVaeEncoderStep(sample_mode="argmax"))] +) + + +class FluxKontextVaeEncoderStep(SequentialPipelineBlocks): + model_name = "flux-kontext" + + block_classes = FluxKontextVaeEncoderBlocks.values() + block_names = FluxKontextVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return "Vae encoder step that preprocess andencode the image inputs into their latent representations." 
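+
+# Note: the encoder above is built with `FluxVaeEncoderStep(sample_mode="argmax")`;
+# presumably this takes the mode of the VAE latent distribution rather than sampling
+# from it, keeping the Kontext conditioning latents deterministic across calls.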
+
+
+class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks):
+    block_classes = [FluxKontextVaeEncoderStep]
+    block_names = ["img2img"]
+    block_trigger_inputs = ["image"]
+
+    @property
+    def description(self):
+        return (
+            "VAE encoder step that encodes the image inputs into their latent representations.\n"
+            + "This is an auto pipeline block that works for img2img tasks.\n"
+            + " - `FluxKontextVaeEncoderStep` (img2img) is used when `image` is provided.\n"
+            + " - if `image` is not provided, the step will be skipped."
+        )
+
+
+# before_denoise: text2img
+FluxBeforeDenoiseBlocks = InsertableDict(
+    [
+        ("prepare_latents", FluxPrepareLatentsStep()),
+        ("set_timesteps", FluxSetTimestepsStep()),
+        ("prepare_rope_inputs", FluxRoPEInputsStep()),
+    ]
+)
+
+
+class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
+    block_classes = FluxBeforeDenoiseBlocks.values()
+    block_names = FluxBeforeDenoiseBlocks.keys()
+
+    @property
+    def description(self):
+        return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation."
+
+
+# before_denoise: FluxKontext
+
+FluxKontextBeforeDenoiseBlocks = InsertableDict(
+    [
+        ("prepare_latents", FluxPrepareLatentsStep()),
+        ("set_timesteps", FluxSetTimestepsStep()),
+        ("prepare_rope_inputs", FluxKontextRoPEInputsStep()),
+    ]
+)
+
+
+class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks):
+    block_classes = FluxKontextBeforeDenoiseBlocks.values()
+    block_names = FluxKontextBeforeDenoiseBlocks.keys()
+
+    @property
+    def description(self):
+        return (
+            "Before denoise step that prepares the inputs for the denoise step\n"
+            "for the img2img/text2img tasks for Flux Kontext."
+        )
+
+
+class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks):
+    block_classes = [FluxKontextBeforeDenoiseStep, FluxBeforeDenoiseStep]
+    block_names = ["img2img", "text2image"]
+    block_trigger_inputs = ["image_latents", None]
+
+    @property
+    def description(self):
+        return (
+            "Before denoise step that prepares the inputs for the denoise step.\n"
+            + "This is an auto pipeline block that works for text2image and img2img tasks.\n"
+            + " - `FluxKontextBeforeDenoiseStep` (img2img) is used when `image_latents` is provided.\n"
+            + " - `FluxBeforeDenoiseStep` (text2image) is used when `image_latents` is not provided.\n"
+        )
+
+# inputs: Flux Kontext
+
+FluxKontextBlocks = InsertableDict(
+    [
+        ("set_resolution", FluxKontextSetResolutionStep()),
+        ("text_inputs", FluxTextInputStep()),
+        ("additional_inputs", FluxKontextInputsDynamicStep()),
+    ]
+)
+
+
+class FluxKontextInputStep(SequentialPipelineBlocks):
+    model_name = "flux-kontext"
+    block_classes = FluxKontextBlocks.values()
+    block_names = FluxKontextBlocks.keys()
+
+    @property
+    def description(self):
+        return (
+            "Input step that prepares the inputs for both the text2img and img2img denoising steps. It:\n"
+            " - makes sure the text embeddings and the additional inputs (`image_latents`) have a consistent batch size.\n"
+            " - updates height/width based on `image_latents`, and patchifies `image_latents`."
+        )
+
+
+class FluxKontextAutoInputStep(AutoPipelineBlocks):
+    model_name = "flux-kontext"
+    block_classes = [FluxKontextInputStep, FluxTextInputStep]
+    block_names = ["img2img", "text2img"]
+    block_trigger_inputs = ["image_latents", None]
+
+    @property
+    def description(self):
+        return (
+            "Input step that standardizes the inputs for the denoising step, e.g. makes sure inputs have a consistent batch size and are patchified.\n"
+            "This is an auto pipeline block that works for text2image/img2img tasks.\n"
+            + " - `FluxKontextInputStep` (img2img) is used when `image_latents` is provided.\n"
+            + " - `FluxKontextInputStep` is also capable of handling the text2image task when `image_latents` is not present."
+        )
+
+
+# auto_docstring
+class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks):
+    model_name = "flux-kontext"
+    block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextDenoiseStep]
+    block_names = ["input", "before_denoise", "denoise"]
+
+    @property
+    def description(self):
+        return (
+            "Core step that performs the denoising process. \n"
+            + " - `FluxKontextAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+            + " - `FluxKontextAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+            + " - `FluxKontextDenoiseStep` (denoise) iteratively denoises the latents.\n"
+            + "This step supports text-to-image and image-to-image tasks for Flux-Kontext:\n"
+            + " - for image-to-image generation, you need to provide `image_latents`\n"
+            + " - for text-to-image generation, all you need to provide is prompt embeddings."
+        )
+
+
+AUTO_BLOCKS_KONTEXT = InsertableDict(
+    [
+        ("text_encoder", FluxTextEncoderStep()),
+        ("vae_encoder", FluxKontextAutoVaeEncoderStep()),
+        ("denoise", FluxKontextCoreDenoiseStep()),
+        ("decode", FluxDecodeStep()),
+    ]
+)
+
+
+
+class FluxKontextAutoBlocks(SequentialPipelineBlocks):
+    model_name = "flux-kontext"
+
+    block_classes = AUTO_BLOCKS_KONTEXT.values()
+    block_names = AUTO_BLOCKS_KONTEXT.keys()
+    _workflow_map = {
+        "img2img": {"image": True, "prompt": True},
+        "text2image": {"prompt": True},
+    }
+
+    @property
+    def description(self):
+        return (
+            "Modular pipeline for image-to-image using Flux Kontext."
+ ) + + + +# FLUX_KONTEXT_BLOCKS = InsertableDict( +# [ +# ("text_encoder", FluxTextEncoderStep()), +# ("vae_encoder", FluxVaeEncoderStep(sample_mode="argmax")), +# ("input", FluxKontextInputStep()), +# ("prepare_latents", FluxPrepareLatentsStep()), +# ("set_timesteps", FluxSetTimestepsStep()), +# ("prepare_rope_inputs", FluxKontextRoPEInputsStep()), +# ("denoise", FluxKontextDenoiseStep()), +# ("decode", FluxDecodeStep()), +# ] +# ) + From 1017e8a7c7f9fb500e7d9387614ed65c1989cbfe Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Fri, 13 Feb 2026 23:08:11 +0000 Subject: [PATCH 33/58] refactor flux2: seperate blocks for klein_base + workflow --- .../modular_pipelines/flux2/__init__.py | 85 ++-------- .../flux2/modular_blocks_flux2.py | 72 ++------- .../flux2/modular_blocks_flux2_klein.py | 101 ++---------- .../flux2/modular_blocks_flux2_klein_base.py | 149 ++++++++++++++++++ 4 files changed, 178 insertions(+), 229 deletions(-) create mode 100644 src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py diff --git a/src/diffusers/modular_pipelines/flux2/__init__.py b/src/diffusers/modular_pipelines/flux2/__init__.py index 74907a9af806..bf193df113ec 100644 --- a/src/diffusers/modular_pipelines/flux2/__init__.py +++ b/src/diffusers/modular_pipelines/flux2/__init__.py @@ -21,45 +21,11 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = [ - "Flux2TextEncoderStep", - "Flux2RemoteTextEncoderStep", - "Flux2VaeEncoderStep", - ] - _import_structure["before_denoise"] = [ - "Flux2SetTimestepsStep", - "Flux2PrepareLatentsStep", - "Flux2RoPEInputsStep", - "Flux2PrepareImageLatentsStep", - ] - _import_structure["denoise"] = [ - "Flux2LoopDenoiser", - "Flux2LoopAfterDenoiser", - "Flux2DenoiseLoopWrapper", - "Flux2DenoiseStep", - ] - _import_structure["decoders"] = ["Flux2DecodeStep"] - _import_structure["inputs"] = [ - "Flux2ProcessImagesInputStep", - "Flux2TextInputStep", - ] - _import_structure["modular_blocks_flux2"] = [ - "ALL_BLOCKS", - "AUTO_BLOCKS", - "REMOTE_AUTO_BLOCKS", - "TEXT2IMAGE_BLOCKS", - "IMAGE_CONDITIONED_BLOCKS", - "Flux2AutoBlocks", - "Flux2AutoVaeEncoderStep", - "Flux2CoreDenoiseStep", - "Flux2VaeEncoderSequentialStep", - ] - _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks", "Flux2KleinBaseAutoBlocks"] - _import_structure["modular_pipeline"] = [ - "Flux2ModularPipeline", - "Flux2KleinModularPipeline", - "Flux2KleinBaseModularPipeline", - ] + _import_structure["encoders"] = ["Flux2RemoteTextEncoderStep"] + _import_structure["modular_blocks_flux2"] = ["Flux2AutoBlocks"] + _import_structure["modular_blocks_flux2_klein_base"] = ["Flux2KleinBaseAutoBlocks"] + _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks"] + _import_structure["modular_pipeline"] = ["Flux2ModularPipeline", "Flux2KleinModularPipeline", "Flux2KleinBaseModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -68,43 +34,10 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .before_denoise import ( - Flux2PrepareImageLatentsStep, - Flux2PrepareLatentsStep, - Flux2RoPEInputsStep, - Flux2SetTimestepsStep, - ) - from .decoders import Flux2DecodeStep - from .denoise import ( - Flux2DenoiseLoopWrapper, - Flux2DenoiseStep, - Flux2LoopAfterDenoiser, - Flux2LoopDenoiser, - ) - from .encoders import ( - Flux2RemoteTextEncoderStep, - Flux2TextEncoderStep, - Flux2VaeEncoderStep, - ) - from .inputs import 
( - Flux2ProcessImagesInputStep, - Flux2TextInputStep, - ) - from .modular_blocks_flux2 import ( - ALL_BLOCKS, - AUTO_BLOCKS, - IMAGE_CONDITIONED_BLOCKS, - REMOTE_AUTO_BLOCKS, - TEXT2IMAGE_BLOCKS, - Flux2AutoBlocks, - Flux2AutoVaeEncoderStep, - Flux2CoreDenoiseStep, - Flux2VaeEncoderSequentialStep, - ) - from .modular_blocks_flux2_klein import ( - Flux2KleinAutoBlocks, - Flux2KleinBaseAutoBlocks, - ) + from .encoders import Flux2RemoteTextEncoderStep + from .modular_blocks_flux2 import Flux2AutoBlocks + from .modular_blocks_flux2_klein_base import Flux2KleinBaseAutoBlocks + from .modular_blocks_flux2_klein import Flux2KleinAutoBlocks from .modular_pipeline import Flux2KleinBaseModularPipeline, Flux2KleinModularPipeline, Flux2ModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py index 41a0ff7dee28..af3252866914 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py @@ -51,6 +51,7 @@ ) +# auto_docstring class Flux2VaeEncoderSequentialStep(SequentialPipelineBlocks): model_name = "flux2" @@ -62,6 +63,7 @@ def description(self) -> str: return "VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning." +# auto_docstring class Flux2AutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [Flux2VaeEncoderSequentialStep] block_names = ["img_conditioning"] @@ -91,6 +93,7 @@ def description(self): ) +# auto_docstring class Flux2CoreDenoiseStep(SequentialPipelineBlocks): model_name = "flux2" @@ -100,15 +103,7 @@ class Flux2CoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return ( - "Core denoise step that performs the denoising process for Flux2-dev.\n" - " - `Flux2TextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n" - " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" - " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" - " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" - " - `Flux2PrepareGuidanceStep` (prepare_guidance) prepares the guidance tensor for the denoising step.\n" - " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n" - " - `Flux2DenoiseStep` (denoise) iteratively denoises the latents.\n" - " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" + "Core denoise step that performs the denoising process for Flux2-dev." 
) @property @@ -131,29 +126,21 @@ def outputs(self): ] ) - -REMOTE_AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", Flux2RemoteTextEncoderStep()), - ("vae_encoder", Flux2AutoVaeEncoderStep()), - ("denoise", Flux2CoreDenoiseStep()), - ("decode", Flux2DecodeStep()), - ] -) - - +# auto_docstring class Flux2AutoBlocks(SequentialPipelineBlocks): model_name = "flux2" block_classes = AUTO_BLOCKS.values() block_names = AUTO_BLOCKS.keys() + _workflow_map = { + "text2image": {"prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, + } @property def description(self): return ( - "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2.\n" - "- For text-to-image generation, all you need to provide is `prompt`.\n" - "- For image-conditioned generation, you need to provide `image` (list of PIL images)." + "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2." ) @property @@ -165,42 +152,3 @@ def outputs(self): description="The images from the decoding step.", ) ] - - -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", Flux2TextEncoderStep()), - ("text_input", Flux2TextInputStep()), - ("prepare_latents", Flux2PrepareLatentsStep()), - ("set_timesteps", Flux2SetTimestepsStep()), - ("prepare_guidance", Flux2PrepareGuidanceStep()), - ("prepare_rope_inputs", Flux2RoPEInputsStep()), - ("denoise", Flux2DenoiseStep()), - ("after_denoise", Flux2UnpackLatentsStep()), - ("decode", Flux2DecodeStep()), - ] -) - -IMAGE_CONDITIONED_BLOCKS = InsertableDict( - [ - ("text_encoder", Flux2TextEncoderStep()), - ("text_input", Flux2TextInputStep()), - ("preprocess_images", Flux2ProcessImagesInputStep()), - ("vae_encoder", Flux2VaeEncoderStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), - ("prepare_latents", Flux2PrepareLatentsStep()), - ("set_timesteps", Flux2SetTimestepsStep()), - ("prepare_guidance", Flux2PrepareGuidanceStep()), - ("prepare_rope_inputs", Flux2RoPEInputsStep()), - ("denoise", Flux2DenoiseStep()), - ("after_denoise", Flux2UnpackLatentsStep()), - ("decode", Flux2DecodeStep()), - ] -) - -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "image_conditioned": IMAGE_CONDITIONED_BLOCKS, - "auto": AUTO_BLOCKS, - "remote": REMOTE_AUTO_BLOCKS, -} diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py index 984832d77be5..9c8b891335e0 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py @@ -47,19 +47,12 @@ # VAE encoder ################ -Flux2KleinVaeEncoderBlocks = InsertableDict( - [ - ("preprocess", Flux2ProcessImagesInputStep()), - ("encode", Flux2VaeEncoderStep()), - ] -) - class Flux2KleinVaeEncoderSequentialStep(SequentialPipelineBlocks): model_name = "flux2" - block_classes = Flux2KleinVaeEncoderBlocks.values() - block_names = Flux2KleinVaeEncoderBlocks.keys() + block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] + block_names = ["preprocess", "encode"] @property def description(self) -> str: @@ -107,14 +100,7 @@ class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return ( - "Core denoise step that performs the denoising process for Flux2-Klein (distilled model).\n" - " - `Flux2KleinTextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n" - " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the 
image latents and image_latent_ids for the denoising step.\n" - " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" - " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" - " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n" - " - `Flux2KleinDenoiseStep` (denoise) iteratively denoises the latents.\n" - " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" + "Core denoise step that performs the denoising process for Flux2-Klein (distilled model)." ) @property @@ -128,52 +114,12 @@ def outputs(self): ] -Flux2KleinBaseCoreDenoiseBlocks = InsertableDict( - [ - ("input", Flux2KleinBaseTextInputStep()), - ("prepare_latents", Flux2PrepareLatentsStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), - ("set_timesteps", Flux2SetTimestepsStep()), - ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), - ("denoise", Flux2KleinBaseDenoiseStep()), - ("after_denoise", Flux2UnpackLatentsStep()), - ] -) - - -class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "flux2-klein" - block_classes = Flux2KleinBaseCoreDenoiseBlocks.values() - block_names = Flux2KleinBaseCoreDenoiseBlocks.keys() - - @property - def description(self): - return "Core denoise step that performs the denoising process for Flux2-Klein (base model)." - return ( - "Core denoise step that performs the denoising process for Flux2-Klein (base model).\n" - " - `Flux2KleinBaseTextInputStep` (input) standardizes the text inputs (prompt_embeds + negative_prompt_embeds) for the denoising step.\n" - " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" - " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" - " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" - " - `Flux2KleinBaseRoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids + negative_txt_ids) for the denoising step.\n" - " - `Flux2KleinBaseDenoiseStep` (denoise) iteratively denoises the latents using Classifier-Free Guidance.\n" - " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" - ) - - @property - def outputs(self): - return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) - ] - ### ### Auto blocks ### + +# auto_docstring class Flux2KleinAutoBlocks(SequentialPipelineBlocks): model_name = "flux2-klein" block_classes = [ @@ -183,42 +129,15 @@ class Flux2KleinAutoBlocks(SequentialPipelineBlocks): Flux2DecodeStep(), ] block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + _workflow_map = { + "text2image": {"prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, + } @property def description(self): return ( - "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein.\n" - + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n" - + " - for text-to-image generation, all you need to provide is `prompt`.\n" - ) - - @property - def outputs(self): - return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) - ] - - -class 
Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): - model_name = "flux2-klein" - block_classes = [ - Flux2KleinBaseTextEncoderStep(), - Flux2KleinAutoVaeEncoderStep(), - Flux2KleinBaseCoreDenoiseStep(), - Flux2DecodeStep(), - ] - block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] - - @property - def description(self): - return ( - "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model).\n" - + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n" - + " - for text-to-image generation, all you need to provide is `prompt`.\n" + "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein." ) @property diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py new file mode 100644 index 000000000000..4ffbdbac8e9f --- /dev/null +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py @@ -0,0 +1,149 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import PIL.Image +import torch + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict, OutputParam +from .before_denoise import ( + Flux2KleinBaseRoPEInputsStep, + Flux2PrepareImageLatentsStep, + Flux2PrepareLatentsStep, + Flux2RoPEInputsStep, + Flux2SetTimestepsStep, +) +from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep +from .denoise import Flux2KleinBaseDenoiseStep, Flux2KleinDenoiseStep +from .encoders import ( + Flux2KleinBaseTextEncoderStep, + Flux2KleinTextEncoderStep, + Flux2VaeEncoderStep, +) +from .inputs import ( + Flux2KleinBaseTextInputStep, + Flux2ProcessImagesInputStep, + Flux2TextInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +################ +# VAE encoder +################ + + +class Flux2KleinBaseVaeEncoderSequentialStep(SequentialPipelineBlocks): + model_name = "flux2" + + block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] + block_names = ["preprocess", "encode"] + + @property + def description(self) -> str: + return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations." + + +class Flux2KleinBaseAutoVaeEncoderStep(AutoPipelineBlocks): + block_classes = [Flux2KleinBaseVaeEncoderSequentialStep] + block_names = ["img_conditioning"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "VAE encoder step that encodes the image inputs into their latent representations.\n" + "This is an auto pipeline block that works for image conditioning tasks.\n" + " - `Flux2KleinBaseVaeEncoderSequentialStep` is used when `image` is provided.\n" + " - If `image` is not provided, step will be skipped." 
+ ) + + +### +### Core denoise +### +Flux2KleinBaseCoreDenoiseBlocks = InsertableDict( + [ + ("input", Flux2KleinBaseTextInputStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), + ("denoise", Flux2KleinBaseDenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), + ] +) + + +class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "flux2-klein" + block_classes = Flux2KleinBaseCoreDenoiseBlocks.values() + block_names = Flux2KleinBaseCoreDenoiseBlocks.keys() + + @property + def description(self): + return ( + "Core denoise step that performs the denoising process for Flux2-Klein (base model)." + ) + + @property + def outputs(self): + return [ + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The latents from the denoising step.", + ) + ] + + +### +### Auto blocks +### + + +# auto_docstring +class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): + model_name = "flux2-klein" + block_classes = [ + Flux2KleinBaseTextEncoderStep(), + Flux2KleinBaseAutoVaeEncoderStep(), + Flux2KleinBaseCoreDenoiseStep(), + Flux2DecodeStep(), + ] + block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + _workflow_map = { + "text2image": {"prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, + } + + @property + def description(self): + return ( + "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model)." + ) + + @property + def outputs(self): + return [ + OutputParam( + name="images", + type_hint=List[PIL.Image.Image], + description="The images from the decoding step.", + ) + ] From 53fbb40a37d85f9fc316654184a772833f80675d Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Fri, 13 Feb 2026 23:09:34 +0000 Subject: [PATCH 34/58] qwen: remove import support for stuff other than the default blocks --- .../modular_pipelines/qwenimage/__init__.py | 40 ++++--------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py index 2b01a5b5a4b5..1c7098a09e08 100644 --- a/src/diffusers/modular_pipelines/qwenimage/__init__.py +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -21,22 +21,10 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["modular_blocks_qwenimage"] = [ - "AUTO_BLOCKS", - "QwenImageAutoBlocks", - ] - _import_structure["modular_blocks_qwenimage_edit"] = [ - "EDIT_AUTO_BLOCKS", - "QwenImageEditAutoBlocks", - ] - _import_structure["modular_blocks_qwenimage_edit_plus"] = [ - "EDIT_PLUS_AUTO_BLOCKS", - "QwenImageEditPlusAutoBlocks", - ] - _import_structure["modular_blocks_qwenimage_layered"] = [ - "LAYERED_AUTO_BLOCKS", - "QwenImageLayeredAutoBlocks", - ] + _import_structure["modular_blocks_qwenimage"] = ["QwenImageAutoBlocks"] + _import_structure["modular_blocks_qwenimage_edit"] = ["QwenImageEditAutoBlocks"] + _import_structure["modular_blocks_qwenimage_edit_plus"] = ["QwenImageEditPlusAutoBlocks"] + _import_structure["modular_blocks_qwenimage_layered"] = ["QwenImageLayeredAutoBlocks"] _import_structure["modular_pipeline"] = [ "QwenImageEditModularPipeline", "QwenImageEditPlusModularPipeline", @@ -51,22 +39,10 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # 
noqa F403 else: - from .modular_blocks_qwenimage import ( - AUTO_BLOCKS, - QwenImageAutoBlocks, - ) - from .modular_blocks_qwenimage_edit import ( - EDIT_AUTO_BLOCKS, - QwenImageEditAutoBlocks, - ) - from .modular_blocks_qwenimage_edit_plus import ( - EDIT_PLUS_AUTO_BLOCKS, - QwenImageEditPlusAutoBlocks, - ) - from .modular_blocks_qwenimage_layered import ( - LAYERED_AUTO_BLOCKS, - QwenImageLayeredAutoBlocks, - ) + from .modular_blocks_qwenimage import QwenImageAutoBlocks + from .modular_blocks_qwenimage_edit import QwenImageEditAutoBlocks + from .modular_blocks_qwenimage_edit_plus import QwenImageEditPlusAutoBlocks + from .modular_blocks_qwenimage_layered import QwenImageLayeredAutoBlocks from .modular_pipeline import ( QwenImageEditModularPipeline, QwenImageEditPlusModularPipeline, From 26a34c3deb3033b755ea3c69d62c84e9b35f1a13 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Fri, 13 Feb 2026 23:11:51 +0000 Subject: [PATCH 35/58] add workflow support for wan --- .../modular_pipelines/wan/modular_blocks_wan.py | 2 ++ .../modular_pipelines/wan/modular_blocks_wan22.py | 3 +++ .../modular_pipelines/wan/modular_blocks_wan22_i2v.py | 3 +++ .../modular_pipelines/wan/modular_blocks_wan_i2v.py | 10 +++++++--- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py index d01a86ca09b5..cd71f4f69999 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py @@ -37,6 +37,7 @@ # inputs(text) -> set_timesteps -> prepare_latents -> denoise +# auto_docstring class WanCoreDenoiseStep(SequentialPipelineBlocks): model_name = "wan" block_classes = [ @@ -64,6 +65,7 @@ def description(self): # ==================== +# auto_docstring class WanBlocks(SequentialPipelineBlocks): model_name = "wan" block_classes = [ diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py index 21164422f3d9..45985249c6d4 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py @@ -38,6 +38,7 @@ # inputs(text) -> set_timesteps -> prepare_latents -> denoise +# auto_docstring class Wan22CoreDenoiseStep(SequentialPipelineBlocks): model_name = "wan" block_classes = [ @@ -65,6 +66,8 @@ def description(self): # ==================== + +# auto_docstring class Wan22Blocks(SequentialPipelineBlocks): model_name = "wan" block_classes = [ diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py index 3db1c8fa837b..887e79bb6ad1 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py @@ -40,6 +40,7 @@ # ==================== +# auto_docstring class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks): model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep] @@ -56,6 +57,7 @@ def description(self): # inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents) +# auto_docstring class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): model_name = "wan-i2v" block_classes = [ @@ -91,6 +93,7 @@ def description(self): # ==================== +# auto_docstring class Wan22Image2VideoBlocks(SequentialPipelineBlocks): 
model_name = "wan-i2v" block_classes = [ diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py index d07ab8ecf473..a3bb30557600 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py @@ -177,6 +177,7 @@ def description(self): # wan2.1 Image2Video Auto Blocks +# auto_docstring class WanImage2VideoAutoBlocks(SequentialPipelineBlocks): model_name = "wan-i2v" block_classes = [ @@ -194,10 +195,13 @@ class WanImage2VideoAutoBlocks(SequentialPipelineBlocks): "decode", ] + _workflow_map = { + "image2video": {"image": True, "prompt": True}, + "flf2v": {"last_image": True, "image": True, "prompt": True}, + } + @property def description(self): return ( - "Auto Modular pipeline for image-to-video using Wan.\n" - + "- for I2V workflow, all you need to provide is `image`" - + "- for FLF2V workflow, all you need to provide is `last_image` and `image`" + "Auto Modular pipeline for image-to-video using Wan." ) From 0d44493d1ba1393345f4e197ee30de79f69291d4 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Fri, 13 Feb 2026 23:12:31 +0000 Subject: [PATCH 36/58] sdxl: remove some imports: --- .../stable_diffusion_xl/__init__.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py index 644cc408ba37..c154bc8bd4fe 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py @@ -21,10 +21,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = ["StableDiffusionXLTextEncoderStep"] - _import_structure["modular_blocks"] = [ - "StableDiffusionXLAutoBlocks", - ] + _import_structure["modular_blocks"] = ["StableDiffusionXLAutoBlocks"] _import_structure["modular_pipeline"] = ["StableDiffusionXLModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -34,12 +31,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .encoders import ( - StableDiffusionXLTextEncoderStep, - ) - from .modular_blocks import ( - StableDiffusionXLAutoBlocks, - ) + from .modular_blocks import StableDiffusionXLAutoBlocks from .modular_pipeline import StableDiffusionXLModularPipeline else: import sys From 14466c88d86f6743f898c2d7f5d57c5ae9b0b201 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Fri, 13 Feb 2026 23:49:54 +0000 Subject: [PATCH 37/58] refactor z --- .../modular_pipelines/z_image/__init__.py | 14 +--- ...ar_blocks.py => modular_blocks_z_image.py} | 68 +++++-------------- 2 files changed, 18 insertions(+), 64 deletions(-) rename src/diffusers/modular_pipelines/z_image/{modular_blocks.py => modular_blocks_z_image.py} (76%) diff --git a/src/diffusers/modular_pipelines/z_image/__init__.py b/src/diffusers/modular_pipelines/z_image/__init__.py index c8a8c14396c0..5c04008d3305 100644 --- a/src/diffusers/modular_pipelines/z_image/__init__.py +++ b/src/diffusers/modular_pipelines/z_image/__init__.py @@ -21,12 +21,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["decoders"] = ["ZImageVaeDecoderStep"] - _import_structure["encoders"] = ["ZImageTextEncoderStep", "ZImageVaeImageEncoderStep"] - 
_import_structure["modular_blocks"] = [ - "ALL_BLOCKS", - "ZImageAutoBlocks", - ] + _import_structure["modular_blocks_z_image"] = ["ZImageAutoBlocks"] _import_structure["modular_pipeline"] = ["ZImageModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -36,12 +31,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .decoders import ZImageVaeDecoderStep - from .encoders import ZImageTextEncoderStep - from .modular_blocks import ( - ALL_BLOCKS, - ZImageAutoBlocks, - ) + from .modular_blocks_z_image import ZImageAutoBlocks from .modular_pipeline import ZImageModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks.py b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py similarity index 76% rename from src/diffusers/modular_pipelines/z_image/modular_blocks.py rename to src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py index a54baeccaf0c..b85c959c661c 100644 --- a/src/diffusers/modular_pipelines/z_image/modular_blocks.py +++ b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py @@ -36,8 +36,12 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# z-image -# text2image +# ==================== +# 1. DENOISE +# ==================== + +# text2image: inputs(text) -> set_timesteps -> prepare_latents -> denoise +# auto_docstring class ZImageCoreDenoiseStep(SequentialPipelineBlocks): block_classes = [ ZImageTextInputStep, @@ -59,8 +63,8 @@ def description(self): ) -# z-image: image2image -## denoise +# image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise +# auto_docstring class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks): block_classes = [ ZImageTextInputStep, @@ -96,7 +100,7 @@ def description(self): ) -## auto blocks +# auto_docstring class ZImageAutoDenoiseStep(AutoPipelineBlocks): block_classes = [ ZImageImage2ImageCoreDenoiseStep, @@ -117,6 +121,7 @@ def description(self) -> str: ) +# auto_docstring class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks): block_classes = [ZImageVaeImageEncoderStep] block_names = ["vae_encoder"] @@ -130,6 +135,7 @@ def description(self) -> str: +" - if `image` is not provided, step will be skipped." +# auto_docstring class ZImageAutoBlocks(SequentialPipelineBlocks): block_classes = [ ZImageTextEncoderStep, @@ -138,54 +144,12 @@ class ZImageAutoBlocks(SequentialPipelineBlocks): ZImageVaeDecoderStep, ] block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + _workflow_map = { + "text2image": {"prompt": True}, + "image2image": {"image": True, "prompt": True}, + } @property def description(self) -> str: - return "Auto Modular pipeline for text-to-image and image-to-image using ZImage.\n" - +" - for text-to-image generation, all you need to provide is `prompt`\n" - +" - for image-to-image generation, you need to provide `image`\n" - +" - if `image` is not provided, step will be skipped." 
- - -# presets -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", ZImageTextEncoderStep), - ("input", ZImageTextInputStep), - ("prepare_latents", ZImagePrepareLatentsStep), - ("set_timesteps", ZImageSetTimestepsStep), - ("denoise", ZImageDenoiseStep), - ("decode", ZImageVaeDecoderStep), - ] -) - -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", ZImageTextEncoderStep), - ("vae_encoder", ZImageVaeImageEncoderStep), - ("input", ZImageTextInputStep), - ("additional_inputs", ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"])), - ("prepare_latents", ZImagePrepareLatentsStep), - ("set_timesteps", ZImageSetTimestepsStep), - ("set_timesteps_with_strength", ZImageSetTimestepsWithStrengthStep), - ("prepare_latents_with_image", ZImagePrepareLatentswithImageStep), - ("denoise", ZImageDenoiseStep), - ("decode", ZImageVaeDecoderStep), - ] -) - - -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", ZImageTextEncoderStep), - ("vae_encoder", ZImageAutoVaeImageEncoderStep), - ("denoise", ZImageAutoDenoiseStep), - ("decode", ZImageVaeDecoderStep), - ] -) + return "Auto Modular pipeline for text-to-image and image-to-image using ZImage." -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "image2image": IMAGE2IMAGE_BLOCKS, - "auto": AUTO_BLOCKS, -} From 63deec89c33ef129ededec934d621d852f3cca4d Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 01:14:06 +0000 Subject: [PATCH 38/58] update flux2 auto core denoise --- .../flux2/modular_blocks_flux2.py | 55 ++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py index af3252866914..1f2f38571b9b 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py @@ -82,7 +82,6 @@ def description(self): Flux2CoreDenoiseBlocks = InsertableDict( [ ("input", Flux2TextInputStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), ("prepare_guidance", Flux2PrepareGuidanceStep()), @@ -117,11 +116,63 @@ def outputs(self): ] +Flux2ImageConditionedCoreDenoiseBlocks = InsertableDict( + [ + ("input", Flux2TextInputStep()), + ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_guidance", Flux2PrepareGuidanceStep()), + ("prepare_rope_inputs", Flux2RoPEInputsStep()), + ("denoise", Flux2DenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), + ] +) + + +# auto_docstring +class Flux2ImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "flux2" + + block_classes = Flux2ImageConditionedCoreDenoiseBlocks.values() + block_names = Flux2ImageConditionedCoreDenoiseBlocks.keys() + + @property + def description(self): + return ( + "Core denoise step that performs the denoising process for Flux2-dev with image conditioning." 
+ ) + + @property + def outputs(self): + return [ + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The latents from the denoising step.", + ) + ] + +class Flux2AutoCoreDenoiseStep(AutoPipelineBlocks): + model_name = "flux2" + block_classes = [Flux2ImageConditionedCoreDenoiseStep, Flux2CoreDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Auto core denoise step that performs the denoising process for Flux2-dev." + "This is an auto pipeline block that works for text-to-image and image-conditioned generation." + " - `Flux2CoreDenoiseStep` is used for text-to-image generation.\n" + " - `Flux2ImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" + ) + AUTO_BLOCKS = InsertableDict( [ ("text_encoder", Flux2TextEncoderStep()), ("vae_encoder", Flux2AutoVaeEncoderStep()), - ("denoise", Flux2CoreDenoiseStep()), + ("denoise", Flux2AutoCoreDenoiseStep()), ("decode", Flux2DecodeStep()), ] ) From 5c7adebfde4b78db93bc7ec4c1dd7b6014e4750e Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 01:14:20 +0000 Subject: [PATCH 39/58] add workflow test for z and flux2 --- .../flux2/test_modular_pipeline_flux2.py | 31 +++++++++++++++++++ .../z_image/test_modular_pipeline_z_image.py | 24 ++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py index 8fd529e97e71..290e7244deb3 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py @@ -27,6 +27,19 @@ from ...testing_utils import floats_tensor, torch_device from ..test_modular_pipelines_common import ModularPipelineTesterMixin +FLUX2_TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "Flux2TextEncoderStep"), + ("text_input", "Flux2TextInputStep"), + ("prepare_latents", "Flux2PrepareLatentsStep"), + ("set_timesteps", "Flux2SetTimestepsStep"), + ("prepare_guidance", "Flux2PrepareGuidanceStep"), + ("prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise", "Flux2DenoiseStep"), + ("after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2ModularPipeline @@ -35,6 +48,7 @@ class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX2_TEXT2IMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -55,6 +69,22 @@ def get_dummy_inputs(self, seed=0): def test_float16_inference(self): super().test_float16_inference(9e-2) +FLUX2_IMAGE_CONDITIONED_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder", "Flux2TextEncoderStep"), + ("preprocess_images", "Flux2ProcessImagesInputStep"), + ("vae_encoder", "Flux2VaeEncoderStep"), + ("text_input", "Flux2TextInputStep"), + ("prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("prepare_latents", "Flux2PrepareLatentsStep"), + ("set_timesteps", "Flux2SetTimestepsStep"), + ("prepare_guidance", "Flux2PrepareGuidanceStep"), + ("prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise", "Flux2DenoiseStep"), + ("after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} class 
TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2ModularPipeline @@ -63,6 +93,7 @@ class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX2_IMAGE_CONDITIONED_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py index 29da18fce61b..a16e019768fb 100644 --- a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py +++ b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py @@ -19,6 +19,29 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin +ZIMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "ZImageTextEncoderStep"), + ("input", "ZImageTextInputStep"), + ("prepare_latents", "ZImagePrepareLatentsStep"), + ("set_timesteps", "ZImageSetTimestepsStep"), + ("denoise", "ZImageDenoiseStep"), + ("decode", "ZImageVaeDecoderStep"), + ], + "image2image": [ + ("text_encoder", "ZImageTextEncoderStep"), + ("vae_encoder", "ZImageVaeImageEncoderStep"), + ("input", "ZImageTextInputStep"), + ("additional_inputs", "ZImageAdditionalInputsStep"), + ("prepare_latents", "ZImagePrepareLatentsStep"), + ("set_timesteps", "ZImageSetTimestepsStep"), + ("set_timesteps_with_strength", "ZImageSetTimestepsWithStrengthStep"), + ("prepare_latents_with_image", "ZImagePrepareLatentswithImageStep"), + ("denoise", "ZImageDenoiseStep"), + ("decode", "ZImageVaeDecoderStep"), + ], +} + class TestZImageModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = ZImageModularPipeline pipeline_blocks_class = ZImageAutoBlocks @@ -26,6 +49,7 @@ class TestZImageModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = ZIMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) From fb83b635fca2676df0b88ac1e40b9ca32b0053df Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 13 Feb 2026 16:18:01 -1000 Subject: [PATCH 40/58] Apply suggestions from code review --- src/diffusers/modular_pipelines/modular_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 432eb9e01b78..5e1302da2e76 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -724,7 +724,7 @@ def get_execution_blocks(self, **kwargs) -> Optional["ModularPipelineBlocks"]: Get the block(s) that would execute given the inputs. 
Recursively resolves nested ConditionalPipelineBlocks until reaching either: - - A leaf block (no sub_blocks) → returns single `ModularPipelineBlocks` + - A leaf block (no sub_blocks or LoopSequentialPipelineBlocks) → returns single `ModularPipelineBlocks` - A `SequentialPipelineBlocks` → delegates to its `get_execution_blocks()` which returns a `SequentialPipelineBlocks` containing the resolved execution blocks @@ -746,7 +746,7 @@ def get_execution_blocks(self, **kwargs) -> Optional["ModularPipelineBlocks"]: block = self.sub_blocks[block_name] - # Recursively resolve until we hit a leaf block or a SequentialPipelineBlocks + # Recursively resolve until we hit a leaf block if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks): return block.get_execution_blocks(**kwargs) From c396a66e34084126d319191c0902c3b7f27ad570 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 13 Feb 2026 16:21:33 -1000 Subject: [PATCH 41/58] Apply suggestions from code review --- src/diffusers/modular_pipelines/modular_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 5e1302da2e76..4fee1766555d 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -1075,7 +1075,7 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: raise return pipeline, state - # used for `trigger_inputs` property + # used for `__repr__` def _get_trigger_inputs(self): """ Returns a set of all unique trigger input values found in the blocks. From 65a33e9b7075c95deaa901ddcb3c7e6ced8c13f2 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 02:48:15 +0000 Subject: [PATCH 42/58] add test for flux --- .../flux/modular_blocks_flux.py | 29 +--------- .../flux/modular_blocks_flux_kontext.py | 15 ----- .../modular_pipelines/modular_pipeline.py | 2 +- .../flux/test_modular_pipeline_flux.py | 56 +++++++++++++++++++ 4 files changed, 58 insertions(+), 44 deletions(-) diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py index 7099f89f0b0e..39ad3d911520 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py @@ -207,7 +207,7 @@ class FluxAutoBlocks(SequentialPipelineBlocks): _workflow_map = { "text2image": {"prompt": True}, - "img2img": {"image": True, "prompt": True}, + "image2image": {"image": True, "prompt": True}, } @property @@ -215,30 +215,3 @@ def description(self): return ( "Auto Modular pipeline for text-to-image and image-to-image using Flux." 
) - - -# TEXT2IMAGE_BLOCKS = InsertableDict( -# [ -# ("text_encoder", FluxTextEncoderStep()), -# ("input", FluxTextInputStep()), -# ("prepare_latents", FluxPrepareLatentsStep()), -# ("set_timesteps", FluxSetTimestepsStep()), -# ("prepare_rope_inputs", FluxRoPEInputsStep()), -# ("denoise", FluxDenoiseStep()), -# ("decode", FluxDecodeStep()), -# ] -# ) - -# IMAGE2IMAGE_BLOCKS = InsertableDict( -# [ -# ("text_encoder", FluxTextEncoderStep()), -# ("vae_encoder", FluxVaeEncoderStep()), -# ("input", FluxImg2ImgInputStep()), -# ("prepare_latents", FluxPrepareLatentsStep()), -# ("set_timesteps", FluxImg2ImgSetTimestepsStep()), -# ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), -# ("prepare_rope_inputs", FluxRoPEInputsStep()), -# ("denoise", FluxDenoiseStep()), -# ("decode", FluxDecodeStep()), -# ] -# ) diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py index 3b12d8c53eea..25508206d875 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py @@ -218,18 +218,3 @@ def description(self): "Modular pipeline for image-to-image using Flux Kontext." ) - - -# FLUX_KONTEXT_BLOCKS = InsertableDict( -# [ -# ("text_encoder", FluxTextEncoderStep()), -# ("vae_encoder", FluxVaeEncoderStep(sample_mode="argmax")), -# ("input", FluxKontextInputStep()), -# ("prepare_latents", FluxPrepareLatentsStep()), -# ("set_timesteps", FluxSetTimestepsStep()), -# ("prepare_rope_inputs", FluxKontextRoPEInputsStep()), -# ("denoise", FluxKontextDenoiseStep()), -# ("decode", FluxDecodeStep()), -# ] -# ) - diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 4fee1766555d..eff5685f6b05 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -19,7 +19,7 @@ from collections import OrderedDict from copy import deepcopy from dataclasses import dataclass, field -from typing import Any +from typing import Any, Optional import torch from huggingface_hub import create_repo diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py index 854b5218c617..bd3b4152c231 100644 --- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py +++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py @@ -33,6 +33,20 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin + +FLUX_TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "FluxTextEncoderStep"), + ("input", "FluxTextInputStep"), + ("prepare_latents", "FluxPrepareLatentsStep"), + ("set_timesteps", "FluxSetTimestepsStep"), + ("prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise", "FluxDenoiseStep"), + ("decode", "FluxDecodeStep"), + ] +} + + class TestFluxModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxModularPipeline pipeline_blocks_class = FluxAutoBlocks @@ -40,6 +54,7 @@ class TestFluxModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX_TEXT2IMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -59,6 +74,20 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) +FLUX_IMAGE2IMAGE_WORKFLOWS = { + "image2image": [ + 
("text_encoder", "FluxTextEncoderStep"), + ("vae_encoder", "FluxVaeEncoderStep"), + ("input", "FluxImg2ImgInputStep"), + ("prepare_latents", "FluxPrepareLatentsStep"), + ("set_timesteps", "FluxImg2ImgSetTimestepsStep"), + ("prepare_img2img_latents", "FluxImg2ImgPrepareLatentsStep"), + ("prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise", "FluxDenoiseStep"), + ("decode", "FluxDecodeStep"), + ] +} + class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxModularPipeline pipeline_blocks_class = FluxAutoBlocks @@ -66,6 +95,7 @@ class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX_IMAGE2IMAGE_WORKFLOWS def get_pipeline(self, components_manager=None, torch_dtype=torch.float32): pipeline = super().get_pipeline(components_manager, torch_dtype) @@ -124,6 +154,31 @@ def test_save_from_pretrained(self): def test_float16_inference(self): super().test_float16_inference(8e-2) +FLUX_KONTEXT_WORKFLOWS = { + "text2image": [ + [ + ("text_encoder", "FluxTextEncoderStep"), + ("input", "FluxKontextInputStep"), + ("prepare_latents", "FluxPrepareLatentsStep"), + ("set_timesteps", "FluxSetTimestepsStep"), + ("prepare_rope_inputs", "FluxKontextRoPEInputsStep"), + ("denoise", "FluxKontextDenoiseStep"), + ("decode", "FluxDecodeStep"), + ] + ], + "image2image": [ + [ + ("text_encoder", "FluxTextEncoderStep"), + ("vae_encoder", "FluxVaeEncoderStep"), + ("input", "FluxKontextInputStep"), + ("prepare_latents", "FluxPrepareLatentsStep"), + ("set_timesteps", "FluxSetTimestepsStep"), + ("prepare_rope_inputs", "FluxKontextRoPEInputsStep"), + ("denoise", "FluxKontextDenoiseStep"), + ("decode", "FluxDecodeStep"), + ] + ] +} class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxKontextModularPipeline @@ -132,6 +187,7 @@ class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX_KONTEXT_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) From 791e2a35665ecf61bee556d9aff26c792e56f925 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 05:39:35 +0000 Subject: [PATCH 43/58] add workflow test for flux --- .../modular_pipelines/flux/inputs.py | 7 +- .../flux/modular_blocks_flux.py | 49 +++------- .../flux/modular_blocks_flux_kontext.py | 89 ++++++------------- .../flux/test_modular_pipeline_flux.py | 39 ++++---- 4 files changed, 65 insertions(+), 119 deletions(-) diff --git a/src/diffusers/modular_pipelines/flux/inputs.py b/src/diffusers/modular_pipelines/flux/inputs.py index dbf42e0c6df4..9d2f69dbe26f 100644 --- a/src/diffusers/modular_pipelines/flux/inputs.py +++ b/src/diffusers/modular_pipelines/flux/inputs.py @@ -121,7 +121,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip # Adapted from `QwenImageAdditionalInputsStep` -class FluxInputsDynamicStep(ModularPipelineBlocks): +class FluxAdditionalInputsStep(ModularPipelineBlocks): model_name = "flux" def __init__( @@ -243,7 +243,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip return components, state -class FluxKontextInputsDynamicStep(FluxInputsDynamicStep): +class FluxKontextAdditionalInputsStep(FluxAdditionalInputsStep): model_name = 
"flux-kontext" def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: @@ -256,7 +256,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip continue # 1. Calculate height/width from latents - # Unlike the `FluxInputsDynamicStep`, we don't overwrite the `block.height` and `block.width` + # Unlike the `FluxAdditionalInputsStep`, we don't overwrite the `block.height` and `block.width` height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor) if not hasattr(block_state, "image_height"): block_state.image_height = height @@ -303,6 +303,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip class FluxKontextSetResolutionStep(ModularPipelineBlocks): model_name = "flux-kontext" + @property def description(self): return ( "Determines the height and width to be used during the subsequent computations.\n" diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py index 39ad3d911520..279cd1f305ff 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py @@ -32,8 +32,8 @@ FluxVaeEncoderStep, ) from .inputs import ( - FluxInputsDynamicStep, - FluxKontextInputsDynamicStep, + FluxAdditionalInputsStep, + FluxKontextAdditionalInputsStep, FluxKontextSetResolutionStep, FluxTextInputStep, ) @@ -43,16 +43,13 @@ # vae encoder (run before before_denoise) -FluxImg2ImgVaeEncoderBlocks = InsertableDict( - [("preprocess", FluxProcessImagesInputStep()), ("encode", FluxVaeEncoderStep())] -) # auto_docstring class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): model_name = "flux" - block_classes = FluxImg2ImgVaeEncoderBlocks.values() - block_names = FluxImg2ImgVaeEncoderBlocks.keys() + block_classes = [FluxProcessImagesInputStep(), FluxVaeEncoderStep()] + block_names = ["preprocess", "encode"] @property def description(self) -> str: @@ -75,18 +72,11 @@ def description(self): ) # before_denoise: text2img -FluxBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ] -) - # auto_docstring class FluxBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxBeforeDenoiseBlocks.values() - block_names = FluxBeforeDenoiseBlocks.keys() + model_name = "flux" + block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] @property def description(self): @@ -94,20 +84,11 @@ def description(self): # before_denoise: img2img -FluxImg2ImgBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxImg2ImgSetTimestepsStep()), - ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ] -) - # auto_docstring class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): model_name = "flux" - block_classes = FluxImg2ImgBeforeDenoiseBlocks.values() - block_names = FluxImg2ImgBeforeDenoiseBlocks.keys() + block_classes = [FluxPrepareLatentsStep(), FluxImg2ImgSetTimestepsStep(), FluxImg2ImgPrepareLatentsStep(), FluxRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_img2img_latents", "prepare_rope_inputs"] @property def description(self): @@ -134,15 +115,12 @@ def 
description(self): # inputs: text2image/img2img -FluxImg2ImgBlocks = InsertableDict( - [("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())] -) # auto_docstring class FluxImg2ImgInputStep(SequentialPipelineBlocks): model_name = "flux" - block_classes = FluxImg2ImgBlocks.values() - block_names = FluxImg2ImgBlocks.keys() + block_classes = [FluxTextInputStep(), FluxAdditionalInputsStep()] + block_names = ["text_inputs", "additional_inputs"] @property def description(self): @@ -178,10 +156,7 @@ class FluxCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return ( - "Core step that performs the denoising process. \n" - + " - `FluxAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `FluxDenoiseStep` (denoise) iteratively denoises the latents.\n" + "Core step that performs the denoising process for Flux.\n" + "This step supports text-to-image and image-to-image tasks for Flux:\n" + " - for image-to-image generation, you need to provide `image_latents`\n" + " - for text-to-image generation, all you need to provide is prompt embeddings." diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py index 25508206d875..3c935f0446c0 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py @@ -32,8 +32,8 @@ FluxVaeEncoderStep, ) from .inputs import ( - FluxInputsDynamicStep, - FluxKontextInputsDynamicStep, + FluxAdditionalInputsStep, + FluxKontextAdditionalInputsStep, FluxKontextSetResolutionStep, FluxTextInputStep, ) @@ -43,17 +43,11 @@ # Flux Kontext vae encoder (run before before_denoise) - -FluxKontextVaeEncoderBlocks = InsertableDict( - [("preprocess", FluxKontextProcessImagesInputStep()), ("encode", FluxVaeEncoderStep(sample_mode="argmax"))] -) - - class FluxKontextVaeEncoderStep(SequentialPipelineBlocks): model_name = "flux-kontext" - block_classes = FluxKontextVaeEncoderBlocks.values() - block_names = FluxKontextVaeEncoderBlocks.keys() + block_classes = [FluxKontextProcessImagesInputStep(), FluxVaeEncoderStep(sample_mode="argmax")] + block_names = ["preprocess", "encode"] @property def description(self) -> str: @@ -61,8 +55,10 @@ def description(self) -> str: class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks): + model_name = "flux-kontext" + block_classes = [FluxKontextVaeEncoderStep] - block_names = ["img2img"] + block_names = ["image_conditioned"] block_trigger_inputs = ["image"] @property @@ -76,18 +72,12 @@ def description(self): # before_denoise: text2img -FluxBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ] -) +class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "flux-kontext" -class FluxBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxBeforeDenoiseBlocks.values() - block_names = FluxBeforeDenoiseBlocks.keys() + block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] @property def description(self): @@ -95,19 +85,11 @@ def description(self): # before_denoise: FluxKontext +class 
FluxKontextImageConditionedBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "flux-kontext" -FluxKontextBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxKontextRoPEInputsStep()), - ] -) - - -class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxKontextBeforeDenoiseBlocks.values() - block_names = FluxKontextBeforeDenoiseBlocks.keys() + block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxKontextRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] @property def description(self): @@ -118,8 +100,10 @@ def description(self): class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxKontextBeforeDenoiseStep, FluxBeforeDenoiseStep] - block_names = ["img2img", "text2image"] + model_name = "flux-kontext" + + block_classes = [FluxKontextImageConditionedBeforeDenoiseStep, FluxKontextBeforeDenoiseStep] + block_names = ["image_conditioned", "text2image"] block_trigger_inputs = ["image_latents", None] @property @@ -127,25 +111,15 @@ def description(self): return ( "Before denoise step that prepare the inputs for the denoise step.\n" + "This is an auto pipeline block that works for text2image.\n" - + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" - + " - `FluxKontextBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" + + " - `FluxKontextBeforeDenoiseStep` (text2image) is used.\n" + + " - `FluxKontextImageConditionedBeforeDenoiseStep` (image_conditioned) is used when only `image_latents` is provided.\n" ) # inputs: Flux Kontext - -FluxKontextBlocks = InsertableDict( - [ - ("set_resolution", FluxKontextSetResolutionStep()), - ("text_inputs", FluxTextInputStep()), - ("additional_inputs", FluxKontextInputsDynamicStep()), - ] -) - - class FluxKontextInputStep(SequentialPipelineBlocks): model_name = "flux-kontext" - block_classes = FluxKontextBlocks.values() - block_names = FluxKontextBlocks.keys() + block_classes = [FluxKontextSetResolutionStep(), FluxTextInputStep(), FluxKontextAdditionalInputsStep()] + block_names = ["set_resolution", "text_inputs", "additional_inputs"] @property def description(self): @@ -159,7 +133,7 @@ def description(self): class FluxKontextAutoInputStep(AutoPipelineBlocks): model_name = "flux-kontext" block_classes = [FluxKontextInputStep, FluxTextInputStep] - block_names = ["img2img", "text2img"] + block_names = ["image_conditioned", "text2image"] block_trigger_inputs = ["image_latents", None] @property @@ -167,7 +141,7 @@ def description(self): return ( "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" " This is an auto pipeline block that works for text2image/img2img tasks.\n" - + " - `FluxKontextInputStep` (img2img) is used when `image_latents` is provided.\n" + + " - `FluxKontextInputStep` (image_conditioned) is used when `image_latents` is provided.\n" + " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present." ) @@ -181,12 +155,9 @@ class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return ( - "Core step that performs the denoising process. 
\n" - + " - `FluxKontextAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `FluxKontextAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `FluxKontextDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step supports text-to-image and image-to-image tasks for Flux-Kontext:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" + "Core step that performs the denoising process for Flux Kontext.\n" + + "This step supports text-to-image and image-conditioned tasks for Flux Kontext:\n" + + " - for image-conditioned generation, you need to provide `image_latents`\n" + " - for text-to-image generation, all you need to provide is prompt embeddings." ) @@ -200,15 +171,13 @@ def description(self): ] ) - - class FluxKontextAutoBlocks(SequentialPipelineBlocks): model_name = "flux-kontext" block_classes = AUTO_BLOCKS_KONTEXT.values() block_names = AUTO_BLOCKS_KONTEXT.keys() _workflow_map = { - "img2img": {"image": True, "prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, "text2image": {"prompt": True}, } diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py index bd3b4152c231..8144294c8eed 100644 --- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py +++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py @@ -77,8 +77,10 @@ def test_float16_inference(self): FLUX_IMAGE2IMAGE_WORKFLOWS = { "image2image": [ ("text_encoder", "FluxTextEncoderStep"), - ("vae_encoder", "FluxVaeEncoderStep"), - ("input", "FluxImg2ImgInputStep"), + ("vae_encoder.preprocess", "FluxProcessImagesInputStep"), + ("vae_encoder.encode", "FluxVaeEncoderStep"), + ("input", "FluxTextInputStep"), + ("additional_inputs", "FluxAdditionalInputsStep"), ("prepare_latents", "FluxPrepareLatentsStep"), ("set_timesteps", "FluxImg2ImgSetTimestepsStep"), ("prepare_img2img_latents", "FluxImg2ImgPrepareLatentsStep"), @@ -156,28 +158,27 @@ def test_float16_inference(self): FLUX_KONTEXT_WORKFLOWS = { "text2image": [ - [ ("text_encoder", "FluxTextEncoderStep"), - ("input", "FluxKontextInputStep"), - ("prepare_latents", "FluxPrepareLatentsStep"), - ("set_timesteps", "FluxSetTimestepsStep"), - ("prepare_rope_inputs", "FluxKontextRoPEInputsStep"), - ("denoise", "FluxKontextDenoiseStep"), + ("denoise.input", "FluxTextInputStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxKontextDenoiseStep"), ("decode", "FluxDecodeStep"), - ] - ], - "image2image": [ - [ + ], + "image_conditioned": [ ("text_encoder", "FluxTextEncoderStep"), - ("vae_encoder", "FluxVaeEncoderStep"), - ("input", "FluxKontextInputStep"), - ("prepare_latents", "FluxPrepareLatentsStep"), - ("set_timesteps", "FluxSetTimestepsStep"), - ("prepare_rope_inputs", "FluxKontextRoPEInputsStep"), - ("denoise", "FluxKontextDenoiseStep"), + ("vae_encoder.preprocess", "FluxKontextProcessImagesInputStep"), + ("vae_encoder.encode", "FluxVaeEncoderStep"), + ("denoise.input.set_resolution", "FluxKontextSetResolutionStep"), + ("denoise.input.text_inputs", "FluxTextInputStep"), + ("denoise.input.additional_inputs", "FluxKontextAdditionalInputsStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + 
("denoise.before_denoise.prepare_rope_inputs", "FluxKontextRoPEInputsStep"), + ("denoise.denoise", "FluxKontextDenoiseStep"), ("decode", "FluxDecodeStep"), ] - ] } class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin): From 97111e06e6abad9faf47b6c5485d6e45f0d0235c Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 07:12:44 +0000 Subject: [PATCH 44/58] add test for flux-klein --- .../flux2/modular_blocks_flux2_klein.py | 56 ++++++++++++++++++- .../flux2/modular_blocks_flux2_klein_base.py | 55 +++++++++++++++++- .../modular_pipeline_utils.py | 3 +- .../test_modular_pipeline_flux2_klein.py | 35 +++++++++++- 4 files changed, 140 insertions(+), 9 deletions(-) diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py index 9c8b891335e0..7a963cf76338 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py @@ -81,7 +81,6 @@ def description(self): Flux2KleinCoreDenoiseBlocks = InsertableDict( [ ("input", Flux2TextInputStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), ("prepare_rope_inputs", Flux2RoPEInputsStep()), @@ -100,7 +99,7 @@ class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return ( - "Core denoise step that performs the denoising process for Flux2-Klein (distilled model)." + "Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image generation." ) @property @@ -114,6 +113,57 @@ def outputs(self): ] +Flux2KleinImageConditionedCoreDenoiseBlocks = InsertableDict( + [ + ("input", Flux2TextInputStep()), + ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_rope_inputs", Flux2RoPEInputsStep()), + ("denoise", Flux2KleinDenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), + ] +) + + +class Flux2KleinImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "flux2-klein" + + block_classes = Flux2KleinImageConditionedCoreDenoiseBlocks.values() + block_names = Flux2KleinImageConditionedCoreDenoiseBlocks.keys() + + @property + def description(self): + return ( + "Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning." 
+ ) + + @property + def outputs(self): + return [ + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The latents from the denoising step.", + ) + ] + + +class Flux2KleinAutoCoreDenoiseStep(AutoPipelineBlocks): + model_name = "flux2-klein" + block_classes = [Flux2KleinImageConditionedCoreDenoiseStep, Flux2KleinCoreDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Auto core denoise step that performs the denoising process for Flux2-Klein.\n" + "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n" + " - `Flux2KleinCoreDenoiseStep` is used for text-to-image generation.\n" + " - `Flux2KleinImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" + ) + ### ### Auto blocks @@ -125,7 +175,7 @@ class Flux2KleinAutoBlocks(SequentialPipelineBlocks): block_classes = [ Flux2KleinTextEncoderStep(), Flux2KleinAutoVaeEncoderStep(), - Flux2KleinCoreDenoiseStep(), + Flux2KleinAutoCoreDenoiseStep(), Flux2DecodeStep(), ] block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py index 4ffbdbac8e9f..391a0116639e 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py @@ -77,11 +77,11 @@ def description(self): ### ### Core denoise ### + Flux2KleinBaseCoreDenoiseBlocks = InsertableDict( [ ("input", Flux2KleinBaseTextInputStep()), ("prepare_latents", Flux2PrepareLatentsStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), ("denoise", Flux2KleinBaseDenoiseStep()), @@ -98,7 +98,42 @@ class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return ( - "Core denoise step that performs the denoising process for Flux2-Klein (base model)." + "Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation." + ) + + @property + def outputs(self): + return [ + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The latents from the denoising step.", + ) + ] + + +Flux2KleinBaseImageConditionedCoreDenoiseBlocks = InsertableDict( + [ + ("input", Flux2KleinBaseTextInputStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), + ("denoise", Flux2KleinBaseDenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), + ] +) + + +class Flux2KleinBaseImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + model_name = "flux2-klein" + block_classes = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.values() + block_names = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.keys() + + @property + def description(self): + return ( + "Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning." 
) @property @@ -111,6 +146,20 @@ def outputs(self): ) ] +class Flux2KleinBaseAutoCoreDenoiseStep(AutoPipelineBlocks): + model_name = "flux2-klein" + block_classes = [Flux2KleinBaseImageConditionedCoreDenoiseStep, Flux2KleinBaseCoreDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Auto core denoise step that performs the denoising process for Flux2-Klein (base model).\n" + "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n" + " - `Flux2KleinBaseCoreDenoiseStep` is used for text-to-image generation.\n" + " - `Flux2KleinBaseImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" + ) ### ### Auto blocks @@ -123,7 +172,7 @@ class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): block_classes = [ Flux2KleinBaseTextEncoderStep(), Flux2KleinBaseAutoVaeEncoderStep(), - Flux2KleinBaseCoreDenoiseStep(), + Flux2KleinBaseAutoCoreDenoiseStep(), Flux2DecodeStep(), ] block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 754267c519b6..5fe4a2e59c6d 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -17,7 +17,8 @@ import warnings from collections import OrderedDict from dataclasses import dataclass, field, fields -from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union, get_args, get_origin +from types import UnionType import PIL.Image import torch diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py index 26653b20f8c4..0ad3e0d3bcbf 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py @@ -28,13 +28,27 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin -class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): +FLUX2_KLEIN_WORKFLOWS = { + "text2image": [ + ("text_encoder", "Flux2KleinTextEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2KleinDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + +class TestFlux2KleinModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2KleinModularPipeline pipeline_blocks_class = Flux2KleinAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular" params = frozenset(["prompt", "height", "width"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX2_KLEIN_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -55,13 +69,30 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) -class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): +FLUX2_KLEIN_IMAGE_CONDITIONED_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder", "Flux2KleinTextEncoderStep"), + ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"), + ("vae_encoder.encode", 
"Flux2VaeEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2KleinDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + +class TestFlux2KleinImageConditionedModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2KleinModularPipeline pipeline_blocks_class = Flux2KleinAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular" params = frozenset(["prompt", "height", "width", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX2_KLEIN_IMAGE_CONDITIONED_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) From 2fd26329b3945358be7c9d13b5d740aeea394f00 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 07:15:59 +0000 Subject: [PATCH 45/58] sdxl: modular_blocks.py -> modular_blocks_stable_diffusion_xl.py --- .../modular_pipelines/stable_diffusion_xl/__init__.py | 4 ++-- ...odular_blocks.py => modular_blocks_stable_diffusion_xl.py} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename src/diffusers/modular_pipelines/stable_diffusion_xl/{modular_blocks.py => modular_blocks_stable_diffusion_xl.py} (100%) diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py index c154bc8bd4fe..44f1c555cef3 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py @@ -21,7 +21,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["modular_blocks"] = ["StableDiffusionXLAutoBlocks"] + _import_structure["modular_blocks_stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks"] _import_structure["modular_pipeline"] = ["StableDiffusionXLModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -31,7 +31,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .modular_blocks import StableDiffusionXLAutoBlocks + from .modular_blocks_stable_diffusion_xl import StableDiffusionXLAutoBlocks from .modular_pipeline import StableDiffusionXLModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py similarity index 100% rename from src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py rename to src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py From c0b5820f8b09648a2f49ee5d1d16bed15d70f02d Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 07:19:09 +0000 Subject: [PATCH 46/58] style --- .../flux/modular_blocks_flux.py | 23 ++++---- .../flux/modular_blocks_flux_kontext.py | 14 ++--- .../modular_pipelines/flux2/__init__.py | 10 ++-- .../flux2/modular_blocks_flux2.py | 16 +++--- .../flux2/modular_blocks_flux2_klein.py | 18 ++----- .../flux2/modular_blocks_flux2_klein_base.py | 19 +++---- .../modular_pipeline_utils.py | 6 ++- .../modular_pipelines/qwenimage/__init__.py | 2 +- .../modular_blocks_stable_diffusion_xl.py | 52 
+++++++++++++++---- .../wan/modular_blocks_wan22.py | 1 - .../wan/modular_blocks_wan_i2v.py | 4 +- .../z_image/modular_blocks_z_image.py | 5 +- .../flux/test_modular_pipeline_flux.py | 44 ++++++++-------- .../flux2/test_modular_pipeline_flux2.py | 4 ++ .../test_modular_pipeline_flux2_klein.py | 2 + .../qwen/test_modular_pipeline_qwenimage.py | 3 ++ ...st_modular_pipeline_stable_diffusion_xl.py | 5 +- .../test_modular_pipelines_common.py | 4 +- .../z_image/test_modular_pipeline_z_image.py | 1 + 19 files changed, 131 insertions(+), 102 deletions(-) diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py index 279cd1f305ff..6cb6b7811eb6 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py @@ -18,23 +18,19 @@ from .before_denoise import ( FluxImg2ImgPrepareLatentsStep, FluxImg2ImgSetTimestepsStep, - FluxKontextRoPEInputsStep, FluxPrepareLatentsStep, FluxRoPEInputsStep, FluxSetTimestepsStep, ) from .decoders import FluxDecodeStep -from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep +from .denoise import FluxDenoiseStep from .encoders import ( - FluxKontextProcessImagesInputStep, FluxProcessImagesInputStep, FluxTextEncoderStep, FluxVaeEncoderStep, ) from .inputs import ( FluxAdditionalInputsStep, - FluxKontextAdditionalInputsStep, - FluxKontextSetResolutionStep, FluxTextInputStep, ) @@ -44,6 +40,7 @@ # vae encoder (run before before_denoise) + # auto_docstring class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): model_name = "flux" @@ -55,6 +52,7 @@ class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): def description(self) -> str: return "Vae encoder step that preprocess andencode the image inputs into their latent representations." + # auto_docstring class FluxAutoVaeEncoderStep(AutoPipelineBlocks): model_name = "flux" @@ -71,6 +69,7 @@ def description(self): + " - if `image` is not provided, step will be skipped." ) + # before_denoise: text2img # auto_docstring class FluxBeforeDenoiseStep(SequentialPipelineBlocks): @@ -87,7 +86,12 @@ def description(self): # auto_docstring class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): model_name = "flux" - block_classes = [FluxPrepareLatentsStep(), FluxImg2ImgSetTimestepsStep(), FluxImg2ImgPrepareLatentsStep(), FluxRoPEInputsStep()] + block_classes = [ + FluxPrepareLatentsStep(), + FluxImg2ImgSetTimestepsStep(), + FluxImg2ImgPrepareLatentsStep(), + FluxRoPEInputsStep(), + ] block_names = ["prepare_latents", "set_timesteps", "prepare_img2img_latents", "prepare_rope_inputs"] @property @@ -113,9 +117,9 @@ def description(self): ) - # inputs: text2image/img2img + # auto_docstring class FluxImg2ImgInputStep(SequentialPipelineBlocks): model_name = "flux" @@ -173,6 +177,7 @@ def description(self): ] ) + # auto_docstring class FluxAutoBlocks(SequentialPipelineBlocks): model_name = "flux" @@ -187,6 +192,4 @@ class FluxAutoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Auto Modular pipeline for text-to-image and image-to-image using Flux." - ) + return "Auto Modular pipeline for text-to-image and image-to-image using Flux." 
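A minimal, purely illustrative sketch of the trigger-based dispatch the Flux auto blocks above rely on; `pick_block` and its arguments are hypothetical and only show how a `block_trigger_inputs` list with a trailing `None` acts as the text2image fallback, not how `AutoPipelineBlocks` is actually implemented.

# Hypothetical sketch, not the diffusers implementation.
def pick_block(block_names, block_trigger_inputs, provided_inputs):
    # Walk the triggers in declaration order; the first trigger that is present
    # among the provided inputs wins. A trigger of None matches unconditionally
    # and therefore serves as the fallback branch.
    for name, trigger in zip(block_names, block_trigger_inputs):
        if trigger is None or provided_inputs.get(trigger) is not None:
            return name
    return None

# With image_latents supplied the "img2img" branch is selected; prompt-only
# input falls through to the "text2image" fallback.
pick_block(["img2img", "text2image"], ["image_latents", None], {"image_latents": object()})
pick_block(["img2img", "text2image"], ["image_latents", None], {"prompt": "a cat"})
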
diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py index 3c935f0446c0..eb15144ec910 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py @@ -16,23 +16,19 @@ from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( - FluxImg2ImgPrepareLatentsStep, - FluxImg2ImgSetTimestepsStep, FluxKontextRoPEInputsStep, FluxPrepareLatentsStep, FluxRoPEInputsStep, FluxSetTimestepsStep, ) from .decoders import FluxDecodeStep -from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep +from .denoise import FluxKontextDenoiseStep from .encoders import ( FluxKontextProcessImagesInputStep, - FluxProcessImagesInputStep, FluxTextEncoderStep, FluxVaeEncoderStep, ) from .inputs import ( - FluxAdditionalInputsStep, FluxKontextAdditionalInputsStep, FluxKontextSetResolutionStep, FluxTextInputStep, @@ -73,6 +69,7 @@ def description(self): # before_denoise: text2img + class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): model_name = "flux-kontext" @@ -115,6 +112,7 @@ def description(self): + " - `FluxKontextImageConditionedBeforeDenoiseStep` (image_conditioned) is used when only `image_latents` is provided.\n" ) + # inputs: Flux Kontext class FluxKontextInputStep(SequentialPipelineBlocks): model_name = "flux-kontext" @@ -171,6 +169,7 @@ def description(self): ] ) + class FluxKontextAutoBlocks(SequentialPipelineBlocks): model_name = "flux-kontext" @@ -183,7 +182,4 @@ class FluxKontextAutoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline for image-to-image using Flux Kontext." - ) - + return "Modular pipeline for image-to-image using Flux Kontext." 
diff --git a/src/diffusers/modular_pipelines/flux2/__init__.py b/src/diffusers/modular_pipelines/flux2/__init__.py index bf193df113ec..d7cc8badcaf7 100644 --- a/src/diffusers/modular_pipelines/flux2/__init__.py +++ b/src/diffusers/modular_pipelines/flux2/__init__.py @@ -23,9 +23,13 @@ else: _import_structure["encoders"] = ["Flux2RemoteTextEncoderStep"] _import_structure["modular_blocks_flux2"] = ["Flux2AutoBlocks"] - _import_structure["modular_blocks_flux2_klein_base"] = ["Flux2KleinBaseAutoBlocks"] _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks"] - _import_structure["modular_pipeline"] = ["Flux2ModularPipeline", "Flux2KleinModularPipeline", "Flux2KleinBaseModularPipeline"] + _import_structure["modular_blocks_flux2_klein_base"] = ["Flux2KleinBaseAutoBlocks"] + _import_structure["modular_pipeline"] = [ + "Flux2KleinBaseModularPipeline", + "Flux2KleinModularPipeline", + "Flux2ModularPipeline", + ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -36,8 +40,8 @@ else: from .encoders import Flux2RemoteTextEncoderStep from .modular_blocks_flux2 import Flux2AutoBlocks - from .modular_blocks_flux2_klein_base import Flux2KleinBaseAutoBlocks from .modular_blocks_flux2_klein import Flux2KleinAutoBlocks + from .modular_blocks_flux2_klein_base import Flux2KleinBaseAutoBlocks from .modular_pipeline import Flux2KleinBaseModularPipeline, Flux2KleinModularPipeline, Flux2ModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py index 1f2f38571b9b..d4207aa6b637 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py @@ -30,7 +30,6 @@ from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep from .denoise import Flux2DenoiseStep from .encoders import ( - Flux2RemoteTextEncoderStep, Flux2TextEncoderStep, Flux2VaeEncoderStep, ) @@ -101,9 +100,7 @@ class Flux2CoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-dev." - ) + return "Core denoise step that performs the denoising process for Flux2-dev." @property def outputs(self): @@ -139,9 +136,7 @@ class Flux2ImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-dev with image conditioning." - ) + return "Core denoise step that performs the denoising process for Flux2-dev with image conditioning." @property def outputs(self): @@ -153,6 +148,7 @@ def outputs(self): ) ] + class Flux2AutoCoreDenoiseStep(AutoPipelineBlocks): model_name = "flux2" block_classes = [Flux2ImageConditionedCoreDenoiseStep, Flux2CoreDenoiseStep] @@ -168,6 +164,7 @@ def description(self): " - `Flux2ImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" ) + AUTO_BLOCKS = InsertableDict( [ ("text_encoder", Flux2TextEncoderStep()), @@ -177,6 +174,7 @@ def description(self): ] ) + # auto_docstring class Flux2AutoBlocks(SequentialPipelineBlocks): model_name = "flux2" @@ -190,9 +188,7 @@ class Flux2AutoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2." - ) + return "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2." 
@property def outputs(self): diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py index 7a963cf76338..72bf93029494 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py @@ -21,21 +21,18 @@ from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( - Flux2KleinBaseRoPEInputsStep, Flux2PrepareImageLatentsStep, Flux2PrepareLatentsStep, Flux2RoPEInputsStep, Flux2SetTimestepsStep, ) from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep -from .denoise import Flux2KleinBaseDenoiseStep, Flux2KleinDenoiseStep +from .denoise import Flux2KleinDenoiseStep from .encoders import ( - Flux2KleinBaseTextEncoderStep, Flux2KleinTextEncoderStep, Flux2VaeEncoderStep, ) from .inputs import ( - Flux2KleinBaseTextInputStep, Flux2ProcessImagesInputStep, Flux2TextInputStep, ) @@ -98,9 +95,7 @@ class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image generation." - ) + return "Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image generation." @property def outputs(self): @@ -134,9 +129,7 @@ class Flux2KleinImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning." - ) + return "Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning." @property def outputs(self): @@ -169,6 +162,7 @@ def description(self): ### Auto blocks ### + # auto_docstring class Flux2KleinAutoBlocks(SequentialPipelineBlocks): model_name = "flux2-klein" @@ -186,9 +180,7 @@ class Flux2KleinAutoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein." - ) + return "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein." 
@property def outputs(self): diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py index 391a0116639e..5fff49a0f8c0 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py @@ -24,20 +24,17 @@ Flux2KleinBaseRoPEInputsStep, Flux2PrepareImageLatentsStep, Flux2PrepareLatentsStep, - Flux2RoPEInputsStep, Flux2SetTimestepsStep, ) from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep -from .denoise import Flux2KleinBaseDenoiseStep, Flux2KleinDenoiseStep +from .denoise import Flux2KleinBaseDenoiseStep from .encoders import ( Flux2KleinBaseTextEncoderStep, - Flux2KleinTextEncoderStep, Flux2VaeEncoderStep, ) from .inputs import ( Flux2KleinBaseTextInputStep, Flux2ProcessImagesInputStep, - Flux2TextInputStep, ) @@ -97,9 +94,7 @@ class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation." - ) + return "Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation." @property def outputs(self): @@ -132,9 +127,7 @@ class Flux2KleinBaseImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning." - ) + return "Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning." @property def outputs(self): @@ -146,6 +139,7 @@ def outputs(self): ) ] + class Flux2KleinBaseAutoCoreDenoiseStep(AutoPipelineBlocks): model_name = "flux2-klein" block_classes = [Flux2KleinBaseImageConditionedCoreDenoiseStep, Flux2KleinBaseCoreDenoiseStep] @@ -161,6 +155,7 @@ def description(self): " - `Flux2KleinBaseImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" ) + ### ### Auto blocks ### @@ -183,9 +178,7 @@ class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model)." - ) + return "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model)." @property def outputs(self): diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 5fe4a2e59c6d..9bc9d0bdefbd 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -16,9 +16,9 @@ import re import warnings from collections import OrderedDict -from dataclasses import dataclass, field, fields -from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union, get_args, get_origin +from dataclasses import dataclass, field from types import UnionType +from typing import Any, Dict, List, Literal, Tuple, Type, Union, get_args, get_origin import PIL.Image import torch @@ -1032,6 +1032,8 @@ def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> combined_dict[output_param.name] = output_param return list(combined_dict.values()) + + def generate_modular_model_card_content(blocks) -> Dict[str, Any]: """ Generate model card content for a modular pipeline. 
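The new `get_origin`/`get_args`/`UnionType` imports in modular_pipeline_utils.py suggest type-hint introspection when formatting parameter docs. The helper below is only a guess at that kind of use, written for illustration; it is not the code added by this patch.

from types import UnionType
from typing import List, Optional, Union, get_args, get_origin

def type_hint_name(hint) -> str:
    # Render a type hint as a readable string, e.g. Optional[int] -> "int | NoneType".
    origin = get_origin(hint)
    if origin in (Union, UnionType):
        return " | ".join(type_hint_name(arg) for arg in get_args(hint))
    if origin is not None:  # parameterized generics such as List[float]
        args = ", ".join(type_hint_name(arg) for arg in get_args(hint))
        return f"{origin.__name__}[{args}]"
    return getattr(hint, "__name__", str(hint))

# type_hint_name(Optional[int]) -> "int | NoneType"
# type_hint_name(List[float])   -> "list[float]"
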
diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py index 1c7098a09e08..2e6af4495b37 100644 --- a/src/diffusers/modular_pipelines/qwenimage/__init__.py +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -28,8 +28,8 @@ _import_structure["modular_pipeline"] = [ "QwenImageEditModularPipeline", "QwenImageEditPlusModularPipeline", - "QwenImageModularPipeline", "QwenImageLayeredModularPipeline", + "QwenImageModularPipeline", ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py index 6cba442ca9db..8034e69e2394 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py @@ -14,7 +14,6 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( StableDiffusionXLControlNetInputStep, StableDiffusionXLControlNetUnionInputStep, @@ -303,20 +302,53 @@ class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): "controlnet_inpainting": {"control_image": True, "mask_image": True, "image": True, "prompt": True}, "controlnet_union_text2image": {"control_image": True, "control_mode": True, "prompt": True}, "controlnet_union_image2image": {"control_image": True, "control_mode": True, "image": True, "prompt": True}, - "controlnet_union_inpainting": {"control_image": True, "control_mode": True, "mask_image": True, "image": True, "prompt": True}, + "controlnet_union_inpainting": { + "control_image": True, + "control_mode": True, + "mask_image": True, + "image": True, + "prompt": True, + }, "ip_adapter_text2image": {"ip_adapter_image": True, "prompt": True}, "ip_adapter_image2image": {"ip_adapter_image": True, "image": True, "prompt": True}, "ip_adapter_inpainting": {"ip_adapter_image": True, "mask_image": True, "image": True, "prompt": True}, "ip_adapter_controlnet_text2image": {"ip_adapter_image": True, "control_image": True, "prompt": True}, - "ip_adapter_controlnet_image2image": {"ip_adapter_image": True, "control_image": True, "image": True, "prompt": True}, - "ip_adapter_controlnet_inpainting": {"ip_adapter_image": True, "control_image": True, "mask_image": True, "image": True, "prompt": True}, - "ip_adapter_controlnet_union_text2image": {"ip_adapter_image": True, "control_image": True, "control_mode": True, "prompt": True}, - "ip_adapter_controlnet_union_image2image": {"ip_adapter_image": True, "control_image": True, "control_mode": True, "image": True, "prompt": True}, - "ip_adapter_controlnet_union_inpainting": {"ip_adapter_image": True, "control_image": True, "control_mode": True, "mask_image": True, "image": True, "prompt": True}, + "ip_adapter_controlnet_image2image": { + "ip_adapter_image": True, + "control_image": True, + "image": True, + "prompt": True, + }, + "ip_adapter_controlnet_inpainting": { + "ip_adapter_image": True, + "control_image": True, + "mask_image": True, + "image": True, + "prompt": True, + }, + "ip_adapter_controlnet_union_text2image": { + "ip_adapter_image": True, + "control_image": True, + "control_mode": True, + "prompt": True, + }, + "ip_adapter_controlnet_union_image2image": { + "ip_adapter_image": True, + "control_image": True, + "control_mode": 
True, + "image": True, + "prompt": True, + }, + "ip_adapter_controlnet_union_inpainting": { + "ip_adapter_image": True, + "control_image": True, + "control_mode": True, + "mask_image": True, + "image": True, + "prompt": True, + }, } @property def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL." - ) + return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL." diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py index 45985249c6d4..613de44cf7dc 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py @@ -66,7 +66,6 @@ def description(self): # ==================== - # auto_docstring class Wan22Blocks(SequentialPipelineBlocks): model_name = "wan" diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py index a3bb30557600..5ef170d3379f 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py @@ -202,6 +202,4 @@ class WanImage2VideoAutoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Auto Modular pipeline for image-to-video using Wan." - ) + return "Auto Modular pipeline for image-to-video using Wan." diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py index b85c959c661c..fcfe12526c25 100644 --- a/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py +++ b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py @@ -14,7 +14,6 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict from .before_denoise import ( ZImageAdditionalInputsStep, ZImagePrepareLatentsStep, @@ -40,6 +39,7 @@ # 1. DENOISE # ==================== + # text2image: inputs(text) -> set_timesteps -> prepare_latents -> denoise # auto_docstring class ZImageCoreDenoiseStep(SequentialPipelineBlocks): @@ -63,7 +63,7 @@ def description(self): ) -# image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise +# image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise # auto_docstring class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks): block_classes = [ @@ -152,4 +152,3 @@ class ZImageAutoBlocks(SequentialPipelineBlocks): @property def description(self) -> str: return "Auto Modular pipeline for text-to-image and image-to-image using ZImage." 
- diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py index 8144294c8eed..f05b9202eba5 100644 --- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py +++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py @@ -33,7 +33,6 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin - FLUX_TEXT2IMAGE_WORKFLOWS = { "text2image": [ ("text_encoder", "FluxTextEncoderStep"), @@ -90,6 +89,7 @@ def test_float16_inference(self): ] } + class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxModularPipeline pipeline_blocks_class = FluxAutoBlocks @@ -156,31 +156,33 @@ def test_save_from_pretrained(self): def test_float16_inference(self): super().test_float16_inference(8e-2) + FLUX_KONTEXT_WORKFLOWS = { "text2image": [ - ("text_encoder", "FluxTextEncoderStep"), - ("denoise.input", "FluxTextInputStep"), - ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), - ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), - ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), - ("denoise.denoise", "FluxKontextDenoiseStep"), - ("decode", "FluxDecodeStep"), - ], + ("text_encoder", "FluxTextEncoderStep"), + ("denoise.input", "FluxTextInputStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxKontextDenoiseStep"), + ("decode", "FluxDecodeStep"), + ], "image_conditioned": [ - ("text_encoder", "FluxTextEncoderStep"), - ("vae_encoder.preprocess", "FluxKontextProcessImagesInputStep"), - ("vae_encoder.encode", "FluxVaeEncoderStep"), - ("denoise.input.set_resolution", "FluxKontextSetResolutionStep"), - ("denoise.input.text_inputs", "FluxTextInputStep"), - ("denoise.input.additional_inputs", "FluxKontextAdditionalInputsStep"), - ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), - ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), - ("denoise.before_denoise.prepare_rope_inputs", "FluxKontextRoPEInputsStep"), - ("denoise.denoise", "FluxKontextDenoiseStep"), - ("decode", "FluxDecodeStep"), - ] + ("text_encoder", "FluxTextEncoderStep"), + ("vae_encoder.preprocess", "FluxKontextProcessImagesInputStep"), + ("vae_encoder.encode", "FluxVaeEncoderStep"), + ("denoise.input.set_resolution", "FluxKontextSetResolutionStep"), + ("denoise.input.text_inputs", "FluxTextInputStep"), + ("denoise.input.additional_inputs", "FluxKontextAdditionalInputsStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxKontextRoPEInputsStep"), + ("denoise.denoise", "FluxKontextDenoiseStep"), + ("decode", "FluxDecodeStep"), + ], } + class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxKontextModularPipeline pipeline_blocks_class = FluxKontextAutoBlocks diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py index 290e7244deb3..084f6b3b35b9 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py @@ -27,6 +27,7 @@ from ...testing_utils import floats_tensor, torch_device from ..test_modular_pipelines_common 
import ModularPipelineTesterMixin + FLUX2_TEXT2IMAGE_WORKFLOWS = { "text2image": [ ("text_encoder", "Flux2TextEncoderStep"), @@ -41,6 +42,7 @@ ], } + class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2ModularPipeline pipeline_blocks_class = Flux2AutoBlocks @@ -69,6 +71,7 @@ def get_dummy_inputs(self, seed=0): def test_float16_inference(self): super().test_float16_inference(9e-2) + FLUX2_IMAGE_CONDITIONED_WORKFLOWS = { "image_conditioned": [ ("text_encoder", "Flux2TextEncoderStep"), @@ -86,6 +89,7 @@ def test_float16_inference(self): ], } + class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2ModularPipeline pipeline_blocks_class = Flux2AutoBlocks diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py index 0ad3e0d3bcbf..ad295a961357 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py @@ -41,6 +41,7 @@ ], } + class TestFlux2KleinModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2KleinModularPipeline pipeline_blocks_class = Flux2KleinAutoBlocks @@ -85,6 +86,7 @@ def test_float16_inference(self): ], } + class TestFlux2KleinImageConditionedModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2KleinModularPipeline pipeline_blocks_class = Flux2KleinAutoBlocks diff --git a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py index b2bf55396ae6..1b4a07526639 100644 --- a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py +++ b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py @@ -126,6 +126,7 @@ ], } + class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin): pipeline_class = QwenImageModularPipeline pipeline_blocks_class = QwenImageAutoBlocks @@ -152,6 +153,7 @@ def get_dummy_inputs(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=5e-4) + QWEN_IMAGE_EDIT_WORKFLOWS = { "edit": [ ("text_encoder.resize", "QwenImageEditResizeStep"), @@ -189,6 +191,7 @@ def test_inference_batch_single_identical(self): ], } + class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin): pipeline_class = QwenImageEditModularPipeline pipeline_blocks_class = QwenImageEditAutoBlocks diff --git a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py index 4aec782960b2..ffd71ca5a8d0 100644 --- a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py +++ b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py @@ -347,7 +347,6 @@ class TestSDXLModularPipelineFast( expected_workflow_blocks = TEXT2IMAGE_WORKFLOWS - def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) inputs = { @@ -370,6 +369,7 @@ def test_stable_diffusion_xl_euler(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) + IMAGE2IMAGE_WORKFLOWS = { "image2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), @@ -428,6 +428,7 @@ def test_inference_batch_single_identical(self): ], } + class 
TestSDXLImg2ImgModularPipelineFast( SDXLModularTesterMixin, SDXLModularIPAdapterTesterMixin, @@ -481,6 +482,7 @@ def test_stable_diffusion_xl_euler(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) + INPAINTING_WORKFLOWS = { "inpainting": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), @@ -539,6 +541,7 @@ def test_inference_batch_single_identical(self): ], } + class SDXLInpaintingModularPipelineFastTests( SDXLModularTesterMixin, SDXLModularIPAdapterTesterMixin, diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py index 49f7a62edc2d..f128b0bd3bfa 100644 --- a/tests/modular_pipelines/test_modular_pipelines_common.py +++ b/tests/modular_pipelines/test_modular_pipelines_common.py @@ -364,8 +364,7 @@ def test_workflow_map(self): # Check that the number of blocks matches assert len(actual_blocks) == len(expected_blocks), ( - f"Workflow '{workflow_name}' has {len(actual_blocks)} blocks, " - f"expected {len(expected_blocks)}" + f"Workflow '{workflow_name}' has {len(actual_blocks)} blocks, expected {len(expected_blocks)}" ) # Check that each block name and type matches @@ -377,6 +376,7 @@ def test_workflow_map(self): f"{actual_block.__class__.__name__}, expected {expected_class_name}" ) + class ModularGuiderTesterMixin: def test_guider_cfg(self, expected_max_diff=1e-2): pipe = self.get_pipeline().to(torch_device) diff --git a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py index a16e019768fb..15997931c8d4 100644 --- a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py +++ b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py @@ -42,6 +42,7 @@ ], } + class TestZImageModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = ZImageModularPipeline pipeline_blocks_class = ZImageAutoBlocks From c68127a17e1de232394ae5cb511a6000ba17755d Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 09:45:28 +0000 Subject: [PATCH 47/58] up --- .../flux/modular_blocks_flux_kontext.py | 23 ++++++++++++------- .../flux2/modular_blocks_flux2.py | 13 +++-------- .../flux2/modular_blocks_flux2_klein.py | 9 +++++++- .../flux2/modular_blocks_flux2_klein_base.py | 5 ++++ .../modular_pipelines/modular_pipeline.py | 11 +++++++++ .../wan/modular_blocks_wan.py | 16 ++----------- .../wan/modular_blocks_wan22.py | 16 ++----------- .../wan/modular_blocks_wan22_i2v.py | 18 ++------------- .../wan/modular_blocks_wan_i2v.py | 17 +++++++------- .../z_image/modular_blocks_z_image.py | 21 ++--------------- 10 files changed, 58 insertions(+), 91 deletions(-) diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py index eb15144ec910..5137131923ac 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py @@ -39,6 +39,7 @@ # Flux Kontext vae encoder (run before before_denoise) +# auto_docstring class FluxKontextVaeEncoderStep(SequentialPipelineBlocks): model_name = "flux-kontext" @@ -50,6 +51,7 @@ def description(self) -> str: return "Vae encoder step that preprocess andencode the image inputs into their latent representations." 
+# auto_docstring class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks): model_name = "flux-kontext" @@ -61,15 +63,14 @@ class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks): def description(self): return ( "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block that works for img2img tasks.\n" - + " - `FluxKontextVaeEncoderStep` (img2img) is used when only `image` is provided." + + "This is an auto pipeline block that works for image-conditioned tasks.\n" + + " - `FluxKontextVaeEncoderStep` (image_conditioned) is used when only `image` is provided." + " - if `image` is not provided, step will be skipped." ) # before_denoise: text2img - - +# auto_docstring class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): model_name = "flux-kontext" @@ -78,10 +79,12 @@ class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation." + return "Before denoise step that prepares the inputs for the denoise step for Flux Kontext\n" + "for text-to-image tasks." -# before_denoise: FluxKontext +# before_denoise: image-conditioned +# auto_docstring class FluxKontextImageConditionedBeforeDenoiseStep(SequentialPipelineBlocks): model_name = "flux-kontext" @@ -91,11 +94,12 @@ class FluxKontextImageConditionedBeforeDenoiseStep(SequentialPipelineBlocks): @property def description(self): return ( - "Before denoise step that prepare the inputs for the denoise step\n" - "for img2img/text2img task for Flux Kontext." + "Before denoise step that prepare the inputs for the denoise step for Flux Kontext\n" + "for image-conditioned tasks." ) +# auto_docstring class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks): model_name = "flux-kontext" @@ -114,6 +118,7 @@ def description(self): # inputs: Flux Kontext +# auto_docstring class FluxKontextInputStep(SequentialPipelineBlocks): model_name = "flux-kontext" block_classes = [FluxKontextSetResolutionStep(), FluxTextInputStep(), FluxKontextAdditionalInputsStep()] @@ -128,6 +133,7 @@ def description(self): ) +# auto_docstring class FluxKontextAutoInputStep(AutoPipelineBlocks): model_name = "flux-kontext" block_classes = [FluxKontextInputStep, FluxTextInputStep] @@ -170,6 +176,7 @@ def description(self): ) +# auto_docstring class FluxKontextAutoBlocks(SequentialPipelineBlocks): model_name = "flux-kontext" diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py index d4207aa6b637..d2afed2b938b 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py @@ -42,20 +42,12 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -Flux2VaeEncoderBlocks = InsertableDict( - [ - ("preprocess", Flux2ProcessImagesInputStep()), - ("encode", Flux2VaeEncoderStep()), - ] -) - - # auto_docstring class Flux2VaeEncoderSequentialStep(SequentialPipelineBlocks): model_name = "flux2" - block_classes = Flux2VaeEncoderBlocks.values() - block_names = Flux2VaeEncoderBlocks.keys() + block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] + block_names = ["preprocess", "encode"] @property def description(self) -> str: @@ -151,6 +143,7 @@ def outputs(self): class Flux2AutoCoreDenoiseStep(AutoPipelineBlocks): model_name = "flux2" + block_classes = [Flux2ImageConditionedCoreDenoiseStep, Flux2CoreDenoiseStep] 
block_names = ["image_conditioned", "text2image"] block_trigger_inputs = ["image_latents", None] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py index 72bf93029494..8a95cfd8bd42 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py @@ -45,8 +45,9 @@ ################ +# auto_docstring class Flux2KleinVaeEncoderSequentialStep(SequentialPipelineBlocks): - model_name = "flux2" + model_name = "flux2-klein" block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] block_names = ["preprocess", "encode"] @@ -56,7 +57,10 @@ def description(self) -> str: return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations." +# auto_docstring class Flux2KleinAutoVaeEncoderStep(AutoPipelineBlocks): + model_name = "flux2-klein" + block_classes = [Flux2KleinVaeEncoderSequentialStep] block_names = ["img_conditioning"] block_trigger_inputs = ["image"] @@ -87,6 +91,7 @@ def description(self): ) +# auto_docstring class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks): model_name = "flux2-klein" @@ -121,6 +126,7 @@ def outputs(self): ) +# auto_docstring class Flux2KleinImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): model_name = "flux2-klein" @@ -142,6 +148,7 @@ def outputs(self): ] +# auto_docstring class Flux2KleinAutoCoreDenoiseStep(AutoPipelineBlocks): model_name = "flux2-klein" block_classes = [Flux2KleinImageConditionedCoreDenoiseStep, Flux2KleinCoreDenoiseStep] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py index 5fff49a0f8c0..11ba36c84501 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py @@ -45,6 +45,7 @@ ################ +# auto_docstring class Flux2KleinBaseVaeEncoderSequentialStep(SequentialPipelineBlocks): model_name = "flux2" @@ -56,6 +57,7 @@ def description(self) -> str: return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations." 
+# auto_docstring class Flux2KleinBaseAutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [Flux2KleinBaseVaeEncoderSequentialStep] block_names = ["img_conditioning"] @@ -87,6 +89,7 @@ def description(self): ) +# auto_docstring class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): model_name = "flux2-klein" block_classes = Flux2KleinBaseCoreDenoiseBlocks.values() @@ -120,6 +123,7 @@ def outputs(self): ) +# auto_docstring class Flux2KleinBaseImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): model_name = "flux2-klein" block_classes = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.values() @@ -140,6 +144,7 @@ def outputs(self): ] +# auto_docstring class Flux2KleinBaseAutoCoreDenoiseStep(AutoPipelineBlocks): model_name = "flux2-klein" block_classes = [Flux2KleinBaseImageConditionedCoreDenoiseStep, Flux2KleinBaseCoreDenoiseStep] diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index eff5685f6b05..572bd0af46e3 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -685,6 +685,11 @@ def select_block(self, **kwargs) -> Optional[str]: Select the block to run based on the trigger inputs. Subclasses must implement this method to define the logic for selecting the block. + Note: When trigger inputs include intermediate outputs from earlier blocks, the selection logic should only + depend on the presence or absence of the input (i.e., whether it is None or not), not on its actual value. This + is because `get_execution_blocks()` resolves conditions statically by propagating intermediate output names + without their runtime values. + Args: **kwargs: Trigger input names and their values from the state. @@ -1103,6 +1108,12 @@ def get_execution_blocks(self, **kwargs) -> "SequentialPipelineBlocks": """ Get the blocks that would execute given the specified inputs. + As the traversal walks through sequential blocks, intermediate outputs from resolved blocks are added to the + active inputs. This means conditional blocks that depend on intermediates (e.g., "run img2img if image_latents + is present") will resolve correctly, as long as the condition is based on presence/absence (None or not None), + not on the actual value. + + Args: **kwargs: Input names and values. Only trigger inputs affect block selection. diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py index cd71f4f69999..e4bd483cec26 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py @@ -50,14 +50,7 @@ class WanCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `WanDenoiseStep` is used to denoise the latents\n" - ) + return "denoise block that takes encoded conditions and runs the denoising process." 
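# The `select_block` note added above requires that selection key only on whether a
# trigger input is present (None or not None), never on its runtime value, because
# `get_execution_blocks()` resolves the choice statically by propagating intermediate
# output names without their values. A minimal sketch of a compliant override,
# assuming `select_block` is the override point described in that docstring;
# `MyConditionalBlocks` and its block names are hypothetical placeholders.
from typing import Optional


class MyConditionalBlocks(AutoPipelineBlocks):
    block_names = ["image_conditioned", "text2image"]

    def select_block(self, **kwargs) -> Optional[str]:
        # OK: depends only on presence/absence of the trigger input, so it can be
        # resolved statically. Checking e.g. `kwargs["image_latents"].shape[0]` would
        # depend on the runtime value and break `get_execution_blocks()` resolution.
        if kwargs.get("image_latents") is not None:
            return "image_conditioned"
        return "text2image"


# Usage sketch for the documented `get_execution_blocks()` behaviour: only trigger
# inputs affect the result, which is the SequentialPipelineBlocks that would run.
# resolved_blocks = auto_blocks.get_execution_blocks(image_latents=image_latents)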
# ==================== @@ -77,9 +70,4 @@ class WanBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline blocks for Wan2.1.\n" - + "- `WanTextEncoderStep` is used to encode the text\n" - + "- `WanCoreDenoiseStep` is used to denoise the latents\n" - + "- `WanVaeDecoderStep` is used to decode the latents to images" - ) + return "Modular pipeline blocks for Wan2.1." diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py index 613de44cf7dc..d42db8eab485 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py @@ -51,14 +51,7 @@ class Wan22CoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `Wan22DenoiseStep` is used to denoise the latents in wan2.2\n" - ) + return "denoise block that takes encoded conditions and runs the denoising process." # ==================== @@ -82,9 +75,4 @@ class Wan22Blocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline for text-to-video using Wan2.2.\n" - + " - `WanTextEncoderStep` encodes the text\n" - + " - `Wan22CoreDenoiseStep` denoes the latents\n" - + " - `WanVaeDecoderStep` decodes the latents to video frames\n" - ) + return "Modular pipeline for text-to-video using Wan2.2." diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py index 887e79bb6ad1..b32e65a08e76 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py @@ -77,15 +77,7 @@ class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `Wan22Image2VideoDenoiseStep` is used to denoise the latents in wan2.2\n" - ) + return "denoise block that takes encoded text and image latent conditions and runs the denoising process." # ==================== @@ -111,10 +103,4 @@ class Wan22Image2VideoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline for image-to-video using Wan2.2.\n" - + " - `WanTextEncoderStep` encodes the text\n" - + " - `WanImage2VideoVaeEncoderStep` encodes the image\n" - + " - `Wan22Image2VideoCoreDenoiseStep` denoes the latents\n" - + " - `WanVaeDecoderStep` decodes the latents to video frames\n" - ) + return "Modular pipeline for image-to-video using Wan2.2." 
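# The sub-block roles that the removed descriptions above spelled out are now carried
# by the auto-generated docstrings, but the composition pattern stays the same across
# this patch: an InsertableDict of (name, block) pairs feeds block_classes/block_names
# of a SequentialPipelineBlocks subclass. A sketch using the block classes named in
# the removed Wan2.2 image-to-video description; `WAN22_I2V_BLOCKS` and
# `MyWan22Image2VideoBlocks` are hypothetical placeholders, not identifiers from
# this patch.
WAN22_I2V_BLOCKS = InsertableDict(
    [
        ("text_encoder", WanTextEncoderStep()),           # encodes the text prompt
        ("vae_encoder", WanImage2VideoVaeEncoderStep()),  # encodes the conditioning image
        ("denoise", Wan22Image2VideoCoreDenoiseStep()),   # denoises the latents
        ("decode", WanVaeDecoderStep()),                  # decodes latents to video frames
    ]
)


class MyWan22Image2VideoBlocks(SequentialPipelineBlocks):
    model_name = "wan22-i2v"

    block_classes = WAN22_I2V_BLOCKS.values()
    block_names = WAN22_I2V_BLOCKS.keys()

    @property
    def description(self):
        return "Modular pipeline for image-to-video using Wan2.2."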
diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py index 5ef170d3379f..006557c2c03e 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py @@ -45,6 +45,7 @@ # wan2.1 I2V (first frame only) +# auto_docstring class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks): model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanImageEncoderStep] @@ -56,6 +57,7 @@ def description(self): # wan2.1 FLF2V (first and last frame) +# auto_docstring class WanFLF2VImageEncoderStep(SequentialPipelineBlocks): model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameImageEncoderStep] @@ -67,6 +69,7 @@ def description(self): # wan2.1 Auto Image Encoder +# auto_docstring class WanAutoImageEncoderStep(AutoPipelineBlocks): block_classes = [WanFLF2VImageEncoderStep, WanImage2VideoImageEncoderStep] block_names = ["flf2v_image_encoder", "image2video_image_encoder"] @@ -90,6 +93,7 @@ def description(self): # wan2.1 I2V (first frame only) +# auto_docstring class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks): model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep] @@ -101,6 +105,7 @@ def description(self): # wan2.1 FLF2V (first and last frame) +# auto_docstring class WanFLF2VVaeEncoderStep(SequentialPipelineBlocks): model_name = "wan-i2v" block_classes = [ @@ -117,6 +122,7 @@ def description(self): # wan2.1 Auto Vae Encoder +# auto_docstring class WanAutoVaeEncoderStep(AutoPipelineBlocks): model_name = "wan-i2v" block_classes = [WanFLF2VVaeEncoderStep, WanImage2VideoVaeEncoderStep] @@ -141,6 +147,7 @@ def description(self): # wan2.1 I2V core denoise (support both I2V and FLF2V) # inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents) +# auto_docstring class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): model_name = "wan-i2v" block_classes = [ @@ -160,15 +167,7 @@ class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `WanImage2VideoDenoiseStep` is used to denoise the latents\n" - ) + return "denoise block that takes encoded text and image latent conditions and runs the denoising process." 
# ==================== diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py index fcfe12526c25..83ee2fcbddba 100644 --- a/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py +++ b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py @@ -53,14 +53,7 @@ class ZImageCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n" - + " - `ZImageSetTimestepsStep` is used to set the timesteps\n" - + " - `ZImageDenoiseStep` is used to denoise the latents\n" - ) + return "denoise block that takes encoded conditions and runs the denoising process." # image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise @@ -87,17 +80,7 @@ class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `ZImageAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n" - + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n" - + " - `ZImageSetTimestepsStep` is used to set the timesteps\n" - + " - `ZImageSetTimestepsWithStrengthStep` is used to set the timesteps with strength\n" - + " - `ZImagePrepareLatentswithImageStep` is used to prepare the latents with image\n" - + " - `ZImageDenoiseStep` is used to denoise the latents\n" - ) + return "denoise block that takes encoded text and image latent conditions and runs the denoising process." 
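# The image2image flow comment above adds two steps on top of the text2image core
# denoise: strength-based timestep adjustment and latent preparation from the encoded
# image. A sketch of that composition using the step classes named in the removed
# description; `MyZImageImg2ImgCoreDenoiseStep` and the block names are illustrative
# placeholders, and the exact block list of the real class is not shown in this hunk.
class MyZImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "z-image"

    block_classes = [
        ZImageTextInputStep,                 # adjusts the batch size of the model inputs
        ZImageAdditionalInputsStep,          # adjusts the batch size of the latent conditions
        ZImagePrepareLatentsStep,            # prepares the initial latents
        ZImageSetTimestepsStep,              # sets the timesteps
        ZImageSetTimestepsWithStrengthStep,  # sets the timesteps with `strength`
        ZImagePrepareLatentswithImageStep,   # prepares the latents with the image
        ZImageDenoiseStep,                   # denoises the latents
    ]
    block_names = [
        "text_inputs",
        "additional_inputs",
        "prepare_latents",
        "set_timesteps",
        "set_timesteps_with_strength",
        "prepare_latents_with_image",
        "denoise",
    ]

    @property
    def description(self):
        return "denoise block that takes encoded text and image latent conditions and runs the denoising process."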
# auto_docstring From fc5951b8f140412af7f4ad5c164e618a4e827201 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 10:11:05 +0000 Subject: [PATCH 48/58] add auto docstring --- .../flux/modular_blocks_flux.py | 393 ++++++++++++++++- .../flux/modular_blocks_flux_kontext.py | 395 +++++++++++++++++- .../flux2/modular_blocks_flux2.py | 200 ++++++++- .../flux2/modular_blocks_flux2_klein.py | 238 ++++++++++- .../flux2/modular_blocks_flux2_klein_base.py | 255 ++++++++++- .../modular_pipeline_utils.py | 4 + .../qwenimage/before_denoise.py | 28 +- .../modular_pipelines/qwenimage/decoders.py | 10 +- .../modular_pipelines/qwenimage/denoise.py | 32 +- .../modular_pipelines/qwenimage/encoders.py | 26 +- .../modular_pipelines/qwenimage/inputs.py | 4 +- .../qwenimage/modular_blocks_qwenimage.py | 46 +- .../modular_blocks_qwenimage_edit.py | 43 +- .../modular_blocks_qwenimage_edit_plus.py | 30 +- .../modular_blocks_qwenimage_layered.py | 20 +- .../modular_blocks_stable_diffusion_xl.py | 158 +++++++ .../wan/modular_blocks_wan.py | 89 ++++ .../wan/modular_blocks_wan22.py | 98 +++++ .../wan/modular_blocks_wan22_i2v.py | 130 ++++++ .../wan/modular_blocks_wan_i2v.py | 277 ++++++++++++ .../z_image/modular_blocks_z_image.py | 197 +++++++++ 21 files changed, 2497 insertions(+), 176 deletions(-) diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py index 6cb6b7811eb6..f2e78e933448 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py @@ -14,7 +14,7 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict +from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( FluxImg2ImgPrepareLatentsStep, FluxImg2ImgSetTimestepsStep, @@ -43,6 +43,31 @@ # auto_docstring class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + """ + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + resized_image (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + model_name = "flux" block_classes = [FluxProcessImagesInputStep(), FluxVaeEncoderStep()] @@ -55,6 +80,34 @@ def description(self) -> str: # auto_docstring class FluxAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block that works for img2img tasks. + - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided. - if `image` is not provided, + step will be skipped. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + resized_image (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. 
+ + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + model_name = "flux" block_classes = [FluxImg2ImgVaeEncoderStep] block_names = ["img2img"] @@ -73,6 +126,54 @@ def description(self): # before_denoise: text2img # auto_docstring class FluxBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepares the inputs for the denoise step in text-to-image generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + model_name = "flux" block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()] block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] @@ -85,6 +186,61 @@ def description(self): # before_denoise: img2img # auto_docstring class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step for img2img task. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_latents (`Tensor`): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. + prompt_embeds (`None`, *optional*): + TODO: Add description. 
+ + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + model_name = "flux" block_classes = [ FluxPrepareLatentsStep(), @@ -102,6 +258,64 @@ def description(self): # before_denoise: all task (text2img, img2img) # auto_docstring class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step. + This is an auto pipeline block that works for text2image. + - `FluxBeforeDenoiseStep` (text2image) is used. + - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`): + TODO: Add description. + width (`int`): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_latents (`Tensor`, *optional*): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + model_name = "flux" block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep] block_names = ["img2img", "text2image"] @@ -122,6 +336,38 @@ def description(self): # auto_docstring class FluxImg2ImgInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the img2img denoising step. It: + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. 
+ width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + + Outputs: + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + model_name = "flux" block_classes = [FluxTextInputStep(), FluxAdditionalInputsStep()] block_names = ["text_inputs", "additional_inputs"] @@ -135,6 +381,42 @@ def description(self): # auto_docstring class FluxAutoInputStep(AutoPipelineBlocks): + """ + Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, + and patchified. + This is an auto pipeline block that works for text2image/img2img tasks. + - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided. + - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided. + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + + Outputs: + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + model_name = "flux" block_classes = [FluxImg2ImgInputStep, FluxTextInputStep] @@ -153,6 +435,50 @@ def description(self): # auto_docstring class FluxCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core step that performs the denoising process for Flux. + This step supports text-to-image and image-to-image tasks for Flux: + - for image-to-image generation, you need to provide `image_latents` + - for text-to-image generation, all you need to provide is prompt embeddings. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. 
+ num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux" block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxDenoiseStep] block_names = ["input", "before_denoise", "denoise"] @@ -166,6 +492,12 @@ def description(self): + " - for text-to-image generation, all you need to provide is prompt embeddings." ) + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # Auto blocks (text2image and img2img) AUTO_BLOCKS = InsertableDict( @@ -180,6 +512,61 @@ def description(self): # auto_docstring class FluxAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image and image-to-image using Flux. + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `image`, `prompt` + + Components: + text_encoder (`CLIPTextModel`) tokenizer (`CLIPTokenizer`) text_encoder_2 (`T5EncoderModel`) tokenizer_2 + (`T5TokenizerFast`) image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + prompt_2 (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + resized_image (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + model_name = "flux" block_classes = AUTO_BLOCKS.values() @@ -193,3 +580,7 @@ class FluxAutoBlocks(SequentialPipelineBlocks): @property def description(self): return "Auto Modular pipeline for text-to-image and image-to-image using Flux." 
+ + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py index 5137131923ac..b5a5dbf78c0e 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py @@ -14,7 +14,7 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict +from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( FluxKontextRoPEInputsStep, FluxPrepareLatentsStep, @@ -41,6 +41,27 @@ # Flux Kontext vae encoder (run before before_denoise) # auto_docstring class FluxKontextVaeEncoderStep(SequentialPipelineBlocks): + """ + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + _auto_resize (`bool`, *optional*, defaults to True): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + model_name = "flux-kontext" block_classes = [FluxKontextProcessImagesInputStep(), FluxVaeEncoderStep(sample_mode="argmax")] @@ -53,6 +74,30 @@ def description(self) -> str: # auto_docstring class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block that works for image-conditioned tasks. + - `FluxKontextVaeEncoderStep` (image_conditioned) is used when only `image` is provided. - if `image` is not + provided, step will be skipped. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + _auto_resize (`bool`, *optional*, defaults to True): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + model_name = "flux-kontext" block_classes = [FluxKontextVaeEncoderStep] @@ -72,6 +117,54 @@ def description(self): # before_denoise: text2img # auto_docstring class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepares the inputs for the denoise step for Flux Kontext + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. 
+ guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + model_name = "flux-kontext" block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()] @@ -86,6 +179,59 @@ def description(self): # before_denoise: image-conditioned # auto_docstring class FluxKontextImageConditionedBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step for Flux Kontext + for image-conditioned tasks. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_height (`None`, *optional*): + TODO: Add description. + image_width (`None`, *optional*): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + model_name = "flux-kontext" block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxKontextRoPEInputsStep()] @@ -101,6 +247,62 @@ def description(self): # auto_docstring class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step. + This is an auto pipeline block that works for text2image. + - `FluxKontextBeforeDenoiseStep` (text2image) is used. + - `FluxKontextImageConditionedBeforeDenoiseStep` (image_conditioned) is used when only `image_latents` is + provided. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. 
+ generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_height (`None`, *optional*): + TODO: Add description. + image_width (`None`, *optional*): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + model_name = "flux-kontext" block_classes = [FluxKontextImageConditionedBeforeDenoiseStep, FluxKontextBeforeDenoiseStep] @@ -120,6 +322,46 @@ def description(self): # inputs: Flux Kontext # auto_docstring class FluxKontextInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the both text2img and img2img denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`). + - update height/width based `image_latents`, patchify `image_latents`. + + Inputs: + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + image_latents (`None`, *optional*): + TODO: Add description. + + Outputs: + height (`int`): + The height of the initial noisy latents + width (`int`): + The width of the initial noisy latents + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + model_name = "flux-kontext" block_classes = [FluxKontextSetResolutionStep(), FluxTextInputStep(), FluxKontextAdditionalInputsStep()] block_names = ["set_resolution", "text_inputs", "additional_inputs"] @@ -135,6 +377,48 @@ def description(self): # auto_docstring class FluxKontextAutoInputStep(AutoPipelineBlocks): + """ + Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, + and patchified. + This is an auto pipeline block that works for text2image/img2img tasks. 
+ - `FluxKontextInputStep` (image_conditioned) is used when `image_latents` is provided. + - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present. + + Inputs: + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + image_latents (`None`, *optional*): + TODO: Add description. + + Outputs: + height (`int`): + The height of the initial noisy latents + width (`int`): + The width of the initial noisy latents + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + model_name = "flux-kontext" block_classes = [FluxKontextInputStep, FluxTextInputStep] block_names = ["image_conditioned", "text2image"] @@ -152,6 +436,50 @@ def description(self): # auto_docstring class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core step that performs the denoising process for Flux Kontext. + This step supports text-to-image and image-conditioned tasks for Flux Kontext: + - for image-conditioned generation, you need to provide `image_latents` + - for text-to-image generation, all you need to provide is prompt embeddings. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux-kontext" block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextDenoiseStep] block_names = ["input", "before_denoise", "denoise"] @@ -165,6 +493,12 @@ def description(self): + " - for text-to-image generation, all you need to provide is prompt embeddings." 
) + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + AUTO_BLOCKS_KONTEXT = InsertableDict( [ @@ -178,6 +512,61 @@ def description(self): # auto_docstring class FluxKontextAutoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline for image-to-image using Flux Kontext. + + Supported workflows: + - `image_conditioned`: requires `image`, `prompt` + - `text2image`: requires `prompt` + + Components: + text_encoder (`CLIPTextModel`) tokenizer (`CLIPTokenizer`) text_encoder_2 (`T5EncoderModel`) tokenizer_2 + (`T5TokenizerFast`) image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + prompt_2 (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + _auto_resize (`bool`, *optional*, defaults to True): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + model_name = "flux-kontext" block_classes = AUTO_BLOCKS_KONTEXT.values() @@ -190,3 +579,7 @@ class FluxKontextAutoBlocks(SequentialPipelineBlocks): @property def description(self): return "Modular pipeline for image-to-image using Flux Kontext." + + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py index d2afed2b938b..b1033a7dff9e 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks @@ -44,6 +40,29 @@ # auto_docstring class Flux2VaeEncoderSequentialStep(SequentialPipelineBlocks): + """ + VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. 
+ image_latents (`list`): + List of latent representations for each reference image + """ + model_name = "flux2" block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] @@ -56,6 +75,32 @@ def description(self) -> str: # auto_docstring class Flux2AutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image inputs into their latent representations. + This is an auto pipeline block that works for image conditioning tasks. + - `Flux2VaeEncoderSequentialStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + block_classes = [Flux2VaeEncoderSequentialStep] block_names = ["img_conditioning"] block_trigger_inputs = ["image"] @@ -85,6 +130,45 @@ def description(self): # auto_docstring class Flux2CoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-dev. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 4.0): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latents (`Tensor`, *optional*): + Packed image latents for conditioning. Shape: (B, img_seq_len, C) + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2" block_classes = Flux2CoreDenoiseBlocks.values() @@ -97,11 +181,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] @@ -121,6 +201,43 @@ def outputs(self): # auto_docstring class Flux2ImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-dev with image conditioning. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + image_latents (`list`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. 
+ latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 4.0): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2" block_classes = Flux2ImageConditionedCoreDenoiseBlocks.values() @@ -133,11 +250,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] @@ -170,6 +283,59 @@ def description(self): # auto_docstring class Flux2AutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2. + + Supported workflows: + - `text2image`: requires `prompt` + - `image_conditioned`: requires `image`, `prompt` + + Components: + text_encoder (`Mistral3ForConditionalGeneration`) tokenizer (`AutoProcessor`) image_processor + (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`Flux2Transformer2DModel`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + text_encoder_out_layers (`tuple`, *optional*, defaults to (10, 20, 30)): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 4.0): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + model_name = "flux2" block_classes = AUTO_BLOCKS.values() @@ -186,9 +352,5 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py index 8a95cfd8bd42..5dbae43a5a7f 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks @@ -47,6 +43,29 @@ # auto_docstring class Flux2KleinVaeEncoderSequentialStep(SequentialPipelineBlocks): + """ + VAE encoder step that preprocesses and encodes the image inputs into their latent representations. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + model_name = "flux2-klein" block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] @@ -59,6 +78,32 @@ def description(self) -> str: # auto_docstring class Flux2KleinAutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image inputs into their latent representations. + This is an auto pipeline block that works for image conditioning tasks. + - `Flux2KleinVaeEncoderSequentialStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + model_name = "flux2-klein" block_classes = [Flux2KleinVaeEncoderSequentialStep] @@ -93,6 +138,44 @@ def description(self): # auto_docstring class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image + generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latents (`Tensor`, *optional*): + Packed image latents for conditioning. Shape: (B, img_seq_len, C) + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. 
+ """ + model_name = "flux2-klein" block_classes = Flux2KleinCoreDenoiseBlocks.values() @@ -105,11 +188,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] @@ -128,6 +207,41 @@ def outputs(self): # auto_docstring class Flux2KleinImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + image_latents (`list`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2-klein" block_classes = Flux2KleinImageConditionedCoreDenoiseBlocks.values() @@ -140,16 +254,52 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] # auto_docstring class Flux2KleinAutoCoreDenoiseStep(AutoPipelineBlocks): + """ + Auto core denoise step that performs the denoising process for Flux2-Klein. + This is an auto pipeline block that works for text-to-image and image-conditioned generation. + - `Flux2KleinCoreDenoiseStep` is used for text-to-image generation. + - `Flux2KleinImageConditionedCoreDenoiseStep` is used for image-conditioned generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + image_latents (`list`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. 
+ """ + model_name = "flux2-klein" block_classes = [Flux2KleinImageConditionedCoreDenoiseStep, Flux2KleinCoreDenoiseStep] block_names = ["image_conditioned", "text2image"] @@ -172,6 +322,60 @@ def description(self): # auto_docstring class Flux2KleinAutoBlocks(SequentialPipelineBlocks): + """ + Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein. + + Supported workflows: + - `text2image`: requires `prompt` + - `image_conditioned`: requires `image`, `prompt` + + Components: + text_encoder (`Qwen3ForCausalLM`) tokenizer (`Qwen2TokenizerFast`) image_processor (`Flux2ImageProcessor`) + vae (`AutoencoderKLFlux2`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`Flux2Transformer2DModel`) + + Configs: + is_distilled (default: True) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + text_encoder_out_layers (`tuple`, *optional*, defaults to (9, 18, 27)): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + model_name = "flux2-klein" block_classes = [ Flux2KleinTextEncoderStep(), @@ -192,9 +396,5 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py index 11ba36c84501..42e025c622b4 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks @@ -47,6 +43,29 @@ # auto_docstring class Flux2KleinBaseVaeEncoderSequentialStep(SequentialPipelineBlocks): + """ + VAE encoder step that preprocesses and encodes the image inputs into their latent representations. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. 
+ image_latents (`list`): + List of latent representations for each reference image + """ + model_name = "flux2" block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] @@ -59,6 +78,32 @@ def description(self) -> str: # auto_docstring class Flux2KleinBaseAutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image inputs into their latent representations. + This is an auto pipeline block that works for image conditioning tasks. + - `Flux2KleinBaseVaeEncoderSequentialStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + block_classes = [Flux2KleinBaseVaeEncoderSequentialStep] block_names = ["img_conditioning"] block_trigger_inputs = ["image"] @@ -91,6 +136,49 @@ def description(self): # auto_docstring class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider + (`ClassifierFreeGuidance`) + + Configs: + is_distilled (default: False) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latents (`Tensor`, *optional*): + Packed image latents for conditioning. Shape: (B, img_seq_len, C) + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2-klein" block_classes = Flux2KleinBaseCoreDenoiseBlocks.values() block_names = Flux2KleinBaseCoreDenoiseBlocks.keys() @@ -102,11 +190,7 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] @@ -125,6 +209,47 @@ def outputs(self): # auto_docstring class Flux2KleinBaseImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning. 
+ + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider + (`ClassifierFreeGuidance`) + + Configs: + is_distilled (default: False) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2-klein" block_classes = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.values() block_names = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.keys() @@ -136,16 +261,58 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] # auto_docstring class Flux2KleinBaseAutoCoreDenoiseStep(AutoPipelineBlocks): + """ + Auto core denoise step that performs the denoising process for Flux2-Klein (base model). + This is an auto pipeline block that works for text-to-image and image-conditioned generation. + - `Flux2KleinBaseCoreDenoiseStep` is used for text-to-image generation. + - `Flux2KleinBaseImageConditionedCoreDenoiseStep` is used for image-conditioned generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider + (`ClassifierFreeGuidance`) + + Configs: + is_distilled (default: False) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. 
+ """ + model_name = "flux2-klein" block_classes = [Flux2KleinBaseImageConditionedCoreDenoiseStep, Flux2KleinBaseCoreDenoiseStep] block_names = ["image_conditioned", "text2image"] @@ -168,6 +335,60 @@ def description(self): # auto_docstring class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): + """ + Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model). + + Supported workflows: + - `text2image`: requires `prompt` + - `image_conditioned`: requires `image`, `prompt` + + Components: + text_encoder (`Qwen3ForCausalLM`) tokenizer (`Qwen2TokenizerFast`) guider (`ClassifierFreeGuidance`) + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Configs: + is_distilled (default: False) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + text_encoder_out_layers (`tuple`, *optional*, defaults to (9, 18, 27)): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + model_name = "flux2-klein" block_classes = [ Flux2KleinBaseTextEncoderStep(), @@ -188,9 +409,5 @@ def description(self): @property def outputs(self): return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 9bc9d0bdefbd..8d90b5b375b4 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -504,6 +504,10 @@ class ConfigSpec: "type_hint": list[PIL.Image.Image], "description": "Generated images.", }, + "videos": { + "type_hint": list[PIL.Image.Image], + "description": "The generated videos.", + }, "latents": { "type_hint": torch.Tensor, "description": "Denoised latents.", diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index c4e14566a795..51b5c6ac8c3d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -558,7 +558,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): Inputs: num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. 
latents (`Tensor`): The initial random noised latents for the denoising process. Can be generated in prepare latents step. @@ -644,7 +644,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): Inputs: num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. @@ -725,7 +725,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): Inputs: num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. latents (`Tensor`): The latents to use for the denoising process. Can be generated in prepare latents step. @@ -842,7 +842,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the images latents, used for RoPE calculation """ @@ -917,7 +917,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the images latents, used for RoPE calculation """ @@ -995,9 +995,9 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): batch_size (`int`, *optional*, defaults to 1): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step. - image_height (`List`): + image_height (`list`): The heights of the reference images. Can be generated in input step. - image_width (`List`): + image_width (`list`): The widths of the reference images. Can be generated in input step. height (`int`): The height in pixels of the generated image. @@ -1009,11 +1009,11 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the image latents, used for RoPE calculation - txt_seq_lens (`List`): + txt_seq_lens (`list`): The sequence lengths of the prompt embeds, used for RoPE calculation - negative_txt_seq_lens (`List`): + negative_txt_seq_lens (`list`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ @@ -1123,11 +1123,11 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the image latents, used for RoPE calculation - txt_seq_lens (`List`): + txt_seq_lens (`list`): The sequence lengths of the prompt embeds, used for RoPE calculation - negative_txt_seq_lens (`List`): + negative_txt_seq_lens (`list`): The sequence lengths of the negative prompt embeds, used for RoPE calculation additional_t_cond (`Tensor`): The additional t cond, used for RoPE calculation @@ -1238,7 +1238,7 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
Outputs: - controlnet_keep (`List`): + controlnet_keep (`list`): The controlnet keep values """ diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 49183eed9cda..e4ccb6b8e047 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -191,7 +191,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks): step. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -268,7 +268,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ @@ -366,7 +366,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ @@ -436,12 +436,12 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): the generated image tensor from decoders step output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 6724612361aa..de8ea05c5047 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -518,11 +518,11 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): The number of denoising steps. latents (`Tensor`): The initial latents to use for the denoising process. Can be generated in prepare_latent step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. Outputs: @@ -576,11 +576,11 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): The number of denoising steps. latents (`Tensor`): The initial latents to use for the denoising process. Can be generated in prepare_latent step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. mask (`Tensor`): The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. @@ -645,13 +645,13 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) - controlnet_keep (`List`): + controlnet_keep (`list`): The controlnet keep values. 
Can be generated in prepare_controlnet_inputs step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. Outputs: @@ -711,13 +711,13 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) - controlnet_keep (`List`): + controlnet_keep (`list`): The controlnet keep values. Can be generated in prepare_controlnet_inputs step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. mask (`Tensor`): The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. @@ -787,11 +787,11 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): The initial latents to use for the denoising process. Can be generated in prepare_latent step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. Outputs: @@ -846,11 +846,11 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): The initial latents to use for the denoising process. Can be generated in prepare_latent step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. mask (`Tensor`): The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. @@ -910,11 +910,11 @@ class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): The initial latents to use for the denoising process. Can be generated in prepare_latent step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. Outputs: diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 6abcf7ce215a..527267dc0d6e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -285,11 +285,11 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): image_resize_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images """ @@ -359,13 +359,13 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): image_resize_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 Outputs: - resized_image (`List`): + resized_image (`list`): The resized images """ @@ -452,13 +452,13 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): image_resize_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. Outputs: - resized_image (`List`): + resized_image (`list`): Images resized to 1024x1024 target area for VAE encoding - resized_cond_image (`List`): + resized_cond_image (`list`): Images resized to 384x384 target area for VL text encoding """ @@ -1058,7 +1058,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): Inputs: mask_image (`Image`): Mask image for inpainting. - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -1072,7 +1072,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay """ @@ -1177,7 +1177,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay """ @@ -1256,7 +1256,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): image_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -1340,7 +1340,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): image_processor (`VaeImageProcessor`) Inputs: - resized_image (`List`): + resized_image (`list`): The resized image. should be generated using a resize step Outputs: @@ -1412,7 +1412,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): image_processor (`VaeImageProcessor`) Inputs: - resized_image (`List`): + resized_image (`list`): The resized image. 
should be generated using a resize step Outputs: diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index ebe53940a4e5..faec7db245df 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -496,9 +496,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: - image_height (`List`): + image_height (`list`): The image heights calculated from the image latents dimension - image_width (`List`): + image_width (`list`): The image widths calculated from the image latents dimension height (`int`): if not provided, updated to image height diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 44490c916d01..bf87028b2f90 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -119,7 +119,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Inputs: mask_image (`Image`): Mask image for inpainting. - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -135,7 +135,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): The latent representation of the input image. @@ -164,7 +164,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -476,9 +476,9 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -553,11 +553,11 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
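Note: the remaining hunks in this file are mechanical docstring cleanups: type labels move from the typing-module spelling (`List`, `Dict`, `Union[Image, List]`) to the lowercase builtin / pipe style (`list`, `dict`, `Image | list`) used by the generated docstrings. A small sketch of how a docstring formatter could derive that spelling from a parameter's `type_hint` follows; the real `auto_docstring` helper is not shown in this patch, so the function name here is illustrative.

import typing
from typing import Any, Dict, List, Optional, Union

def format_type_hint(type_hint) -> str:
    """Render a type hint the way these docstrings spell it: builtin
    containers in lowercase ('list', 'dict') and unions joined with '|'
    (e.g. 'Image | list', 'Tensor | NoneType')."""
    if type_hint is None:
        return "None"
    origin = typing.get_origin(type_hint)
    if origin is Union:
        return " | ".join(format_type_hint(arg) for arg in typing.get_args(type_hint))
    if origin is not None:
        # Parameterized generics such as List[float] or Dict[str, Any]
        # collapse to their runtime origin: list, dict, tuple, ...
        return origin.__name__
    return getattr(type_hint, "__name__", str(type_hint))

# Examples matching the replacements in this hunk:
print(format_type_hint(List[float]))     # -> list
print(format_type_hint(Dict[str, Any]))  # -> dict
print(format_type_hint(Optional[int]))   # -> int | NoneType

The last example matches strings such as `latents (Tensor | NoneType)` and `sigmas (list, *optional*)` seen throughout this diff.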
@@ -632,11 +632,11 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -712,7 +712,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. @@ -720,7 +720,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -802,7 +802,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. @@ -812,7 +812,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -894,7 +894,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. @@ -904,7 +904,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -1032,7 +1032,7 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. 
Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -1061,12 +1061,12 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -1138,7 +1138,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Maximum sequence length for prompt encoding. mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Union[Image, List]`, *optional*): + image (`Image | list`, *optional*): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -1164,9 +1164,9 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Pre-generated noisy latents for image generation. num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -1187,12 +1187,12 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Scale for ControlNet conditioning. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index af3d72c1d50c..2bb0d7e21226 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -67,7 +67,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -75,7 +75,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images prompt_embeds (`Tensor`): The prompt embeddings. @@ -115,13 +115,13 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. 
Outputs: - resized_image (`List`): + resized_image (`list`): The resized images processed_image (`Tensor`): The processed image @@ -156,7 +156,7 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. mask_image (`Image`): Mask image for inpainting. @@ -166,13 +166,13 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images processed_image (`Tensor`): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): The latent representation of the input image. @@ -450,9 +450,9 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -526,11 +526,11 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -627,7 +627,7 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -656,12 +656,12 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) 
""" @@ -718,6 +718,11 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + + Supported workflows: + - `edit`: requires `prompt`, `image` + - `edit_inpainting`: requires `prompt`, `mask_image`, `image` + Components: image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae @@ -725,7 +730,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -751,22 +756,22 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Pre-generated noisy latents for image generation. num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 56652c94c4b0..4a1f418d7b45 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -58,7 +58,7 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -66,9 +66,9 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): + resized_image (`list`): Images resized to 1024x1024 target area for VAE encoding - resized_cond_image (`List`): + resized_cond_image (`list`): Images resized to 384x384 target area for VL text encoding prompt_embeds (`Tensor`): The prompt embeddings. @@ -108,15 +108,15 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. 
Outputs: - resized_image (`List`): + resized_image (`list`): Images resized to 1024x1024 target area for VAE encoding - resized_cond_image (`List`): + resized_cond_image (`list`): Images resized to 384x384 target area for VL text encoding processed_image (`Tensor`): The processed image @@ -189,9 +189,9 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): The negative prompt embeddings. (batch-expanded) negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. (batch-expanded) - image_height (`List`): + image_height (`list`): The image heights calculated from the image latents dimension - image_width (`List`): + image_width (`list`): The image widths calculated from the image latents dimension height (`int`): if not provided, updated to image height @@ -253,9 +253,9 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -315,7 +315,7 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -357,7 +357,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): transformer (`QwenImageTransformer2DModel`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -375,9 +375,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Pre-generated noisy latents for image generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -385,7 +385,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 43cefa5eb658..a10454f1fb0c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -60,7 +60,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. 
resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 @@ -74,7 +74,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Maximum sequence length for prompt encoding. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images prompt (`str`): The prompt or prompts to guide image generation. If not provided, updated using image caption @@ -117,7 +117,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 @@ -125,7 +125,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images processed_image (`Tensor`): The processed image @@ -250,9 +250,9 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -317,7 +317,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 @@ -339,9 +339,9 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Number of layers to extract from the image num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -349,7 +349,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. 
""" diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py index 8034e69e2394..a7a18e514777 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( StableDiffusionXLControlNetInputStep, StableDiffusionXLControlNetUnionInputStep, @@ -278,6 +279,159 @@ def description(self): # ip-adapter, controlnet, text2img, img2img, inpainting # auto_docstring class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion + XL. + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `image`, `prompt` + - `inpainting`: requires `mask_image`, `image`, `prompt` + - `controlnet_text2image`: requires `control_image`, `prompt` + - `controlnet_image2image`: requires `control_image`, `image`, `prompt` + - `controlnet_inpainting`: requires `control_image`, `mask_image`, `image`, `prompt` + - `controlnet_union_text2image`: requires `control_image`, `control_mode`, `prompt` + - `controlnet_union_image2image`: requires `control_image`, `control_mode`, `image`, `prompt` + - `controlnet_union_inpainting`: requires `control_image`, `control_mode`, `mask_image`, `image`, `prompt` + - `ip_adapter_text2image`: requires `ip_adapter_image`, `prompt` + - `ip_adapter_image2image`: requires `ip_adapter_image`, `image`, `prompt` + - `ip_adapter_inpainting`: requires `ip_adapter_image`, `mask_image`, `image`, `prompt` + - `ip_adapter_controlnet_text2image`: requires `ip_adapter_image`, `control_image`, `prompt` + - `ip_adapter_controlnet_image2image`: requires `ip_adapter_image`, `control_image`, `image`, `prompt` + - `ip_adapter_controlnet_inpainting`: requires `ip_adapter_image`, `control_image`, `mask_image`, `image`, + `prompt` + - `ip_adapter_controlnet_union_text2image`: requires `ip_adapter_image`, `control_image`, `control_mode`, + `prompt` + - `ip_adapter_controlnet_union_image2image`: requires `ip_adapter_image`, `control_image`, `control_mode`, + `image`, `prompt` + - `ip_adapter_controlnet_union_inpainting`: requires `ip_adapter_image`, `control_image`, `control_mode`, + `mask_image`, `image`, `prompt` + + Components: + text_encoder (`CLIPTextModel`) text_encoder_2 (`CLIPTextModelWithProjection`) tokenizer (`CLIPTokenizer`) + tokenizer_2 (`CLIPTokenizer`) guider (`ClassifierFreeGuidance`) image_encoder + (`CLIPVisionModelWithProjection`) feature_extractor (`CLIPImageProcessor`) unet (`UNet2DConditionModel`) vae + (`AutoencoderKL`) image_processor (`VaeImageProcessor`) mask_processor (`VaeImageProcessor`) scheduler + (`EulerDiscreteScheduler`) controlnet (`ControlNetUnionModel`) control_image_processor (`VaeImageProcessor`) + + Configs: + force_zeros_for_empty_prompt (default: True) requires_aesthetics_score (default: False) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + prompt_2 (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + negative_prompt_2 (`None`, *optional*): + TODO: Add description. 
+ cross_attention_kwargs (`None`, *optional*): + TODO: Add description. + clip_skip (`None`, *optional*): + TODO: Add description. + ip_adapter_image (`Image | ndarray | Tensor | list | list | list`, *optional*): + The image(s) to be used as ip adapter + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + mask_image (`None`, *optional*): + TODO: Add description. + padding_mask_crop (`None`, *optional*): + TODO: Add description. + dtype (`dtype`, *optional*): + The dtype of the model inputs + generator (`None`, *optional*): + TODO: Add description. + preprocess_kwargs (`dict | NoneType`, *optional*): + A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under + `self.image_processor` in [diffusers.image_processor.VaeImageProcessor] + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + ip_adapter_embeds (`list`, *optional*): + Pre-generated image embeddings for IP-Adapter. Can be generated from ip_adapter step. + negative_ip_adapter_embeds (`list`, *optional*): + Pre-generated negative image embeddings for IP-Adapter. Can be generated from ip_adapter step. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + denoising_end (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.3): + TODO: Add description. + denoising_start (`None`, *optional*): + TODO: Add description. + latents (`None`): + TODO: Add description. + image_latents (`Tensor`, *optional*): + The latents representing the reference image for image-to-image/inpainting generation. Can be generated + in vae_encode step. + mask (`Tensor`, *optional*): + The mask for the inpainting generation. Can be generated in vae_encode step. + masked_image_latents (`Tensor`, *optional*): + The masked image latents for the inpainting generation (only for inpainting-specific unet). Can be + generated in vae_encode step. + original_size (`None`, *optional*): + TODO: Add description. + target_size (`None`, *optional*): + TODO: Add description. + negative_original_size (`None`, *optional*): + TODO: Add description. + negative_target_size (`None`, *optional*): + TODO: Add description. + crops_coords_top_left (`None`, *optional*, defaults to (0, 0)): + TODO: Add description. + negative_crops_coords_top_left (`None`, *optional*, defaults to (0, 0)): + TODO: Add description. + aesthetic_score (`None`, *optional*, defaults to 6.0): + TODO: Add description. + negative_aesthetic_score (`None`, *optional*, defaults to 2.0): + TODO: Add description. + control_image (`None`, *optional*): + TODO: Add description. + control_mode (`None`, *optional*): + TODO: Add description. + control_guidance_start (`None`, *optional*, defaults to 0.0): + TODO: Add description. + control_guidance_end (`None`, *optional*, defaults to 1.0): + TODO: Add description. + controlnet_conditioning_scale (`None`, *optional*, defaults to 1.0): + TODO: Add description. + guess_mode (`None`, *optional*, defaults to False): + TODO: Add description. + crops_coords (`tuple | NoneType`, *optional*): + The crop coordinates to use for preprocess/postprocess the image and mask, for inpainting task only. Can + be generated in vae_encode step. + controlnet_cond (`Tensor`, *optional*): + The control image to use for the denoising process. 
Can be generated in prepare_controlnet_inputs step. + conditioning_scale (`float`, *optional*): + The controlnet conditioning scale value to use for the denoising process. Can be generated in + prepare_controlnet_inputs step. + controlnet_keep (`list`, *optional*): + The controlnet keep values to use for the denoising process. Can be generated in + prepare_controlnet_inputs step. + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs that need to be prepared with guider. It should contain + prompt_embeds/negative_prompt_embeds, add_time_ids/negative_add_time_ids, + pooled_prompt_embeds/negative_pooled_prompt_embeds, and ip_adapter_embeds/negative_ip_adapter_embeds + (optional).please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when + they are created and added to the pipeline state + eta (`None`, *optional*, defaults to 0.0): + TODO: Add description. + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + block_classes = [ StableDiffusionXLTextEncoderStep, StableDiffusionXLAutoIPAdapterStep, @@ -352,3 +506,7 @@ class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): @property def description(self): return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL." + + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py index e4bd483cec26..b641c6cd7fcc 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanPrepareLatentsStep, WanSetTimestepsStep, @@ -39,6 +40,43 @@ # inputs(text) -> set_timesteps -> prepare_latents -> denoise # auto_docstring class WanCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "wan" block_classes = [ WanTextInputStep, @@ -52,6 +90,10 @@ class WanCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "denoise block that takes encoded conditions and runs the denoising process." 
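    # Editor's note (illustrative, not part of the diff): the `outputs` overrides added from here on
    # all follow one pattern -- a composed SequentialPipelineBlocks narrows what it advertises to a
    # single templated OutputParam. `OutputParam.template(name)` is assumed to resolve one of the
    # predefined specs ("latents", "images", "videos") referenced throughout this series, e.g.:
    #
    #     @property
    #     def outputs(self):
    #         # expose only the final output of the composed step
    #         return [OutputParam.template("latents")]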
+ @property + def outputs(self): + return [OutputParam.template("latents")] + # ==================== # 2. BLOCKS (Wan2.1 text2video) @@ -60,6 +102,49 @@ def description(self): # auto_docstring class WanBlocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for Wan2.1. + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer + (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) vae (`AutoencoderKLWan`) video_processor + (`VideoProcessor`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan" block_classes = [ WanTextEncoderStep, @@ -71,3 +156,7 @@ class WanBlocks(SequentialPipelineBlocks): @property def description(self): return "Modular pipeline blocks for Wan2.1." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py index d42db8eab485..9f602c24713b 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanPrepareLatentsStep, WanSetTimestepsStep, @@ -40,6 +41,48 @@ # auto_docstring class Wan22CoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. 
+ num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "wan" block_classes = [ WanTextInputStep, @@ -53,6 +96,10 @@ class Wan22CoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "denoise block that takes encoded conditions and runs the denoising process." + @property + def outputs(self): + return [OutputParam.template("latents")] + # ==================== # 2. BLOCKS (Wan2.2 text2video) @@ -61,6 +108,53 @@ def description(self): # auto_docstring class Wan22Blocks(SequentialPipelineBlocks): + """ + Modular pipeline for text-to-video using Wan2.2. + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer + (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider_2 (`ClassifierFreeGuidance`) + transformer_2 (`WanTransformer3DModel`) vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan" block_classes = [ WanTextEncoderStep, @@ -76,3 +170,7 @@ class Wan22Blocks(SequentialPipelineBlocks): @property def description(self): return "Modular pipeline for text-to-video using Wan2.2." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py index b32e65a08e76..8e55b7a50f08 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanAdditionalInputsStep, WanPrepareLatentsStep, @@ -42,6 +43,34 @@ # auto_docstring class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks): + """ + Image2Video Vae Image Encoder step that resize the image and encode the first frame image to its latent + representation + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`): + TODO: Add description. 
+ height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + first_frame_latents (`Tensor`): + video latent representation with the first frame image condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep] block_names = ["image_resize", "vae_encoder", "prepare_first_frame_latents"] @@ -59,6 +88,50 @@ def description(self): # inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents) # auto_docstring class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded text and image latent conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + num_frames (`None`, *optional*): + TODO: Add description. + image_condition_latents (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "wan-i2v" block_classes = [ WanTextInputStep, @@ -79,6 +152,10 @@ class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "denoise block that takes encoded text and image latent conditions and runs the denoising process." + @property + def outputs(self): + return [OutputParam.template("latents")] + # ==================== # 3. BLOCKS (Wan2.2 Image2Video) @@ -87,6 +164,55 @@ def description(self): # auto_docstring class Wan22Image2VideoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline for image-to-video using Wan2.2. + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) vae + (`AutoencoderKLWan`) video_processor (`VideoProcessor`) transformer (`WanTransformer3DModel`) scheduler + (`UniPCMultistepScheduler`) guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. 
+ negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan-i2v" block_classes = [ WanTextEncoderStep, @@ -104,3 +230,7 @@ class Wan22Image2VideoBlocks(SequentialPipelineBlocks): @property def description(self): return "Modular pipeline for image-to-video using Wan2.2." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py index 006557c2c03e..c08db62c469a 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanAdditionalInputsStep, WanPrepareLatentsStep, @@ -47,6 +48,27 @@ # wan2.1 I2V (first frame only) # auto_docstring class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks): + """ + Image2Video Image Encoder step that resize the image and encode the image to generate the image embeddings + + Components: + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + image_embeds (`Tensor`): + The image embeddings + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanImageEncoderStep] block_names = ["image_resize", "image_encoder"] @@ -59,6 +81,32 @@ def description(self): # wan2.1 FLF2V (first and last frame) # auto_docstring class WanFLF2VImageEncoderStep(SequentialPipelineBlocks): + """ + FLF2V Image Encoder step that resize and encode and encode the first and last frame images to generate the image + embeddings + + Components: + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`): + The last frameimage + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. 
+ image_embeds (`Tensor`): + The image embeddings + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameImageEncoderStep] block_names = ["image_resize", "last_image_resize", "image_encoder"] @@ -71,6 +119,34 @@ def description(self): # wan2.1 Auto Image Encoder # auto_docstring class WanAutoImageEncoderStep(AutoPipelineBlocks): + """ + Image Encoder step that encode the image to generate the image embeddingsThis is an auto pipeline block that works + for image2video tasks. - `WanFLF2VImageEncoderStep` (flf2v) is used when `last_image` is provided. - + `WanImage2VideoImageEncoderStep` (image2video) is used when `image` is provided. - if `last_image` or `image` is + not provided, step will be skipped. + + Components: + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) + + Inputs: + image (`Image`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`, *optional*): + The last frameimage + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + image_embeds (`Tensor`): + The image embeddings + """ + block_classes = [WanFLF2VImageEncoderStep, WanImage2VideoImageEncoderStep] block_names = ["flf2v_image_encoder", "image2video_image_encoder"] block_trigger_inputs = ["last_image", "image"] @@ -95,6 +171,34 @@ def description(self): # wan2.1 I2V (first frame only) # auto_docstring class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks): + """ + Image2Video Vae Image Encoder step that resize the image and encode the first frame image to its latent + representation + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + first_frame_latents (`Tensor`): + video latent representation with the first frame image condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep] block_names = ["image_resize", "vae_encoder", "prepare_first_frame_latents"] @@ -107,6 +211,38 @@ def description(self): # wan2.1 FLF2V (first and last frame) # auto_docstring class WanFLF2VVaeEncoderStep(SequentialPipelineBlocks): + """ + FLF2V Vae Image Encoder step that resize and encode and encode the first and last frame images to generate the + latent conditions + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`): + The last frameimage + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. 
+ first_last_frame_latents (`Tensor`): + video latent representation with the first and last frame images condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + """ + model_name = "wan-i2v" block_classes = [ WanImageResizeStep, @@ -124,6 +260,42 @@ def description(self): # wan2.1 Auto Vae Encoder # auto_docstring class WanAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Vae Image Encoder step that encode the image to generate the image latentsThis is an auto pipeline block that works + for image2video tasks. - `WanFLF2VVaeEncoderStep` (flf2v) is used when `last_image` is provided. - + `WanImage2VideoVaeEncoderStep` (image2video) is used when `image` is provided. - if `last_image` or `image` is not + provided, step will be skipped. + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`, *optional*): + The last frameimage + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + first_last_frame_latents (`Tensor`): + video latent representation with the first and last frame images condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + first_frame_latents (`Tensor`): + video latent representation with the first frame image condition + """ + model_name = "wan-i2v" block_classes = [WanFLF2VVaeEncoderStep, WanImage2VideoVaeEncoderStep] block_names = ["flf2v_vae_encoder", "image2video_vae_encoder"] @@ -149,6 +321,51 @@ def description(self): # inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents) # auto_docstring class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded text and image latent conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + num_frames (`None`, *optional*): + TODO: Add description. + image_condition_latents (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + image_embeds (`Tensor`): + TODO: Add description. 
+ + Outputs: + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `transformer.dtype`) + latents (`Tensor`): + The initial latents to use for the denoising process + """ + model_name = "wan-i2v" block_classes = [ WanTextInputStep, @@ -178,6 +395,62 @@ def description(self): # wan2.1 Image2Video Auto Blocks # auto_docstring class WanImage2VideoAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for image-to-video using Wan. + + Supported workflows: + - `image2video`: requires `image`, `prompt` + - `flf2v`: requires `last_image`, `image`, `prompt` + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) vae (`AutoencoderKLWan`) + video_processor (`VideoProcessor`) transformer (`WanTransformer3DModel`) scheduler + (`UniPCMultistepScheduler`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + image (`Image`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`, *optional*): + The last frameimage + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_condition_latents (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + image_embeds (`Tensor`): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan-i2v" block_classes = [ WanTextEncoderStep, @@ -202,3 +475,7 @@ class WanImage2VideoAutoBlocks(SequentialPipelineBlocks): @property def description(self): return "Auto Modular pipeline for image-to-video using Wan." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py index 83ee2fcbddba..23e20d55fb1e 100644 --- a/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py +++ b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( ZImageAdditionalInputsStep, ZImagePrepareLatentsStep, @@ -43,6 +44,40 @@ # text2image: inputs(text) -> set_timesteps -> prepare_latents -> denoise # auto_docstring class ZImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded conditions and runs the denoising process. 
+ + Components: + transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`list`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`list`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 9): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + **denoiser_input_fields (`None`, *optional*): + The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + block_classes = [ ZImageTextInputStep, ZImagePrepareLatentsStep, @@ -55,10 +90,52 @@ class ZImageCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "denoise block that takes encoded conditions and runs the denoising process." + @property + def outputs(self): + return [OutputParam.template("latents")] + # image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise # auto_docstring class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded text and image latent conditions and runs the denoising process. + + Components: + transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`list`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`list`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 9): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + **denoiser_input_fields (`None`, *optional*): + The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + block_classes = [ ZImageTextInputStep, ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]), @@ -82,9 +159,55 @@ class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "denoise block that takes encoded text and image latent conditions and runs the denoising process." + @property + def outputs(self): + return [OutputParam.template("latents")] + # auto_docstring class ZImageAutoDenoiseStep(AutoPipelineBlocks): + """ + Denoise step that iteratively denoise the latents. This is a auto pipeline block that works for text2image and + image2image tasks. 
- `ZImageCoreDenoiseStep` (text2image) for text2image tasks. - + `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks. - if `image_latents` is provided, + `ZImageImage2ImageCoreDenoiseStep` will be used. + - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used. + + Components: + transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`list`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`list`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + **denoiser_input_fields (`None`, *optional*): + The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + block_classes = [ ZImageImage2ImageCoreDenoiseStep, ZImageCoreDenoiseStep, @@ -106,6 +229,27 @@ def description(self) -> str: # auto_docstring class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks): + """ + Vae Image Encoder step that encode the image to generate the image latents + + Components: + vae (`AutoencoderKL`) image_processor (`VaeImageProcessor`) + + Inputs: + image (`Image`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + image_latents (`Tensor`): + video latent representation with the first frame image condition + """ + block_classes = [ZImageVaeImageEncoderStep] block_names = ["vae_encoder"] block_trigger_inputs = ["image"] @@ -120,6 +264,55 @@ def description(self) -> str: # auto_docstring class ZImageAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image and image-to-image using ZImage. + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `image`, `prompt` + + Components: + text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) guider (`ClassifierFreeGuidance`) vae + (`AutoencoderKL`) image_processor (`VaeImageProcessor`) transformer (`ZImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + image (`Image`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. 
+ sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + **denoiser_input_fields (`None`, *optional*): + The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + output_type (`str`, *optional*, defaults to pil): + The type of the output images, can be 'pil', 'np', 'pt' + + Outputs: + images (`list`): + Generated images. + """ + block_classes = [ ZImageTextEncoderStep, ZImageAutoVaeImageEncoderStep, @@ -135,3 +328,7 @@ class ZImageAutoBlocks(SequentialPipelineBlocks): @property def description(self) -> str: return "Auto Modular pipeline for text-to-image and image-to-image using ZImage." + + @property + def outputs(self): + return [OutputParam.template("images")] From 48fa52d21122caa4786736d5bf6bdd86086afac7 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 10:16:21 +0000 Subject: [PATCH 49/58] workflow_names -> available_workflows --- src/diffusers/modular_pipelines/modular_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 572bd0af46e3..098991dd8e6c 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -359,11 +359,11 @@ def get_execution_blocks(self, **kwargs): # currently only SequentialPipelineBlocks support workflows @property - def workflow_names(self): + def available_workflows(self): """ Returns a list of available workflow names. Must be implemented by subclasses that define `_workflow_map`. """ - raise NotImplementedError(f"`workflow_names` is not implemented for {self.__class__.__name__}") + raise NotImplementedError(f"`available_workflows` is not implemented for {self.__class__.__name__}") def get_workflow(self, workflow_name: str): """ @@ -939,7 +939,7 @@ def expected_configs(self): return expected_configs @property - def workflow_names(self): + def available_workflows(self): if self._workflow_map is None: raise NotImplementedError( f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}" From ecb00b9d6a43df0d1e00322720ad5f3d5e3245aa Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 11:07:02 +0000 Subject: [PATCH 50/58] fix workflow test for klein base --- .../test_modular_pipeline_flux2_klein_base.py | 43 ++++++++++++++++--- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py index 701dd0fed896..b3aa79040317 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py @@ -21,20 +21,35 @@ from diffusers.modular_pipelines import ( Flux2KleinBaseAutoBlocks, - Flux2KleinModularPipeline, + Flux2KleinBaseModularPipeline, ) from ...testing_utils import floats_tensor, torch_device from ..test_modular_pipelines_common import ModularPipelineTesterMixin -class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): - pipeline_class = Flux2KleinModularPipeline +FLUX2_KLEIN_BASE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "Flux2KleinBaseTextEncoderStep"), + ("denoise.input", "Flux2KleinBaseTextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", 
"Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2KleinBaseRoPEInputsStep"), + ("denoise.denoise", "Flux2KleinBaseDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinBaseModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinBaseModularPipeline pipeline_blocks_class = Flux2KleinBaseAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular" params = frozenset(["prompt", "height", "width"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX2_KLEIN_BASE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -55,13 +70,31 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) -class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): - pipeline_class = Flux2KleinModularPipeline +FLUX2_KLEIN_BASE_IMAGE_CONDITIONED_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder", "Flux2KleinBaseTextEncoderStep"), + ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"), + ("vae_encoder.encode", "Flux2VaeEncoderStep"), + ("denoise.input", "Flux2KleinBaseTextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2KleinBaseRoPEInputsStep"), + ("denoise.denoise", "Flux2KleinBaseDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinBaseImageConditionedModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinBaseModularPipeline pipeline_blocks_class = Flux2KleinBaseAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular" params = frozenset(["prompt", "height", "width", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX2_KLEIN_BASE_IMAGE_CONDITIONED_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) From 3e9ec6f42030c69517e389c461312c5174418d83 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Sat, 14 Feb 2026 01:30:36 -1000 Subject: [PATCH 51/58] Apply suggestions from code review Co-authored-by: Dhruv Nair --- tests/modular_pipelines/test_modular_pipelines_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py index f128b0bd3bfa..e97b543ff85d 100644 --- a/tests/modular_pipelines/test_modular_pipelines_common.py +++ b/tests/modular_pipelines/test_modular_pipelines_common.py @@ -371,6 +371,7 @@ def test_workflow_map(self): for i, ((actual_name, actual_block), (expected_name, expected_class_name)) in enumerate( zip(actual_blocks, expected_blocks) ): + assert actual_name == expected_name assert actual_block.__class__.__name__ == expected_class_name, ( f"Workflow '{workflow_name}': block '{actual_name}' has type " f"{actual_block.__class__.__name__}, expected {expected_class_name}" From f41441a8c1e9e999f5477be05b30ffe72cedc748 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 16:39:24 +0000 Subject: [PATCH 52/58] fix workflow tests --- .../flux/test_modular_pipeline_flux.py | 24 ++-- .../flux2/test_modular_pipeline_flux2.py | 34 +++--- ...st_modular_pipeline_stable_diffusion_xl.py | 112 +++++++++--------- 
.../z_image/test_modular_pipeline_z_image.py | 22 ++-- 4 files changed, 96 insertions(+), 96 deletions(-) diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py index f05b9202eba5..9a6b4b9b6fb4 100644 --- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py +++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py @@ -36,11 +36,11 @@ FLUX_TEXT2IMAGE_WORKFLOWS = { "text2image": [ ("text_encoder", "FluxTextEncoderStep"), - ("input", "FluxTextInputStep"), - ("prepare_latents", "FluxPrepareLatentsStep"), - ("set_timesteps", "FluxSetTimestepsStep"), - ("prepare_rope_inputs", "FluxRoPEInputsStep"), - ("denoise", "FluxDenoiseStep"), + ("denoise.input", "FluxTextInputStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxDenoiseStep"), ("decode", "FluxDecodeStep"), ] } @@ -78,13 +78,13 @@ def test_float16_inference(self): ("text_encoder", "FluxTextEncoderStep"), ("vae_encoder.preprocess", "FluxProcessImagesInputStep"), ("vae_encoder.encode", "FluxVaeEncoderStep"), - ("input", "FluxTextInputStep"), - ("additional_inputs", "FluxAdditionalInputsStep"), - ("prepare_latents", "FluxPrepareLatentsStep"), - ("set_timesteps", "FluxImg2ImgSetTimestepsStep"), - ("prepare_img2img_latents", "FluxImg2ImgPrepareLatentsStep"), - ("prepare_rope_inputs", "FluxRoPEInputsStep"), - ("denoise", "FluxDenoiseStep"), + ("denoise.input.text_inputs", "FluxTextInputStep"), + ("denoise.input.additional_inputs", "FluxAdditionalInputsStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_img2img_latents", "FluxImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxDenoiseStep"), ("decode", "FluxDecodeStep"), ] } diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py index 084f6b3b35b9..3045af636841 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py @@ -31,13 +31,13 @@ FLUX2_TEXT2IMAGE_WORKFLOWS = { "text2image": [ ("text_encoder", "Flux2TextEncoderStep"), - ("text_input", "Flux2TextInputStep"), - ("prepare_latents", "Flux2PrepareLatentsStep"), - ("set_timesteps", "Flux2SetTimestepsStep"), - ("prepare_guidance", "Flux2PrepareGuidanceStep"), - ("prepare_rope_inputs", "Flux2RoPEInputsStep"), - ("denoise", "Flux2DenoiseStep"), - ("after_denoise", "Flux2UnpackLatentsStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_guidance", "Flux2PrepareGuidanceStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2DenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), ("decode", "Flux2DecodeStep"), ], } @@ -75,16 +75,16 @@ def test_float16_inference(self): FLUX2_IMAGE_CONDITIONED_WORKFLOWS = { "image_conditioned": [ ("text_encoder", "Flux2TextEncoderStep"), - ("preprocess_images", "Flux2ProcessImagesInputStep"), - ("vae_encoder", "Flux2VaeEncoderStep"), - ("text_input", "Flux2TextInputStep"), - 
("prepare_image_latents", "Flux2PrepareImageLatentsStep"), - ("prepare_latents", "Flux2PrepareLatentsStep"), - ("set_timesteps", "Flux2SetTimestepsStep"), - ("prepare_guidance", "Flux2PrepareGuidanceStep"), - ("prepare_rope_inputs", "Flux2RoPEInputsStep"), - ("denoise", "Flux2DenoiseStep"), - ("after_denoise", "Flux2UnpackLatentsStep"), + ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"), + ("vae_encoder.encode", "Flux2VaeEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_guidance", "Flux2PrepareGuidanceStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2DenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), ("decode", "Flux2DecodeStep"), ], } diff --git a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py index ffd71ca5a8d0..f640f0ec83f2 100644 --- a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py +++ b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py @@ -270,52 +270,52 @@ def test_controlnet_cfg(self): TEXT2IMAGE_WORKFLOWS = { "text2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), - ("denoise", "StableDiffusionXLDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], "controlnet_text2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), - ("controlnet_input", "StableDiffusionXLControlNetInputStep"), - ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], "controlnet_union_text2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), - ("controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), - 
("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], "ip_adapter_text2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), ("ip_adapter", "StableDiffusionXLIPAdapterStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), - ("denoise", "StableDiffusionXLDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], "ip_adapter_controlnet_text2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), ("ip_adapter", "StableDiffusionXLIPAdapterStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", "StableDiffusionXLSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), - ("controlnet_input", "StableDiffusionXLControlNetInputStep"), - ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], } @@ -374,56 +374,56 @@ def test_inference_batch_single_identical(self): "image2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), - ("denoise", "StableDiffusionXLDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], "controlnet_image2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", 
"StableDiffusionXLImg2ImgSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), - ("controlnet_input", "StableDiffusionXLControlNetInputStep"), - ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], "controlnet_union_image2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), - ("controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), - ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], "ip_adapter_image2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), ("ip_adapter", "StableDiffusionXLIPAdapterStep"), ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), - ("denoise", "StableDiffusionXLDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], "ip_adapter_controlnet_image2image": [ ("text_encoder", "StableDiffusionXLTextEncoderStep"), ("ip_adapter", "StableDiffusionXLIPAdapterStep"), ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), - ("input", "StableDiffusionXLInputStep"), - ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), - ("prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), - ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), - ("controlnet_input", "StableDiffusionXLControlNetInputStep"), - ("denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + 
("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), ("decode", "StableDiffusionXLDecodeStep"), ], } diff --git a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py index 15997931c8d4..ab45def3ef30 100644 --- a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py +++ b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py @@ -22,22 +22,22 @@ ZIMAGE_WORKFLOWS = { "text2image": [ ("text_encoder", "ZImageTextEncoderStep"), - ("input", "ZImageTextInputStep"), - ("prepare_latents", "ZImagePrepareLatentsStep"), - ("set_timesteps", "ZImageSetTimestepsStep"), - ("denoise", "ZImageDenoiseStep"), + ("denoise.input", "ZImageTextInputStep"), + ("denoise.prepare_latents", "ZImagePrepareLatentsStep"), + ("denoise.set_timesteps", "ZImageSetTimestepsStep"), + ("denoise.denoise", "ZImageDenoiseStep"), ("decode", "ZImageVaeDecoderStep"), ], "image2image": [ ("text_encoder", "ZImageTextEncoderStep"), ("vae_encoder", "ZImageVaeImageEncoderStep"), - ("input", "ZImageTextInputStep"), - ("additional_inputs", "ZImageAdditionalInputsStep"), - ("prepare_latents", "ZImagePrepareLatentsStep"), - ("set_timesteps", "ZImageSetTimestepsStep"), - ("set_timesteps_with_strength", "ZImageSetTimestepsWithStrengthStep"), - ("prepare_latents_with_image", "ZImagePrepareLatentswithImageStep"), - ("denoise", "ZImageDenoiseStep"), + ("denoise.input", "ZImageTextInputStep"), + ("denoise.additional_inputs", "ZImageAdditionalInputsStep"), + ("denoise.prepare_latents", "ZImagePrepareLatentsStep"), + ("denoise.set_timesteps", "ZImageSetTimestepsStep"), + ("denoise.set_timesteps_with_strength", "ZImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_latents_with_image", "ZImagePrepareLatentswithImageStep"), + ("denoise.denoise", "ZImageDenoiseStep"), ("decode", "ZImageVaeDecoderStep"), ], } From a6670ce84af0a12e3d685756b5820a391d2cc6e0 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 16:40:12 +0000 Subject: [PATCH 53/58] qwen: edit -> image_conditioned to be consistent with flux kontext/2 such --- .../qwenimage/modular_blocks_qwenimage_edit.py | 4 ++-- .../modular_pipelines/qwen/test_modular_pipeline_qwenimage.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 2bb0d7e21226..1d8859638012 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -779,8 +779,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() _workflow_map = { - "edit": {"prompt": True, "image": True}, - "edit_inpainting": {"prompt": True, "mask_image": True, "image": True}, + "image_conditioned": {"prompt": True, "image": True}, + "image_conditioned_inpainting": {"prompt": True, "mask_image": True, "image": True}, } @property diff --git a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py index 
1b4a07526639..92573c202e49 100644 --- a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py +++ b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py @@ -155,7 +155,7 @@ def test_inference_batch_single_identical(self): QWEN_IMAGE_EDIT_WORKFLOWS = { - "edit": [ + "image_conditioned": [ ("text_encoder.resize", "QwenImageEditResizeStep"), ("text_encoder.encode", "QwenImageEditTextEncoderStep"), ("vae_encoder.resize", "QwenImageEditResizeStep"), @@ -171,7 +171,7 @@ def test_inference_batch_single_identical(self): ("decode.decode", "QwenImageDecoderStep"), ("decode.postprocess", "QwenImageProcessImagesOutputStep"), ], - "edit_inpainting": [ + "image_conditioned_inpainting": [ ("text_encoder.resize", "QwenImageEditResizeStep"), ("text_encoder.encode", "QwenImageEditTextEncoderStep"), ("vae_encoder.resize", "QwenImageEditResizeStep"), From 2007559b477578e98c55989dbc46a45a022d0047 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 22:08:03 +0000 Subject: [PATCH 54/58] remove Optional --- src/diffusers/modular_pipelines/modular_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 02d64b417377..8d85d60cd181 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -19,7 +19,7 @@ from collections import OrderedDict from copy import deepcopy from dataclasses import dataclass, field -from typing import Any, Optional +from typing import Any import torch from huggingface_hub import create_repo @@ -680,7 +680,7 @@ def fn_recursive_get_trigger(blocks): return all_triggers - def select_block(self, **kwargs) -> Optional[str]: + def select_block(self, **kwargs) -> str | None: """ Select the block to run based on the trigger inputs. Subclasses must implement this method to define the logic for selecting the block. @@ -724,7 +724,7 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: logger.error(error_msg) raise - def get_execution_blocks(self, **kwargs) -> Optional["ModularPipelineBlocks"]: + def get_execution_blocks(self, **kwargs) -> "ModularPipelineBlocks" | None: """ Get the block(s) that would execute given the inputs. From 45c250ea5209092e43d28489a83733d08b8685a0 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 22:23:56 +0000 Subject: [PATCH 55/58] update type hints --- src/diffusers/modular_pipelines/modular_pipeline_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 8d90b5b375b4..cab17c2aed5c 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -18,7 +18,7 @@ from collections import OrderedDict from dataclasses import dataclass, field from types import UnionType -from typing import Any, Dict, List, Literal, Tuple, Type, Union, get_args, get_origin +from typing import Any, Literal, Type, Union, get_args, get_origin import PIL.Image import torch @@ -972,7 +972,7 @@ def make_doc_string( return output -def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> List[InputParam]: +def combine_inputs(*named_input_lists: list[tuple[str, list[InputParam]]]) -> list[InputParam]: """ Combines multiple lists of InputParam objects from different blocks. 
For duplicate inputs, updates only if current default value is None and new default value is not None. Warns if multiple non-None default values exist for the @@ -1015,7 +1015,7 @@ def combine_inputs(*named_input_lists: List[Tuple[str, List[InputParam]]]) -> Li return list(combined_dict.values()) -def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> List[OutputParam]: +def combine_outputs(*named_output_lists: list[tuple[str, list[OutputParam]]]) -> list[OutputParam]: """ Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first occurrence of each output name. @@ -1038,7 +1038,7 @@ def combine_outputs(*named_output_lists: List[Tuple[str, List[OutputParam]]]) -> return list(combined_dict.values()) -def generate_modular_model_card_content(blocks) -> Dict[str, Any]: +def generate_modular_model_card_content(blocks) -> dict[str, Any]: """ Generate model card content for a modular pipeline. From e3e99bbdc3e51585f0495d3f526b0c681accf642 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 22:28:05 +0000 Subject: [PATCH 56/58] update guider update_components --- docs/source/en/modular_diffusers/guiders.md | 16 +--------------- docs/source/zh/modular_diffusers/guiders.md | 17 +---------------- 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/docs/source/en/modular_diffusers/guiders.md b/docs/source/en/modular_diffusers/guiders.md index 6abe4fad2736..ffe039f41556 100644 --- a/docs/source/en/modular_diffusers/guiders.md +++ b/docs/source/en/modular_diffusers/guiders.md @@ -89,10 +89,8 @@ t2i_pipeline.guider ## Changing guider parameters -The guider parameters can be adjusted with either the [`~ComponentSpec.create`] method or with [`~ModularPipeline.update_components`]. The example below changes the `guidance_scale` value. +The guider parameters can be adjusted with the [`~ComponentSpec.create`] method and [`~ModularPipeline.update_components`]. The example below changes the `guidance_scale` value. - - ```py guider_spec = t2i_pipeline.get_component_spec("guider") @@ -100,18 +98,6 @@ guider = guider_spec.create(guidance_scale=10) t2i_pipeline.update_components(guider=guider) ``` - - - -```py -guider_spec = t2i_pipeline.get_component_spec("guider") -guider_spec.config["guidance_scale"] = 10 -t2i_pipeline.update_components(guider=guider_spec) -``` - - - - ## Uploading custom guiders Call the [`~utils.PushToHubMixin.push_to_hub`] method on a custom guider to share it to the Hub. 
diff --git a/docs/source/zh/modular_diffusers/guiders.md b/docs/source/zh/modular_diffusers/guiders.md index 50436f90c4a5..2315625a197a 100644 --- a/docs/source/zh/modular_diffusers/guiders.md +++ b/docs/source/zh/modular_diffusers/guiders.md @@ -86,10 +86,7 @@ t2i_pipeline.guider ## 更改引导器参数 -引导器参数可以通过 [`~ComponentSpec.create`] 方法或 [`~ModularPipeline.update_components`] 方法进行调整。下面的示例更改了 `guidance_scale` 值。 - - - +引导器参数可以通过 [`~ComponentSpec.create`] 方法以及 [`~ModularPipeline.update_components`] 方法进行调整。下面的示例更改了 `guidance_scale` 值。 ```py guider_spec = t2i_pipeline.get_component_spec("guider") @@ -97,18 +94,6 @@ guider = guider_spec.create(guidance_scale=10) t2i_pipeline.update_components(guider=guider) ``` - - - -```py -guider_spec = t2i_pipeline.get_component_spec("guider") -guider_spec.config["guidance_scale"] = 10 -t2i_pipeline.update_components(guider=guider_spec) -``` - - - - ## 上传自定义引导器 在自定义引导器上调用 [`~utils.PushToHubMixin.push_to_hub`] 方法,将其分享到 Hub。 From 224904b77187931cd4a8dccd875cfee8447a4182 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 22:38:16 +0000 Subject: [PATCH 57/58] fix more --- src/diffusers/modular_pipelines/modular_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 8d85d60cd181..76a850b63c4e 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -724,7 +724,7 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: logger.error(error_msg) raise - def get_execution_blocks(self, **kwargs) -> "ModularPipelineBlocks" | None: + def get_execution_blocks(self, **kwargs) -> ModularPipelineBlocks | None: """ Get the block(s) that would execute given the inputs. From a66c159dac12ae78ac5cea23fa2591ed71c7fd06 Mon Sep 17 00:00:00 2001 From: "yiyi@huggingface.co" Date: Sat, 14 Feb 2026 22:39:45 +0000 Subject: [PATCH 58/58] update docstring auto again --- .../qwenimage/modular_blocks_qwenimage_edit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 1d8859638012..37b80b69ec7e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -720,8 +720,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Supported workflows: - - `edit`: requires `prompt`, `image` - - `edit_inpainting`: requires `prompt`, `mask_image`, `image` + - `image_conditioned`: requires `prompt`, `image` + - `image_conditioned_inpainting`: requires `prompt`, `mask_image`, `image` Components: image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor