diff --git a/docs/source/en/modular_diffusers/guiders.md b/docs/source/en/modular_diffusers/guiders.md index 6abe4fad2736..ffe039f41556 100644 --- a/docs/source/en/modular_diffusers/guiders.md +++ b/docs/source/en/modular_diffusers/guiders.md @@ -89,10 +89,8 @@ t2i_pipeline.guider ## Changing guider parameters -The guider parameters can be adjusted with either the [`~ComponentSpec.create`] method or with [`~ModularPipeline.update_components`]. The example below changes the `guidance_scale` value. +The guider parameters can be adjusted with the [`~ComponentSpec.create`] method and [`~ModularPipeline.update_components`]. The example below changes the `guidance_scale` value. - - ```py guider_spec = t2i_pipeline.get_component_spec("guider") @@ -100,18 +98,6 @@ guider = guider_spec.create(guidance_scale=10) t2i_pipeline.update_components(guider=guider) ``` - - - -```py -guider_spec = t2i_pipeline.get_component_spec("guider") -guider_spec.config["guidance_scale"] = 10 -t2i_pipeline.update_components(guider=guider_spec) -``` - - - - ## Uploading custom guiders Call the [`~utils.PushToHubMixin.push_to_hub`] method on a custom guider to share it to the Hub. diff --git a/docs/source/zh/modular_diffusers/guiders.md b/docs/source/zh/modular_diffusers/guiders.md index 50436f90c4a5..2315625a197a 100644 --- a/docs/source/zh/modular_diffusers/guiders.md +++ b/docs/source/zh/modular_diffusers/guiders.md @@ -86,10 +86,7 @@ t2i_pipeline.guider ## 更改引导器参数 -引导器参数可以通过 [`~ComponentSpec.create`] 方法或 [`~ModularPipeline.update_components`] 方法进行调整。下面的示例更改了 `guidance_scale` 值。 - - - +引导器参数可以通过 [`~ComponentSpec.create`] 方法以及 [`~ModularPipeline.update_components`] 方法进行调整。下面的示例更改了 `guidance_scale` 值。 ```py guider_spec = t2i_pipeline.get_component_spec("guider") @@ -97,18 +94,6 @@ guider = guider_spec.create(guidance_scale=10) t2i_pipeline.update_components(guider=guider) ``` - - - -```py -guider_spec = t2i_pipeline.get_component_spec("guider") -guider_spec.config["guidance_scale"] = 10 -t2i_pipeline.update_components(guider=guider_spec) -``` - - - - ## 上传自定义引导器 在自定义引导器上调用 [`~utils.PushToHubMixin.push_to_hub`] 方法,将其分享到 Hub。 diff --git a/src/diffusers/modular_pipelines/flux/__init__.py b/src/diffusers/modular_pipelines/flux/__init__.py index ec00986611c8..4754ed01ce6a 100644 --- a/src/diffusers/modular_pipelines/flux/__init__.py +++ b/src/diffusers/modular_pipelines/flux/__init__.py @@ -21,21 +21,8 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = ["FluxTextEncoderStep"] - _import_structure["modular_blocks"] = [ - "ALL_BLOCKS", - "AUTO_BLOCKS", - "AUTO_BLOCKS_KONTEXT", - "FLUX_KONTEXT_BLOCKS", - "TEXT2IMAGE_BLOCKS", - "FluxAutoBeforeDenoiseStep", - "FluxAutoBlocks", - "FluxAutoDecodeStep", - "FluxAutoDenoiseStep", - "FluxKontextAutoBlocks", - "FluxKontextAutoDenoiseStep", - "FluxKontextBeforeDenoiseStep", - ] + _import_structure["modular_blocks_flux"] = ["FluxAutoBlocks"] + _import_structure["modular_blocks_flux_kontext"] = ["FluxKontextAutoBlocks"] _import_structure["modular_pipeline"] = ["FluxKontextModularPipeline", "FluxModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -45,21 +32,8 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .encoders import FluxTextEncoderStep - from .modular_blocks import ( - ALL_BLOCKS, - AUTO_BLOCKS, - AUTO_BLOCKS_KONTEXT, - FLUX_KONTEXT_BLOCKS, - TEXT2IMAGE_BLOCKS, - FluxAutoBeforeDenoiseStep, - FluxAutoBlocks, - 
FluxAutoDecodeStep, - FluxAutoDenoiseStep, - FluxKontextAutoBlocks, - FluxKontextAutoDenoiseStep, - FluxKontextBeforeDenoiseStep, - ) + from .modular_blocks_flux import FluxAutoBlocks + from .modular_blocks_flux_kontext import FluxKontextAutoBlocks from .modular_pipeline import FluxKontextModularPipeline, FluxModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py index 4f94a17d88eb..583c139ff22e 100644 --- a/src/diffusers/modular_pipelines/flux/encoders.py +++ b/src/diffusers/modular_pipelines/flux/encoders.py @@ -205,7 +205,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState): return components, state -class FluxVaeEncoderDynamicStep(ModularPipelineBlocks): +class FluxVaeEncoderStep(ModularPipelineBlocks): model_name = "flux" def __init__( diff --git a/src/diffusers/modular_pipelines/flux/inputs.py b/src/diffusers/modular_pipelines/flux/inputs.py index dbf42e0c6df4..9d2f69dbe26f 100644 --- a/src/diffusers/modular_pipelines/flux/inputs.py +++ b/src/diffusers/modular_pipelines/flux/inputs.py @@ -121,7 +121,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip # Adapted from `QwenImageAdditionalInputsStep` -class FluxInputsDynamicStep(ModularPipelineBlocks): +class FluxAdditionalInputsStep(ModularPipelineBlocks): model_name = "flux" def __init__( @@ -243,7 +243,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip return components, state -class FluxKontextInputsDynamicStep(FluxInputsDynamicStep): +class FluxKontextAdditionalInputsStep(FluxAdditionalInputsStep): model_name = "flux-kontext" def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: @@ -256,7 +256,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip continue # 1. Calculate height/width from latents - # Unlike the `FluxInputsDynamicStep`, we don't overwrite the `block.height` and `block.width` + # Unlike the `FluxAdditionalInputsStep`, we don't overwrite the `block.height` and `block.width` height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor) if not hasattr(block_state, "image_height"): block_state.image_height = height @@ -303,6 +303,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip class FluxKontextSetResolutionStep(ModularPipelineBlocks): model_name = "flux-kontext" + @property def description(self): return ( "Determines the height and width to be used during the subsequent computations.\n" diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py deleted file mode 100644 index bd9b2d1b40c9..000000000000 --- a/src/diffusers/modular_pipelines/flux/modular_blocks.py +++ /dev/null @@ -1,446 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict -from .before_denoise import ( - FluxImg2ImgPrepareLatentsStep, - FluxImg2ImgSetTimestepsStep, - FluxKontextRoPEInputsStep, - FluxPrepareLatentsStep, - FluxRoPEInputsStep, - FluxSetTimestepsStep, -) -from .decoders import FluxDecodeStep -from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep -from .encoders import ( - FluxKontextProcessImagesInputStep, - FluxProcessImagesInputStep, - FluxTextEncoderStep, - FluxVaeEncoderDynamicStep, -) -from .inputs import ( - FluxInputsDynamicStep, - FluxKontextInputsDynamicStep, - FluxKontextSetResolutionStep, - FluxTextInputStep, -) - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# vae encoder (run before before_denoise) -FluxImg2ImgVaeEncoderBlocks = InsertableDict( - [("preprocess", FluxProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep())] -) - - -class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): - model_name = "flux" - - block_classes = FluxImg2ImgVaeEncoderBlocks.values() - block_names = FluxImg2ImgVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that preprocess andencode the image inputs into their latent representations." - - -class FluxAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [FluxImg2ImgVaeEncoderStep] - block_names = ["img2img"] - block_trigger_inputs = ["image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block that works for img2img tasks.\n" - + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided." - + " - if `image` is not provided, step will be skipped." - ) - - -# Flux Kontext vae encoder (run before before_denoise) - -FluxKontextVaeEncoderBlocks = InsertableDict( - [("preprocess", FluxKontextProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep(sample_mode="argmax"))] -) - - -class FluxKontextVaeEncoderStep(SequentialPipelineBlocks): - model_name = "flux-kontext" - - block_classes = FluxKontextVaeEncoderBlocks.values() - block_names = FluxKontextVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that preprocess andencode the image inputs into their latent representations." - - -class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [FluxKontextVaeEncoderStep] - block_names = ["img2img"] - block_trigger_inputs = ["image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block that works for img2img tasks.\n" - + " - `FluxKontextVaeEncoderStep` (img2img) is used when only `image` is provided." - + " - if `image` is not provided, step will be skipped." - ) - - -# before_denoise: text2img -FluxBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ] -) - - -class FluxBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxBeforeDenoiseBlocks.values() - block_names = FluxBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation." 
- - -# before_denoise: img2img -FluxImg2ImgBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxImg2ImgSetTimestepsStep()), - ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ] -) - - -class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxImg2ImgBeforeDenoiseBlocks.values() - block_names = FluxImg2ImgBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs for the denoise step for img2img task." - - -# before_denoise: all task (text2img, img2img) -class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks): - model_name = "flux-kontext" - block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step.\n" - + "This is an auto pipeline block that works for text2image.\n" - + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" - + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" - ) - - -# before_denoise: FluxKontext - -FluxKontextBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxKontextRoPEInputsStep()), - ] -) - - -class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxKontextBeforeDenoiseBlocks.values() - block_names = FluxKontextBeforeDenoiseBlocks.keys() - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step\n" - "for img2img/text2img task for Flux Kontext." - ) - - -class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxKontextBeforeDenoiseStep, FluxBeforeDenoiseStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step.\n" - + "This is an auto pipeline block that works for text2image.\n" - + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" - + " - `FluxKontextBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" - ) - - -# denoise: text2image -class FluxAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxDenoiseStep] - block_names = ["denoise"] - block_trigger_inputs = [None] - - @property - def description(self) -> str: - return ( - "Denoise step that iteratively denoise the latents. " - "This is a auto pipeline block that works for text2image and img2img tasks." - " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks." - ) - - -# denoise: Flux Kontext - - -class FluxKontextAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxKontextDenoiseStep] - block_names = ["denoise"] - block_trigger_inputs = [None] - - @property - def description(self) -> str: - return ( - "Denoise step that iteratively denoise the latents for Flux Kontext. " - "This is a auto pipeline block that works for text2image and img2img tasks." - " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks." 
- ) - - -# decode: all task (text2img, img2img) -class FluxAutoDecodeStep(AutoPipelineBlocks): - block_classes = [FluxDecodeStep] - block_names = ["non-inpaint"] - block_trigger_inputs = [None] - - @property - def description(self): - return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`" - - -# inputs: text2image/img2img -FluxImg2ImgBlocks = InsertableDict( - [("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())] -) - - -class FluxImg2ImgInputStep(SequentialPipelineBlocks): - model_name = "flux" - block_classes = FluxImg2ImgBlocks.values() - block_names = FluxImg2ImgBlocks.keys() - - @property - def description(self): - return "Input step that prepares the inputs for the img2img denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - - -class FluxAutoInputStep(AutoPipelineBlocks): - block_classes = [FluxImg2ImgInputStep, FluxTextInputStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self): - return ( - "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" - " This is an auto pipeline block that works for text2image/img2img tasks.\n" - + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" - + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n" - ) - - -# inputs: Flux Kontext - -FluxKontextBlocks = InsertableDict( - [ - ("set_resolution", FluxKontextSetResolutionStep()), - ("text_inputs", FluxTextInputStep()), - ("additional_inputs", FluxKontextInputsDynamicStep()), - ] -) - - -class FluxKontextInputStep(SequentialPipelineBlocks): - model_name = "flux-kontext" - block_classes = FluxKontextBlocks.values() - block_names = FluxKontextBlocks.keys() - - @property - def description(self): - return ( - "Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - ) - - -class FluxKontextAutoInputStep(AutoPipelineBlocks): - block_classes = [FluxKontextInputStep, FluxTextInputStep] - # block_classes = [FluxKontextInputStep] - block_names = ["img2img", "text2img"] - # block_names = ["img2img"] - block_trigger_inputs = ["image_latents", None] - # block_trigger_inputs = ["image_latents"] - - @property - def description(self): - return ( - "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" - " This is an auto pipeline block that works for text2image/img2img tasks.\n" - + " - `FluxKontextInputStep` (img2img) is used when `image_latents` is provided.\n" - + " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present." - ) - - -class FluxCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "flux" - block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep] - block_names = ["input", "before_denoise", "denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. 
\n" - + " - `FluxAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step supports text-to-image and image-to-image tasks for Flux:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings." - ) - - -class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "flux-kontext" - block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextAutoDenoiseStep] - block_names = ["input", "before_denoise", "denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. \n" - + " - `FluxKontextAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `FluxKontextAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `FluxKontextAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step supports text-to-image and image-to-image tasks for Flux:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings." - ) - - -# Auto blocks (text2image and img2img) -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxAutoVaeEncoderStep()), - ("denoise", FluxCoreDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -AUTO_BLOCKS_KONTEXT = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxKontextAutoVaeEncoderStep()), - ("denoise", FluxKontextCoreDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - - -class FluxAutoBlocks(SequentialPipelineBlocks): - model_name = "flux" - - block_classes = AUTO_BLOCKS.values() - block_names = AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for text-to-image and image-to-image using Flux.\n" - + "- for text-to-image generation, all you need to provide is `prompt`\n" - + "- for image-to-image generation, you need to provide either `image` or `image_latents`" - ) - - -class FluxKontextAutoBlocks(FluxAutoBlocks): - model_name = "flux-kontext" - - block_classes = AUTO_BLOCKS_KONTEXT.values() - block_names = AUTO_BLOCKS_KONTEXT.keys() - - -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("input", FluxTextInputStep()), - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ("denoise", FluxDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxVaeEncoderDynamicStep()), - ("input", FluxImg2ImgInputStep()), - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxImg2ImgSetTimestepsStep()), - ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ("denoise", FluxDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -FLUX_KONTEXT_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxVaeEncoderDynamicStep(sample_mode="argmax")), - ("input", FluxKontextInputStep()), - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", 
FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxKontextRoPEInputsStep()), - ("denoise", FluxKontextDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "img2img": IMAGE2IMAGE_BLOCKS, - "auto": AUTO_BLOCKS, - "auto_kontext": AUTO_BLOCKS_KONTEXT, - "kontext": FLUX_KONTEXT_BLOCKS, -} diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py new file mode 100644 index 000000000000..f2e78e933448 --- /dev/null +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py @@ -0,0 +1,586 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict, OutputParam +from .before_denoise import ( + FluxImg2ImgPrepareLatentsStep, + FluxImg2ImgSetTimestepsStep, + FluxPrepareLatentsStep, + FluxRoPEInputsStep, + FluxSetTimestepsStep, +) +from .decoders import FluxDecodeStep +from .denoise import FluxDenoiseStep +from .encoders import ( + FluxProcessImagesInputStep, + FluxTextEncoderStep, + FluxVaeEncoderStep, +) +from .inputs import ( + FluxAdditionalInputsStep, + FluxTextInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# vae encoder (run before before_denoise) + + +# auto_docstring +class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + """ + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + resized_image (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + + model_name = "flux" + + block_classes = [FluxProcessImagesInputStep(), FluxVaeEncoderStep()] + block_names = ["preprocess", "encode"] + + @property + def description(self) -> str: + return "Vae encoder step that preprocess andencode the image inputs into their latent representations." + + +# auto_docstring +class FluxAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block that works for img2img tasks. + - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided. - if `image` is not provided, + step will be skipped. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + resized_image (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. 
+ height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + + model_name = "flux" + block_classes = [FluxImg2ImgVaeEncoderStep] + block_names = ["img2img"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "Vae encoder step that encode the image inputs into their latent representations.\n" + + "This is an auto pipeline block that works for img2img tasks.\n" + + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided." + + " - if `image` is not provided, step will be skipped." + ) + + +# before_denoise: text2img +# auto_docstring +class FluxBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepares the inputs for the denoise step in text-to-image generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux" + block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] + + @property + def description(self): + return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation." + + +# before_denoise: img2img +# auto_docstring +class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step for img2img task. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. 
+ Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_latents (`Tensor`): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux" + block_classes = [ + FluxPrepareLatentsStep(), + FluxImg2ImgSetTimestepsStep(), + FluxImg2ImgPrepareLatentsStep(), + FluxRoPEInputsStep(), + ] + block_names = ["prepare_latents", "set_timesteps", "prepare_img2img_latents", "prepare_rope_inputs"] + + @property + def description(self): + return "Before denoise step that prepare the inputs for the denoise step for img2img task." + + +# before_denoise: all task (text2img, img2img) +# auto_docstring +class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step. + This is an auto pipeline block that works for text2image. + - `FluxBeforeDenoiseStep` (text2image) is used. + - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`): + TODO: Add description. + width (`int`): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_latents (`Tensor`, *optional*): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. 
+ initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux" + block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep] + block_names = ["img2img", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs for the denoise step.\n" + + "This is an auto pipeline block that works for text2image.\n" + + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" + + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" + ) + + +# inputs: text2image/img2img + + +# auto_docstring +class FluxImg2ImgInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the img2img denoising step. It: + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + + Outputs: + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + + model_name = "flux" + block_classes = [FluxTextInputStep(), FluxAdditionalInputsStep()] + block_names = ["text_inputs", "additional_inputs"] + + @property + def description(self): + return "Input step that prepares the inputs for the img2img denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" + " - update height/width based `image_latents`, patchify `image_latents`." + + +# auto_docstring +class FluxAutoInputStep(AutoPipelineBlocks): + """ + Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, + and patchified. + This is an auto pipeline block that works for text2image/img2img tasks. + - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided. + - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided. + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. 
+ + Outputs: + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + + model_name = "flux" + + block_classes = [FluxImg2ImgInputStep, FluxTextInputStep] + block_names = ["img2img", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" + " This is an auto pipeline block that works for text2image/img2img tasks.\n" + + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" + + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n" + ) + + +# auto_docstring +class FluxCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core step that performs the denoising process for Flux. + This step supports text-to-image and image-to-image tasks for Flux: + - for image-to-image generation, you need to provide `image_latents` + - for text-to-image generation, all you need to provide is prompt embeddings. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux" + block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxDenoiseStep] + block_names = ["input", "before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process for Flux.\n" + + "This step supports text-to-image and image-to-image tasks for Flux:\n" + + " - for image-to-image generation, you need to provide `image_latents`\n" + + " - for text-to-image generation, all you need to provide is prompt embeddings." 
+ ) + + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + + +# Auto blocks (text2image and img2img) +AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", FluxTextEncoderStep()), + ("vae_encoder", FluxAutoVaeEncoderStep()), + ("denoise", FluxCoreDenoiseStep()), + ("decode", FluxDecodeStep()), + ] +) + + +# auto_docstring +class FluxAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image and image-to-image using Flux. + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `image`, `prompt` + + Components: + text_encoder (`CLIPTextModel`) tokenizer (`CLIPTokenizer`) text_encoder_2 (`T5EncoderModel`) tokenizer_2 + (`T5TokenizerFast`) image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + prompt_2 (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + resized_image (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + + model_name = "flux" + + block_classes = AUTO_BLOCKS.values() + block_names = AUTO_BLOCKS.keys() + + _workflow_map = { + "text2image": {"prompt": True}, + "image2image": {"image": True, "prompt": True}, + } + + @property + def description(self): + return "Auto Modular pipeline for text-to-image and image-to-image using Flux." + + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py new file mode 100644 index 000000000000..b5a5dbf78c0e --- /dev/null +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py @@ -0,0 +1,585 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict, OutputParam +from .before_denoise import ( + FluxKontextRoPEInputsStep, + FluxPrepareLatentsStep, + FluxRoPEInputsStep, + FluxSetTimestepsStep, +) +from .decoders import FluxDecodeStep +from .denoise import FluxKontextDenoiseStep +from .encoders import ( + FluxKontextProcessImagesInputStep, + FluxTextEncoderStep, + FluxVaeEncoderStep, +) +from .inputs import ( + FluxKontextAdditionalInputsStep, + FluxKontextSetResolutionStep, + FluxTextInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Flux Kontext vae encoder (run before before_denoise) +# auto_docstring +class FluxKontextVaeEncoderStep(SequentialPipelineBlocks): + """ + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + _auto_resize (`bool`, *optional*, defaults to True): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + + model_name = "flux-kontext" + + block_classes = [FluxKontextProcessImagesInputStep(), FluxVaeEncoderStep(sample_mode="argmax")] + block_names = ["preprocess", "encode"] + + @property + def description(self) -> str: + return "Vae encoder step that preprocess andencode the image inputs into their latent representations." + + +# auto_docstring +class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block that works for image-conditioned tasks. + - `FluxKontextVaeEncoderStep` (image_conditioned) is used when only `image` is provided. - if `image` is not + provided, step will be skipped. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + _auto_resize (`bool`, *optional*, defaults to True): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + + model_name = "flux-kontext" + + block_classes = [FluxKontextVaeEncoderStep] + block_names = ["image_conditioned"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "Vae encoder step that encode the image inputs into their latent representations.\n" + + "This is an auto pipeline block that works for image-conditioned tasks.\n" + + " - `FluxKontextVaeEncoderStep` (image_conditioned) is used when only `image` is provided." + + " - if `image` is not provided, step will be skipped." + ) + + +# before_denoise: text2img +# auto_docstring +class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepares the inputs for the denoise step for Flux Kontext + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. 
+ num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux-kontext" + + block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] + + @property + def description(self): + return "Before denoise step that prepares the inputs for the denoise step for Flux Kontext\n" + "for text-to-image tasks." + + +# before_denoise: image-conditioned +# auto_docstring +class FluxKontextImageConditionedBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step for Flux Kontext + for image-conditioned tasks. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_height (`None`, *optional*): + TODO: Add description. + image_width (`None`, *optional*): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. 
+ """ + + model_name = "flux-kontext" + + block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxKontextRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs for the denoise step for Flux Kontext\n" + "for image-conditioned tasks." + ) + + +# auto_docstring +class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step. + This is an auto pipeline block that works for text2image. + - `FluxKontextBeforeDenoiseStep` (text2image) is used. + - `FluxKontextImageConditionedBeforeDenoiseStep` (image_conditioned) is used when only `image_latents` is + provided. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_height (`None`, *optional*): + TODO: Add description. + image_width (`None`, *optional*): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux-kontext" + + block_classes = [FluxKontextImageConditionedBeforeDenoiseStep, FluxKontextBeforeDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs for the denoise step.\n" + + "This is an auto pipeline block that works for text2image.\n" + + " - `FluxKontextBeforeDenoiseStep` (text2image) is used.\n" + + " - `FluxKontextImageConditionedBeforeDenoiseStep` (image_conditioned) is used when only `image_latents` is provided.\n" + ) + + +# inputs: Flux Kontext +# auto_docstring +class FluxKontextInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the both text2img and img2img denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`). + - update height/width based `image_latents`, patchify `image_latents`. + + Inputs: + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. 
+ max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + image_latents (`None`, *optional*): + TODO: Add description. + + Outputs: + height (`int`): + The height of the initial noisy latents + width (`int`): + The width of the initial noisy latents + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + + model_name = "flux-kontext" + block_classes = [FluxKontextSetResolutionStep(), FluxTextInputStep(), FluxKontextAdditionalInputsStep()] + block_names = ["set_resolution", "text_inputs", "additional_inputs"] + + @property + def description(self): + return ( + "Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" + " - update height/width based `image_latents`, patchify `image_latents`." + ) + + +# auto_docstring +class FluxKontextAutoInputStep(AutoPipelineBlocks): + """ + Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, + and patchified. + This is an auto pipeline block that works for text2image/img2img tasks. + - `FluxKontextInputStep` (image_conditioned) is used when `image_latents` is provided. + - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present. + + Inputs: + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + image_latents (`None`, *optional*): + TODO: Add description. 
+ + Outputs: + height (`int`): + The height of the initial noisy latents + width (`int`): + The width of the initial noisy latents + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + + model_name = "flux-kontext" + block_classes = [FluxKontextInputStep, FluxTextInputStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" + " This is an auto pipeline block that works for text2image/img2img tasks.\n" + + " - `FluxKontextInputStep` (image_conditioned) is used when `image_latents` is provided.\n" + + " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present." + ) + + +# auto_docstring +class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core step that performs the denoising process for Flux Kontext. + This step supports text-to-image and image-conditioned tasks for Flux Kontext: + - for image-conditioned generation, you need to provide `image_latents` + - for text-to-image generation, all you need to provide is prompt embeddings. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux-kontext" + block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextDenoiseStep] + block_names = ["input", "before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process for Flux Kontext.\n" + + "This step supports text-to-image and image-conditioned tasks for Flux Kontext:\n" + + " - for image-conditioned generation, you need to provide `image_latents`\n" + + " - for text-to-image generation, all you need to provide is prompt embeddings." 
+ ) + + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + + +AUTO_BLOCKS_KONTEXT = InsertableDict( + [ + ("text_encoder", FluxTextEncoderStep()), + ("vae_encoder", FluxKontextAutoVaeEncoderStep()), + ("denoise", FluxKontextCoreDenoiseStep()), + ("decode", FluxDecodeStep()), + ] +) + + +# auto_docstring +class FluxKontextAutoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline for image-to-image using Flux Kontext. + + Supported workflows: + - `image_conditioned`: requires `image`, `prompt` + - `text2image`: requires `prompt` + + Components: + text_encoder (`CLIPTextModel`) tokenizer (`CLIPTokenizer`) text_encoder_2 (`T5EncoderModel`) tokenizer_2 + (`T5TokenizerFast`) image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + prompt_2 (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + _auto_resize (`bool`, *optional*, defaults to True): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + + model_name = "flux-kontext" + + block_classes = AUTO_BLOCKS_KONTEXT.values() + block_names = AUTO_BLOCKS_KONTEXT.keys() + _workflow_map = { + "image_conditioned": {"image": True, "prompt": True}, + "text2image": {"prompt": True}, + } + + @property + def description(self): + return "Modular pipeline for image-to-image using Flux Kontext." 
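
For reference, the `_workflow_map` declared on `FluxKontextAutoBlocks` above can be exercised through the `available_workflows` and `get_workflow` helpers this PR adds to `SequentialPipelineBlocks`. A minimal sketch, not part of the patch; the import path is an assumption and trigger values only matter by presence:

```py
# Hedged usage sketch for the workflow map on FluxKontextAutoBlocks.
from diffusers.modular_pipelines.flux import FluxKontextAutoBlocks  # assumed import path

blocks = FluxKontextAutoBlocks()

# `available_workflows` reads the keys of `_workflow_map`.
print(blocks.available_workflows)  # ["image_conditioned", "text2image"]

# `get_workflow` resolves the conditional sub-blocks by calling
# `get_execution_blocks` with that workflow's trigger inputs.
text2image_blocks = blocks.get_workflow("text2image")
print(text2image_blocks)
```
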
+ + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/modular_pipelines/flux2/__init__.py b/src/diffusers/modular_pipelines/flux2/__init__.py index 74907a9af806..d7cc8badcaf7 100644 --- a/src/diffusers/modular_pipelines/flux2/__init__.py +++ b/src/diffusers/modular_pipelines/flux2/__init__.py @@ -21,44 +21,14 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = [ - "Flux2TextEncoderStep", - "Flux2RemoteTextEncoderStep", - "Flux2VaeEncoderStep", - ] - _import_structure["before_denoise"] = [ - "Flux2SetTimestepsStep", - "Flux2PrepareLatentsStep", - "Flux2RoPEInputsStep", - "Flux2PrepareImageLatentsStep", - ] - _import_structure["denoise"] = [ - "Flux2LoopDenoiser", - "Flux2LoopAfterDenoiser", - "Flux2DenoiseLoopWrapper", - "Flux2DenoiseStep", - ] - _import_structure["decoders"] = ["Flux2DecodeStep"] - _import_structure["inputs"] = [ - "Flux2ProcessImagesInputStep", - "Flux2TextInputStep", - ] - _import_structure["modular_blocks_flux2"] = [ - "ALL_BLOCKS", - "AUTO_BLOCKS", - "REMOTE_AUTO_BLOCKS", - "TEXT2IMAGE_BLOCKS", - "IMAGE_CONDITIONED_BLOCKS", - "Flux2AutoBlocks", - "Flux2AutoVaeEncoderStep", - "Flux2CoreDenoiseStep", - "Flux2VaeEncoderSequentialStep", - ] - _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks", "Flux2KleinBaseAutoBlocks"] + _import_structure["encoders"] = ["Flux2RemoteTextEncoderStep"] + _import_structure["modular_blocks_flux2"] = ["Flux2AutoBlocks"] + _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks"] + _import_structure["modular_blocks_flux2_klein_base"] = ["Flux2KleinBaseAutoBlocks"] _import_structure["modular_pipeline"] = [ - "Flux2ModularPipeline", - "Flux2KleinModularPipeline", "Flux2KleinBaseModularPipeline", + "Flux2KleinModularPipeline", + "Flux2ModularPipeline", ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -68,43 +38,10 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .before_denoise import ( - Flux2PrepareImageLatentsStep, - Flux2PrepareLatentsStep, - Flux2RoPEInputsStep, - Flux2SetTimestepsStep, - ) - from .decoders import Flux2DecodeStep - from .denoise import ( - Flux2DenoiseLoopWrapper, - Flux2DenoiseStep, - Flux2LoopAfterDenoiser, - Flux2LoopDenoiser, - ) - from .encoders import ( - Flux2RemoteTextEncoderStep, - Flux2TextEncoderStep, - Flux2VaeEncoderStep, - ) - from .inputs import ( - Flux2ProcessImagesInputStep, - Flux2TextInputStep, - ) - from .modular_blocks_flux2 import ( - ALL_BLOCKS, - AUTO_BLOCKS, - IMAGE_CONDITIONED_BLOCKS, - REMOTE_AUTO_BLOCKS, - TEXT2IMAGE_BLOCKS, - Flux2AutoBlocks, - Flux2AutoVaeEncoderStep, - Flux2CoreDenoiseStep, - Flux2VaeEncoderSequentialStep, - ) - from .modular_blocks_flux2_klein import ( - Flux2KleinAutoBlocks, - Flux2KleinBaseAutoBlocks, - ) + from .encoders import Flux2RemoteTextEncoderStep + from .modular_blocks_flux2 import Flux2AutoBlocks + from .modular_blocks_flux2_klein import Flux2KleinAutoBlocks + from .modular_blocks_flux2_klein_base import Flux2KleinBaseAutoBlocks from .modular_pipeline import Flux2KleinBaseModularPipeline, Flux2KleinModularPipeline, Flux2ModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py index 41a0ff7dee28..b1033a7dff9e 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py +++ 
b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks @@ -30,7 +26,6 @@ from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep from .denoise import Flux2DenoiseStep from .encoders import ( - Flux2RemoteTextEncoderStep, Flux2TextEncoderStep, Flux2VaeEncoderStep, ) @@ -43,26 +38,69 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -Flux2VaeEncoderBlocks = InsertableDict( - [ - ("preprocess", Flux2ProcessImagesInputStep()), - ("encode", Flux2VaeEncoderStep()), - ] -) - - +# auto_docstring class Flux2VaeEncoderSequentialStep(SequentialPipelineBlocks): + """ + VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + model_name = "flux2" - block_classes = Flux2VaeEncoderBlocks.values() - block_names = Flux2VaeEncoderBlocks.keys() + block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] + block_names = ["preprocess", "encode"] @property def description(self) -> str: return "VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning." +# auto_docstring class Flux2AutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image inputs into their latent representations. + This is an auto pipeline block that works for image conditioning tasks. + - `Flux2VaeEncoderSequentialStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + block_classes = [Flux2VaeEncoderSequentialStep] block_names = ["img_conditioning"] block_trigger_inputs = ["image"] @@ -80,7 +118,6 @@ def description(self): Flux2CoreDenoiseBlocks = InsertableDict( [ ("input", Flux2TextInputStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), ("prepare_guidance", Flux2PrepareGuidanceStep()), @@ -91,7 +128,47 @@ def description(self): ) +# auto_docstring class Flux2CoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-dev. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. 
+ prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 4.0): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latents (`Tensor`, *optional*): + Packed image latents for conditioning. Shape: (B, img_seq_len, C) + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2" block_classes = Flux2CoreDenoiseBlocks.values() @@ -99,108 +176,181 @@ class Flux2CoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-dev.\n" - " - `Flux2TextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n" - " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" - " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" - " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" - " - `Flux2PrepareGuidanceStep` (prepare_guidance) prepares the guidance tensor for the denoising step.\n" - " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n" - " - `Flux2DenoiseStep` (denoise) iteratively denoises the latents.\n" - " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" - ) + return "Core denoise step that performs the denoising process for Flux2-dev." @property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] -AUTO_BLOCKS = InsertableDict( +Flux2ImageConditionedCoreDenoiseBlocks = InsertableDict( [ - ("text_encoder", Flux2TextEncoderStep()), - ("vae_encoder", Flux2AutoVaeEncoderStep()), - ("denoise", Flux2CoreDenoiseStep()), - ("decode", Flux2DecodeStep()), + ("input", Flux2TextInputStep()), + ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_guidance", Flux2PrepareGuidanceStep()), + ("prepare_rope_inputs", Flux2RoPEInputsStep()), + ("denoise", Flux2DenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), ] ) -REMOTE_AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", Flux2RemoteTextEncoderStep()), - ("vae_encoder", Flux2AutoVaeEncoderStep()), - ("denoise", Flux2CoreDenoiseStep()), - ("decode", Flux2DecodeStep()), - ] -) - +# auto_docstring +class Flux2ImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-dev with image conditioning. 
+ + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + image_latents (`list`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 4.0): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ -class Flux2AutoBlocks(SequentialPipelineBlocks): model_name = "flux2" - block_classes = AUTO_BLOCKS.values() - block_names = AUTO_BLOCKS.keys() + block_classes = Flux2ImageConditionedCoreDenoiseBlocks.values() + block_names = Flux2ImageConditionedCoreDenoiseBlocks.keys() @property def description(self): - return ( - "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2.\n" - "- For text-to-image generation, all you need to provide is `prompt`.\n" - "- For image-conditioned generation, you need to provide `image` (list of PIL images)." - ) + return "Core denoise step that performs the denoising process for Flux2-dev with image conditioning." @property def outputs(self): return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) + OutputParam.template("latents"), ] -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", Flux2TextEncoderStep()), - ("text_input", Flux2TextInputStep()), - ("prepare_latents", Flux2PrepareLatentsStep()), - ("set_timesteps", Flux2SetTimestepsStep()), - ("prepare_guidance", Flux2PrepareGuidanceStep()), - ("prepare_rope_inputs", Flux2RoPEInputsStep()), - ("denoise", Flux2DenoiseStep()), - ("after_denoise", Flux2UnpackLatentsStep()), - ("decode", Flux2DecodeStep()), - ] -) +class Flux2AutoCoreDenoiseStep(AutoPipelineBlocks): + model_name = "flux2" + + block_classes = [Flux2ImageConditionedCoreDenoiseStep, Flux2CoreDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Auto core denoise step that performs the denoising process for Flux2-dev." + "This is an auto pipeline block that works for text-to-image and image-conditioned generation." 
+ " - `Flux2CoreDenoiseStep` is used for text-to-image generation.\n" + " - `Flux2ImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" + ) + -IMAGE_CONDITIONED_BLOCKS = InsertableDict( +AUTO_BLOCKS = InsertableDict( [ ("text_encoder", Flux2TextEncoderStep()), - ("text_input", Flux2TextInputStep()), - ("preprocess_images", Flux2ProcessImagesInputStep()), - ("vae_encoder", Flux2VaeEncoderStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), - ("prepare_latents", Flux2PrepareLatentsStep()), - ("set_timesteps", Flux2SetTimestepsStep()), - ("prepare_guidance", Flux2PrepareGuidanceStep()), - ("prepare_rope_inputs", Flux2RoPEInputsStep()), - ("denoise", Flux2DenoiseStep()), - ("after_denoise", Flux2UnpackLatentsStep()), + ("vae_encoder", Flux2AutoVaeEncoderStep()), + ("denoise", Flux2AutoCoreDenoiseStep()), ("decode", Flux2DecodeStep()), ] ) -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "image_conditioned": IMAGE_CONDITIONED_BLOCKS, - "auto": AUTO_BLOCKS, - "remote": REMOTE_AUTO_BLOCKS, -} + +# auto_docstring +class Flux2AutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2. + + Supported workflows: + - `text2image`: requires `prompt` + - `image_conditioned`: requires `image`, `prompt` + + Components: + text_encoder (`Mistral3ForConditionalGeneration`) tokenizer (`AutoProcessor`) image_processor + (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`Flux2Transformer2DModel`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + text_encoder_out_layers (`tuple`, *optional*, defaults to (10, 20, 30)): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 4.0): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + + model_name = "flux2" + + block_classes = AUTO_BLOCKS.values() + block_names = AUTO_BLOCKS.keys() + _workflow_map = { + "text2image": {"prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, + } + + @property + def description(self): + return "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2." 
+ + @property + def outputs(self): + return [ + OutputParam.template("images"), + ] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py index 984832d77be5..5dbae43a5a7f 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py @@ -12,30 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( - Flux2KleinBaseRoPEInputsStep, Flux2PrepareImageLatentsStep, Flux2PrepareLatentsStep, Flux2RoPEInputsStep, Flux2SetTimestepsStep, ) from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep -from .denoise import Flux2KleinBaseDenoiseStep, Flux2KleinDenoiseStep +from .denoise import Flux2KleinDenoiseStep from .encoders import ( - Flux2KleinBaseTextEncoderStep, Flux2KleinTextEncoderStep, Flux2VaeEncoderStep, ) from .inputs import ( - Flux2KleinBaseTextInputStep, Flux2ProcessImagesInputStep, Flux2TextInputStep, ) @@ -47,26 +40,72 @@ # VAE encoder ################ -Flux2KleinVaeEncoderBlocks = InsertableDict( - [ - ("preprocess", Flux2ProcessImagesInputStep()), - ("encode", Flux2VaeEncoderStep()), - ] -) - +# auto_docstring class Flux2KleinVaeEncoderSequentialStep(SequentialPipelineBlocks): - model_name = "flux2" + """ + VAE encoder step that preprocesses and encodes the image inputs into their latent representations. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + + model_name = "flux2-klein" - block_classes = Flux2KleinVaeEncoderBlocks.values() - block_names = Flux2KleinVaeEncoderBlocks.keys() + block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] + block_names = ["preprocess", "encode"] @property def description(self) -> str: return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations." +# auto_docstring class Flux2KleinAutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image inputs into their latent representations. + This is an auto pipeline block that works for image conditioning tasks. + - `Flux2KleinVaeEncoderSequentialStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. 
+ image_latents (`list`): + List of latent representations for each reference image + """ + + model_name = "flux2-klein" + block_classes = [Flux2KleinVaeEncoderSequentialStep] block_names = ["img_conditioning"] block_trigger_inputs = ["image"] @@ -88,7 +127,6 @@ def description(self): Flux2KleinCoreDenoiseBlocks = InsertableDict( [ ("input", Flux2TextInputStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), ("prepare_rope_inputs", Flux2RoPEInputsStep()), @@ -98,7 +136,46 @@ def description(self): ) +# auto_docstring class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image + generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latents (`Tensor`, *optional*): + Packed image latents for conditioning. Shape: (B, img_seq_len, C) + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2-klein" block_classes = Flux2KleinCoreDenoiseBlocks.values() @@ -106,127 +183,218 @@ class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-Klein (distilled model).\n" - " - `Flux2KleinTextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n" - " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" - " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" - " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" - " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n" - " - `Flux2KleinDenoiseStep` (denoise) iteratively denoises the latents.\n" - " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" - ) + return "Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image generation." 
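
Picking up the conditional VAE-encoder behavior described a bit earlier (trigger input `image`, no default block), a small sketch of the skip semantics follows. The module path is an assumption, since this step is not re-exported from the package `__init__`:

```py
# Sketch: an AutoPipelineBlocks with a single trigger and no default block is
# skipped entirely when the trigger input is absent.
from diffusers.modular_pipelines.flux2.modular_blocks_flux2_klein import (  # assumed path
    Flux2KleinAutoVaeEncoderStep,
)

vae_encoder = Flux2KleinAutoVaeEncoderStep()

# No `image`: no trigger matches and there is no default, so the step resolves to None.
assert vae_encoder.get_execution_blocks() is None

# With `image` present (only presence matters), the preprocess + encode blocks are selected.
print(vae_encoder.get_execution_blocks(image=True))
```
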
@property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] -Flux2KleinBaseCoreDenoiseBlocks = InsertableDict( +Flux2KleinImageConditionedCoreDenoiseBlocks = InsertableDict( [ - ("input", Flux2KleinBaseTextInputStep()), - ("prepare_latents", Flux2PrepareLatentsStep()), + ("input", Flux2TextInputStep()), ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), - ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), - ("denoise", Flux2KleinBaseDenoiseStep()), + ("prepare_rope_inputs", Flux2RoPEInputsStep()), + ("denoise", Flux2KleinDenoiseStep()), ("after_denoise", Flux2UnpackLatentsStep()), ] ) -class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): +# auto_docstring +class Flux2KleinImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + image_latents (`list`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2-klein" - block_classes = Flux2KleinBaseCoreDenoiseBlocks.values() - block_names = Flux2KleinBaseCoreDenoiseBlocks.keys() + + block_classes = Flux2KleinImageConditionedCoreDenoiseBlocks.values() + block_names = Flux2KleinImageConditionedCoreDenoiseBlocks.keys() @property def description(self): - return "Core denoise step that performs the denoising process for Flux2-Klein (base model)." - return ( - "Core denoise step that performs the denoising process for Flux2-Klein (base model).\n" - " - `Flux2KleinBaseTextInputStep` (input) standardizes the text inputs (prompt_embeds + negative_prompt_embeds) for the denoising step.\n" - " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" - " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" - " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" - " - `Flux2KleinBaseRoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids + negative_txt_ids) for the denoising step.\n" - " - `Flux2KleinBaseDenoiseStep` (denoise) iteratively denoises the latents using Classifier-Free Guidance.\n" - " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" - ) + return "Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning." 
@property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] -### -### Auto blocks -### -class Flux2KleinAutoBlocks(SequentialPipelineBlocks): +# auto_docstring +class Flux2KleinAutoCoreDenoiseStep(AutoPipelineBlocks): + """ + Auto core denoise step that performs the denoising process for Flux2-Klein. + This is an auto pipeline block that works for text-to-image and image-conditioned generation. + - `Flux2KleinCoreDenoiseStep` is used for text-to-image generation. + - `Flux2KleinImageConditionedCoreDenoiseStep` is used for image-conditioned generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + image_latents (`list`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2-klein" - block_classes = [ - Flux2KleinTextEncoderStep(), - Flux2KleinAutoVaeEncoderStep(), - Flux2KleinCoreDenoiseStep(), - Flux2DecodeStep(), - ] - block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + block_classes = [Flux2KleinImageConditionedCoreDenoiseStep, Flux2KleinCoreDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] @property def description(self): return ( - "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein.\n" - + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n" - + " - for text-to-image generation, all you need to provide is `prompt`.\n" + "Auto core denoise step that performs the denoising process for Flux2-Klein.\n" + "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n" + " - `Flux2KleinCoreDenoiseStep` is used for text-to-image generation.\n" + " - `Flux2KleinImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" ) - @property - def outputs(self): - return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) - ] + +### +### Auto blocks +### -class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): +# auto_docstring +class Flux2KleinAutoBlocks(SequentialPipelineBlocks): + """ + Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein. 
+ + Supported workflows: + - `text2image`: requires `prompt` + - `image_conditioned`: requires `image`, `prompt` + + Components: + text_encoder (`Qwen3ForCausalLM`) tokenizer (`Qwen2TokenizerFast`) image_processor (`Flux2ImageProcessor`) + vae (`AutoencoderKLFlux2`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`Flux2Transformer2DModel`) + + Configs: + is_distilled (default: True) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + text_encoder_out_layers (`tuple`, *optional*, defaults to (9, 18, 27)): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + model_name = "flux2-klein" block_classes = [ - Flux2KleinBaseTextEncoderStep(), + Flux2KleinTextEncoderStep(), Flux2KleinAutoVaeEncoderStep(), - Flux2KleinBaseCoreDenoiseStep(), + Flux2KleinAutoCoreDenoiseStep(), Flux2DecodeStep(), ] block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + _workflow_map = { + "text2image": {"prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, + } @property def description(self): - return ( - "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model).\n" - + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n" - + " - for text-to-image generation, all you need to provide is `prompt`.\n" - ) + return "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein." @property def outputs(self): return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py new file mode 100644 index 000000000000..42e025c622b4 --- /dev/null +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py @@ -0,0 +1,413 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict, OutputParam +from .before_denoise import ( + Flux2KleinBaseRoPEInputsStep, + Flux2PrepareImageLatentsStep, + Flux2PrepareLatentsStep, + Flux2SetTimestepsStep, +) +from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep +from .denoise import Flux2KleinBaseDenoiseStep +from .encoders import ( + Flux2KleinBaseTextEncoderStep, + Flux2VaeEncoderStep, +) +from .inputs import ( + Flux2KleinBaseTextInputStep, + Flux2ProcessImagesInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +################ +# VAE encoder +################ + + +# auto_docstring +class Flux2KleinBaseVaeEncoderSequentialStep(SequentialPipelineBlocks): + """ + VAE encoder step that preprocesses and encodes the image inputs into their latent representations. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + + model_name = "flux2" + + block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] + block_names = ["preprocess", "encode"] + + @property + def description(self) -> str: + return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations." + + +# auto_docstring +class Flux2KleinBaseAutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image inputs into their latent representations. + This is an auto pipeline block that works for image conditioning tasks. + - `Flux2KleinBaseVaeEncoderSequentialStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + + block_classes = [Flux2KleinBaseVaeEncoderSequentialStep] + block_names = ["img_conditioning"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "VAE encoder step that encodes the image inputs into their latent representations.\n" + "This is an auto pipeline block that works for image conditioning tasks.\n" + " - `Flux2KleinBaseVaeEncoderSequentialStep` is used when `image` is provided.\n" + " - If `image` is not provided, step will be skipped." 
+ ) + + +### +### Core denoise +### + +Flux2KleinBaseCoreDenoiseBlocks = InsertableDict( + [ + ("input", Flux2KleinBaseTextInputStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), + ("denoise", Flux2KleinBaseDenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), + ] +) + + +# auto_docstring +class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider + (`ClassifierFreeGuidance`) + + Configs: + is_distilled (default: False) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latents (`Tensor`, *optional*): + Packed image latents for conditioning. Shape: (B, img_seq_len, C) + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux2-klein" + block_classes = Flux2KleinBaseCoreDenoiseBlocks.values() + block_names = Flux2KleinBaseCoreDenoiseBlocks.keys() + + @property + def description(self): + return "Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation." + + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + + +Flux2KleinBaseImageConditionedCoreDenoiseBlocks = InsertableDict( + [ + ("input", Flux2KleinBaseTextInputStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), + ("denoise", Flux2KleinBaseDenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), + ] +) + + +# auto_docstring +class Flux2KleinBaseImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider + (`ClassifierFreeGuidance`) + + Configs: + is_distilled (default: False) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. 
+ width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux2-klein" + block_classes = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.values() + block_names = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.keys() + + @property + def description(self): + return "Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning." + + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + + +# auto_docstring +class Flux2KleinBaseAutoCoreDenoiseStep(AutoPipelineBlocks): + """ + Auto core denoise step that performs the denoising process for Flux2-Klein (base model). + This is an auto pipeline block that works for text-to-image and image-conditioned generation. + - `Flux2KleinBaseCoreDenoiseStep` is used for text-to-image generation. + - `Flux2KleinBaseImageConditionedCoreDenoiseStep` is used for image-conditioned generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider + (`ClassifierFreeGuidance`) + + Configs: + is_distilled (default: False) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux2-klein" + block_classes = [Flux2KleinBaseImageConditionedCoreDenoiseStep, Flux2KleinBaseCoreDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Auto core denoise step that performs the denoising process for Flux2-Klein (base model).\n" + "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n" + " - `Flux2KleinBaseCoreDenoiseStep` is used for text-to-image generation.\n" + " - `Flux2KleinBaseImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" + ) + + +### +### Auto blocks +### + + +# auto_docstring +class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): + """ + Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model). 
+ + Supported workflows: + - `text2image`: requires `prompt` + - `image_conditioned`: requires `image`, `prompt` + + Components: + text_encoder (`Qwen3ForCausalLM`) tokenizer (`Qwen2TokenizerFast`) guider (`ClassifierFreeGuidance`) + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Configs: + is_distilled (default: False) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + text_encoder_out_layers (`tuple`, *optional*, defaults to (9, 18, 27)): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + + model_name = "flux2-klein" + block_classes = [ + Flux2KleinBaseTextEncoderStep(), + Flux2KleinBaseAutoVaeEncoderStep(), + Flux2KleinBaseAutoCoreDenoiseStep(), + Flux2DecodeStep(), + ] + block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + _workflow_map = { + "text2image": {"prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, + } + + @property + def description(self): + return "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model)." + + @property + def outputs(self): + return [ + OutputParam.template("images"), + ] diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 0eff85926fc5..76a850b63c4e 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -40,8 +40,11 @@ InputParam, InsertableDict, OutputParam, + combine_inputs, + combine_outputs, format_components, format_configs, + format_workflow, generate_modular_model_card_content, make_doc_string, ) @@ -287,6 +290,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin): config_name = "modular_config.json" model_name = None + _workflow_map = None @classmethod def _get_signature_keys(cls, obj): @@ -342,6 +346,35 @@ def _get_outputs(self): def outputs(self) -> list[OutputParam]: return self._get_outputs() + # currentlyonly ConditionalPipelineBlocks and SequentialPipelineBlocks support `get_execution_blocks` + def get_execution_blocks(self, **kwargs): + """ + Get the block(s) that would execute given the inputs. Must be implemented by subclasses that support + conditional block selection. + + Args: + **kwargs: Input names and values. Only trigger inputs affect block selection. 
+ """ + raise NotImplementedError(f"`get_execution_blocks` is not implemented for {self.__class__.__name__}") + + # currently only SequentialPipelineBlocks support workflows + @property + def available_workflows(self): + """ + Returns a list of available workflow names. Must be implemented by subclasses that define `_workflow_map`. + """ + raise NotImplementedError(f"`available_workflows` is not implemented for {self.__class__.__name__}") + + def get_workflow(self, workflow_name: str): + """ + Get the execution blocks for a specific workflow. Must be implemented by subclasses that define + `_workflow_map`. + + Args: + workflow_name: Name of the workflow to retrieve. + """ + raise NotImplementedError(f"`get_workflow` is not implemented for {self.__class__.__name__}") + @classmethod def from_pretrained( cls, @@ -480,72 +513,6 @@ def set_block_state(self, state: PipelineState, block_state: BlockState): if current_value is not param: # Using identity comparison to check if object was modified state.set(param_name, param, input_param.kwargs_type) - @staticmethod - def combine_inputs(*named_input_lists: list[tuple[str, list[InputParam]]]) -> list[InputParam]: - """ - Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if - current default value is None and new default value is not None. Warns if multiple non-None default values - exist for the same input. - - Args: - named_input_lists: list of tuples containing (block_name, input_param_list) pairs - - Returns: - list[InputParam]: Combined list of unique InputParam objects - """ - combined_dict = {} # name -> InputParam - value_sources = {} # name -> block_name - - for block_name, inputs in named_input_lists: - for input_param in inputs: - if input_param.name is None and input_param.kwargs_type is not None: - input_name = "*_" + input_param.kwargs_type - else: - input_name = input_param.name - if input_name in combined_dict: - current_param = combined_dict[input_name] - if ( - current_param.default is not None - and input_param.default is not None - and current_param.default != input_param.default - ): - warnings.warn( - f"Multiple different default values found for input '{input_name}': " - f"{current_param.default} (from block '{value_sources[input_name]}') and " - f"{input_param.default} (from block '{block_name}'). Using {current_param.default}." - ) - if current_param.default is None and input_param.default is not None: - combined_dict[input_name] = input_param - value_sources[input_name] = block_name - else: - combined_dict[input_name] = input_param - value_sources[input_name] = block_name - - return list(combined_dict.values()) - - @staticmethod - def combine_outputs(*named_output_lists: list[tuple[str, list[OutputParam]]]) -> list[OutputParam]: - """ - Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first - occurrence of each output name. 
- - Args: - named_output_lists: list of tuples containing (block_name, output_param_list) pairs - - Returns: - list[OutputParam]: Combined list of unique OutputParam objects - """ - combined_dict = {} # name -> OutputParam - - for block_name, outputs in named_output_lists: - for output_param in outputs: - if (output_param.name not in combined_dict) or ( - combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None - ): - combined_dict[output_param.name] = output_param - - return list(combined_dict.values()) - @property def input_names(self) -> list[str]: return [input_param.name for input_param in self.inputs if input_param.name is not None] @@ -577,7 +544,8 @@ def doc(self): class ConditionalPipelineBlocks(ModularPipelineBlocks): """ A Pipeline Blocks that conditionally selects a block to run based on the inputs. Subclasses must implement the - `select_block` method to define the logic for selecting the block. + `select_block` method to define the logic for selecting the block. Currently, we only support selection logic based + on the presence or absence of inputs (i.e., whether they are `None` or not) This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the library implements for all the pipeline blocks (such as loading or saving etc.) @@ -585,15 +553,20 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks): > [!WARNING] > This is an experimental feature and is likely to change in the future. Attributes: - block_classes: List of block classes to be used - block_names: List of prefixes for each block - block_trigger_inputs: List of input names that select_block() uses to determine which block to run + block_classes: List of block classes to be used. Must have the same length as `block_names`. + block_names: List of names for each block. Must have the same length as `block_classes`. + block_trigger_inputs: List of input names that `select_block()` uses to determine which block to run. + For `ConditionalPipelineBlocks`, this does not need to correspond to `block_names` and `block_classes`. For + `AutoPipelineBlocks`, this must have the same length as `block_names` and `block_classes`, where each + element specifies the trigger input for the corresponding block. + default_block_name: Name of the default block to run when no trigger inputs match. + If None, this block can be skipped entirely when no trigger inputs are provided. 
""" block_classes = [] block_names = [] block_trigger_inputs = [] - default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided + default_block_name = None def __init__(self): sub_blocks = InsertableDict() @@ -657,7 +630,7 @@ def required_inputs(self) -> list[str]: @property def inputs(self) -> list[tuple[str, Any]]: named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()] - combined_inputs = self.combine_inputs(*named_inputs) + combined_inputs = combine_inputs(*named_inputs) # mark Required inputs only if that input is required by all the blocks for input_param in combined_inputs: if input_param.name in self.required_inputs: @@ -669,15 +642,16 @@ def inputs(self) -> list[tuple[str, Any]]: @property def intermediate_outputs(self) -> list[str]: named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()] - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) return combined_outputs @property def outputs(self) -> list[str]: named_outputs = [(name, block.outputs) for name, block in self.sub_blocks.items()] - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) return combined_outputs + # used for `__repr__` def _get_trigger_inputs(self) -> set: """ Returns a set of all unique trigger input values found in this block and nested blocks. @@ -706,16 +680,16 @@ def fn_recursive_get_trigger(blocks): return all_triggers - @property - def trigger_inputs(self): - """All trigger inputs including from nested blocks.""" - return self._get_trigger_inputs() - def select_block(self, **kwargs) -> str | None: """ Select the block to run based on the trigger inputs. Subclasses must implement this method to define the logic for selecting the block. + Note: When trigger inputs include intermediate outputs from earlier blocks, the selection logic should only + depend on the presence or absence of the input (i.e., whether it is None or not), not on its actual value. This + is because `get_execution_blocks()` resolves conditions statically by propagating intermediate output names + without their runtime values. + Args: **kwargs: Trigger input names and their values from the state. @@ -750,6 +724,39 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: logger.error(error_msg) raise + def get_execution_blocks(self, **kwargs) -> ModularPipelineBlocks | None: + """ + Get the block(s) that would execute given the inputs. + + Recursively resolves nested ConditionalPipelineBlocks until reaching either: + - A leaf block (no sub_blocks or LoopSequentialPipelineBlocks) → returns single `ModularPipelineBlocks` + - A `SequentialPipelineBlocks` → delegates to its `get_execution_blocks()` which returns + a `SequentialPipelineBlocks` containing the resolved execution blocks + + Args: + **kwargs: Input names and values. Only trigger inputs affect block selection. 
+ + Returns: + - `ModularPipelineBlocks`: A leaf block or resolved `SequentialPipelineBlocks` + - `None`: If this block would be skipped (no trigger matched and no default) + """ + trigger_kwargs = {name: kwargs.get(name) for name in self.block_trigger_inputs if name is not None} + block_name = self.select_block(**trigger_kwargs) + + if block_name is None: + block_name = self.default_block_name + + if block_name is None: + return None + + block = self.sub_blocks[block_name] + + # Recursively resolve until we hit a leaf block + if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks): + return block.get_execution_blocks(**kwargs) + + return block + def __repr__(self): class_name = self.__class__.__name__ base_class = self.__class__.__bases__[0].__name__ @@ -757,11 +764,11 @@ def __repr__(self): f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n" ) - if self.trigger_inputs: + if self._get_trigger_inputs(): header += "\n" header += " " + "=" * 100 + "\n" header += " This pipeline contains blocks that are selected at runtime based on inputs.\n" - header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n" + header += f" Trigger Inputs: {sorted(self._get_trigger_inputs())}\n" header += " " + "=" * 100 + "\n\n" # Format description with proper indentation @@ -828,24 +835,56 @@ def doc(self): class AutoPipelineBlocks(ConditionalPipelineBlocks): """ - A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs. + A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs. + + This is a specialized version of `ConditionalPipelineBlocks` where: + - Each block has one corresponding trigger input (1:1 mapping) + - Block selection is automatic: the first block whose trigger input is present gets selected + - `block_trigger_inputs` must have the same length as `block_names` and `block_classes` + - Use `None` in `block_trigger_inputs` to specify the default block, i.e the block that will run if no trigger + inputs are present + + Attributes: + block_classes: + List of block classes to be used. Must have the same length as `block_names` and + `block_trigger_inputs`. + block_names: + List of names for each block. Must have the same length as `block_classes` and `block_trigger_inputs`. + block_trigger_inputs: + List of input names where each element specifies the trigger input for the corresponding block. Use + `None` to mark the default block. + + Example: + ```python + class MyAutoBlock(AutoPipelineBlocks): + block_classes = [InpaintEncoderBlock, ImageEncoderBlock, TextEncoderBlock] + block_names = ["inpaint", "img2img", "text2img"] + block_trigger_inputs = ["mask_image", "image", None] # text2img is the default + ``` + + With this definition: + - As long as `mask_image` is provided, "inpaint" block runs (regardless of `image` being provided or not) + - If `mask_image` is not provided but `image` is provided, "img2img" block runs + - Otherwise, "text2img" block runs (default, trigger is `None`) """ def __init__(self): super().__init__() + if self.default_block_name is not None: + raise ValueError( + f"In {self.__class__.__name__}, do not set `default_block_name` for AutoPipelineBlocks. " + f"Use `None` in `block_trigger_inputs` to specify the default block." 
+ ) + if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)): raise ValueError( f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same." ) - @property - def default_block_name(self) -> str | None: - """Derive default_block_name from block_trigger_inputs (None entry).""" if None in self.block_trigger_inputs: idx = self.block_trigger_inputs.index(None) - return self.block_names[idx] - return None + self.default_block_name = self.block_names[idx] def select_block(self, **kwargs) -> str | None: """Select block based on which trigger input is present (not None).""" @@ -899,6 +938,29 @@ def expected_configs(self): expected_configs.append(config) return expected_configs + @property + def available_workflows(self): + if self._workflow_map is None: + raise NotImplementedError( + f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}" + ) + + return list(self._workflow_map.keys()) + + def get_workflow(self, workflow_name: str): + if self._workflow_map is None: + raise NotImplementedError( + f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}" + ) + + if workflow_name not in self._workflow_map: + raise ValueError(f"Workflow {workflow_name} not found in {self.__class__.__name__}") + + trigger_inputs = self._workflow_map[workflow_name] + workflow_blocks = self.get_execution_blocks(**trigger_inputs) + + return workflow_blocks + @classmethod def from_blocks_dict( cls, blocks_dict: dict[str, Any], description: str | None = None @@ -994,7 +1056,7 @@ def intermediate_outputs(self) -> list[str]: # filter out them here so they do not end up as intermediate_outputs if name not in inp_names: named_outputs.append((name, block.intermediate_outputs)) - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) return combined_outputs # YiYi TODO: I think we can remove the outputs property @@ -1018,6 +1080,7 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: raise return pipeline, state + # used for `__repr__` def _get_trigger_inputs(self): """ Returns a set of all unique trigger input values found in the blocks. @@ -1041,89 +1104,56 @@ def fn_recursive_get_trigger(blocks): return fn_recursive_get_trigger(self.sub_blocks) - @property - def trigger_inputs(self): - return self._get_trigger_inputs() - - def _traverse_trigger_blocks(self, active_inputs): + def get_execution_blocks(self, **kwargs) -> "SequentialPipelineBlocks": """ - Traverse blocks and select which ones would run given the active inputs. + Get the blocks that would execute given the specified inputs. + + As the traversal walks through sequential blocks, intermediate outputs from resolved blocks are added to the + active inputs. This means conditional blocks that depend on intermediates (e.g., "run img2img if image_latents + is present") will resolve correctly, as long as the condition is based on presence/absence (None or not None), + not on the actual value. + Args: - active_inputs: Dict of input names to values that are "present" + **kwargs: Input names and values. Only trigger inputs affect block selection. 
Returns: - OrderedDict of block_name -> block that would execute + SequentialPipelineBlocks containing only the blocks that would execute """ + # Copy kwargs so we can add outputs as we traverse + active_inputs = dict(kwargs) def fn_recursive_traverse(block, block_name, active_inputs): result_blocks = OrderedDict() # ConditionalPipelineBlocks (includes AutoPipelineBlocks) if isinstance(block, ConditionalPipelineBlocks): - trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs} - selected_block_name = block.select_block(**trigger_kwargs) - - if selected_block_name is None: - selected_block_name = block.default_block_name - - if selected_block_name is None: + block = block.get_execution_blocks(**active_inputs) + if block is None: return result_blocks - selected_block = block.sub_blocks[selected_block_name] - - if selected_block.sub_blocks: - result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs)) - else: - result_blocks[block_name] = selected_block - if hasattr(selected_block, "outputs"): - for out in selected_block.outputs: - active_inputs[out.name] = True - - return result_blocks - - # SequentialPipelineBlocks or LoopSequentialPipelineBlocks - if block.sub_blocks: + # Has sub_blocks (SequentialPipelineBlocks/ConditionalPipelineBlocks) + if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks): for sub_block_name, sub_block in block.sub_blocks.items(): - blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs) - blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()} - result_blocks.update(blocks_to_update) + nested_blocks = fn_recursive_traverse(sub_block, sub_block_name, active_inputs) + nested_blocks = {f"{block_name}.{k}": v for k, v in nested_blocks.items()} + result_blocks.update(nested_blocks) else: + # Leaf block: single ModularPipelineBlocks or LoopSequentialPipelineBlocks result_blocks[block_name] = block - if hasattr(block, "outputs"): - for out in block.outputs: + # Add outputs to active_inputs so subsequent blocks can use them as triggers + if hasattr(block, "intermediate_outputs"): + for out in block.intermediate_outputs: active_inputs[out.name] = True return result_blocks all_blocks = OrderedDict() for block_name, block in self.sub_blocks.items(): - blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs) - all_blocks.update(blocks_to_update) - return all_blocks + nested_blocks = fn_recursive_traverse(block, block_name, active_inputs) + all_blocks.update(nested_blocks) - def get_execution_blocks(self, **kwargs): - """ - Get the blocks that would execute given the specified inputs. - - Args: - **kwargs: Input names and values. Only trigger inputs affect block selection. - Pass any inputs that would be non-None at runtime. 
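A rough usage sketch of the resolver described above. `QwenImageAutoBlocks` and its import path are assumptions used purely for illustration; since selection only checks the presence or absence of trigger inputs, placeholder values are enough.

```py
from diffusers.modular_pipelines import QwenImageAutoBlocks  # assumed import path

blocks = QwenImageAutoBlocks()

# Resolves to the text-to-image sub-blocks only.
text2img = blocks.get_execution_blocks(prompt="a cat")

# Resolves to the inpainting sub-blocks; placeholder values are fine because
# selection only checks that the trigger inputs are not None.
inpaint = blocks.get_execution_blocks(prompt="a cat", image=..., mask_image=...)

print(list(inpaint.sub_blocks.keys()))
```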
- - Returns: - SequentialPipelineBlocks containing only the blocks that would execute - - Example: - # Get blocks for inpainting workflow blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, - image=image) - - # Get blocks for text2image workflow blocks = pipeline.get_execution_blocks(prompt="a cat") - """ - # Filter out None values - active_inputs = {k: v for k, v in kwargs.items() if v is not None} - - blocks_triggered = self._traverse_trigger_blocks(active_inputs) - return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered) + return SequentialPipelineBlocks.from_blocks_dict(all_blocks) def __repr__(self): class_name = self.__class__.__name__ @@ -1132,18 +1162,23 @@ def __repr__(self): f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n" ) - if self.trigger_inputs: + if self._workflow_map is None and self._get_trigger_inputs(): header += "\n" header += " " + "=" * 100 + "\n" header += " This pipeline contains blocks that are selected at runtime based on inputs.\n" - header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n" + header += f" Trigger Inputs: {[inp for inp in self._get_trigger_inputs() if inp is not None]}\n" # Get first trigger input as example - example_input = next(t for t in self.trigger_inputs if t is not None) + example_input = next(t for t in self._get_trigger_inputs() if t is not None) header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n" header += " " + "=" * 100 + "\n\n" + description = self.description + if self._workflow_map is not None: + workflow_str = format_workflow(self._workflow_map) + description = f"{self.description}\n\n{workflow_str}" + # Format description with proper indentation - desc_lines = self.description.split("\n") + desc_lines = description.split("\n") desc = [] # First line with "Description:" label desc.append(f" Description: {desc_lines[0]}") @@ -1191,10 +1226,15 @@ def __repr__(self): @property def doc(self): + description = self.description + if self._workflow_map is not None: + workflow_str = format_workflow(self._workflow_map) + description = f"{self.description}\n\n{workflow_str}" + return make_doc_string( self.inputs, self.outputs, - self.description, + description=description, class_name=self.__class__.__name__, expected_components=self.expected_components, expected_configs=self.expected_configs, @@ -1327,7 +1367,7 @@ def required_inputs(self) -> list[str]: @property def intermediate_outputs(self) -> list[str]: named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()] - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) for output in self.loop_intermediate_outputs: if output.name not in {output.name for output in combined_outputs}: combined_outputs.append(output) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index aa378f715974..cab17c2aed5c 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -14,6 +14,7 @@ import inspect import re +import warnings from collections import OrderedDict from dataclasses import dataclass, field from types import UnionType @@ -503,6 +504,10 @@ class ConfigSpec: "type_hint": list[PIL.Image.Image], "description": "Generated images.", }, + "videos": { + "type_hint": 
list[PIL.Image.Image], + "description": "The generated videos.", + }, "latents": { "type_hint": torch.Tensor, "description": "Denoised latents.", @@ -887,6 +892,30 @@ def format_configs(configs, indent_level=4, max_line_length=115, add_empty_lines return "\n".join(formatted_configs) +def format_workflow(workflow_map): + """Format a workflow map into a readable string representation. + + Args: + workflow_map: Dictionary mapping workflow names to trigger inputs + + Returns: + A formatted string representing all workflows + """ + if workflow_map is None: + return "" + + lines = ["Supported workflows:"] + for workflow_name, trigger_inputs in workflow_map.items(): + required_inputs = [k for k, v in trigger_inputs.items() if v] + if required_inputs: + inputs_str = ", ".join(f"`{t}`" for t in required_inputs) + lines.append(f" - `{workflow_name}`: requires {inputs_str}") + else: + lines.append(f" - `{workflow_name}`: default (no additional inputs required)") + + return "\n".join(lines) + + def make_doc_string( inputs, outputs, @@ -943,6 +972,72 @@ def make_doc_string( return output +def combine_inputs(*named_input_lists: list[tuple[str, list[InputParam]]]) -> list[InputParam]: + """ + Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if current + default value is None and new default value is not None. Warns if multiple non-None default values exist for the + same input. + + Args: + named_input_lists: List of tuples containing (block_name, input_param_list) pairs + + Returns: + List[InputParam]: Combined list of unique InputParam objects + """ + combined_dict = {} # name -> InputParam + value_sources = {} # name -> block_name + + for block_name, inputs in named_input_lists: + for input_param in inputs: + if input_param.name is None and input_param.kwargs_type is not None: + input_name = "*_" + input_param.kwargs_type + else: + input_name = input_param.name + if input_name in combined_dict: + current_param = combined_dict[input_name] + if ( + current_param.default is not None + and input_param.default is not None + and current_param.default != input_param.default + ): + warnings.warn( + f"Multiple different default values found for input '{input_name}': " + f"{current_param.default} (from block '{value_sources[input_name]}') and " + f"{input_param.default} (from block '{block_name}'). Using {current_param.default}." + ) + if current_param.default is None and input_param.default is not None: + combined_dict[input_name] = input_param + value_sources[input_name] = block_name + else: + combined_dict[input_name] = input_param + value_sources[input_name] = block_name + + return list(combined_dict.values()) + + +def combine_outputs(*named_output_lists: list[tuple[str, list[OutputParam]]]) -> list[OutputParam]: + """ + Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first + occurrence of each output name. 
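A small, hedged sketch of how `combine_inputs` merges duplicate inputs across blocks, assuming `InputParam` accepts `name` and `default` as shown (the exact constructor signature may differ).

```py
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam, combine_inputs

block_a = [InputParam("num_inference_steps", default=50)]
block_b = [InputParam("num_inference_steps", default=30), InputParam("strength", default=0.9)]

# Conflicting non-None defaults trigger a UserWarning; the first default (50) is kept.
combined = combine_inputs(("block_a", block_a), ("block_b", block_b))
print([(p.name, p.default) for p in combined])
# [('num_inference_steps', 50), ('strength', 0.9)]
```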
+ + Args: + named_output_lists: List of tuples containing (block_name, output_param_list) pairs + + Returns: + List[OutputParam]: Combined list of unique OutputParam objects + """ + combined_dict = {} # name -> OutputParam + + for block_name, outputs in named_output_lists: + for output_param in outputs: + if (output_param.name not in combined_dict) or ( + combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None + ): + combined_dict[output_param.name] = output_param + + return list(combined_dict.values()) + + def generate_modular_model_card_content(blocks) -> dict[str, Any]: """ Generate model card content for a modular pipeline. diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py index 2b01a5b5a4b5..2e6af4495b37 100644 --- a/src/diffusers/modular_pipelines/qwenimage/__init__.py +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -21,27 +21,15 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["modular_blocks_qwenimage"] = [ - "AUTO_BLOCKS", - "QwenImageAutoBlocks", - ] - _import_structure["modular_blocks_qwenimage_edit"] = [ - "EDIT_AUTO_BLOCKS", - "QwenImageEditAutoBlocks", - ] - _import_structure["modular_blocks_qwenimage_edit_plus"] = [ - "EDIT_PLUS_AUTO_BLOCKS", - "QwenImageEditPlusAutoBlocks", - ] - _import_structure["modular_blocks_qwenimage_layered"] = [ - "LAYERED_AUTO_BLOCKS", - "QwenImageLayeredAutoBlocks", - ] + _import_structure["modular_blocks_qwenimage"] = ["QwenImageAutoBlocks"] + _import_structure["modular_blocks_qwenimage_edit"] = ["QwenImageEditAutoBlocks"] + _import_structure["modular_blocks_qwenimage_edit_plus"] = ["QwenImageEditPlusAutoBlocks"] + _import_structure["modular_blocks_qwenimage_layered"] = ["QwenImageLayeredAutoBlocks"] _import_structure["modular_pipeline"] = [ "QwenImageEditModularPipeline", "QwenImageEditPlusModularPipeline", - "QwenImageModularPipeline", "QwenImageLayeredModularPipeline", + "QwenImageModularPipeline", ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -51,22 +39,10 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .modular_blocks_qwenimage import ( - AUTO_BLOCKS, - QwenImageAutoBlocks, - ) - from .modular_blocks_qwenimage_edit import ( - EDIT_AUTO_BLOCKS, - QwenImageEditAutoBlocks, - ) - from .modular_blocks_qwenimage_edit_plus import ( - EDIT_PLUS_AUTO_BLOCKS, - QwenImageEditPlusAutoBlocks, - ) - from .modular_blocks_qwenimage_layered import ( - LAYERED_AUTO_BLOCKS, - QwenImageLayeredAutoBlocks, - ) + from .modular_blocks_qwenimage import QwenImageAutoBlocks + from .modular_blocks_qwenimage_edit import QwenImageEditAutoBlocks + from .modular_blocks_qwenimage_edit_plus import QwenImageEditPlusAutoBlocks + from .modular_blocks_qwenimage_layered import QwenImageLayeredAutoBlocks from .modular_pipeline import ( QwenImageEditModularPipeline, QwenImageEditPlusModularPipeline, diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index c4e14566a795..51b5c6ac8c3d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -558,7 +558,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): Inputs: num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. 
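For reference, a minimal sketch of the string that the `format_workflow` helper added in `modular_pipeline_utils.py` above produces; the workflow map below is illustrative only.

```py
from diffusers.modular_pipelines.modular_pipeline_utils import format_workflow

workflow_map = {
    "text2image": {"prompt": True},
    "image2image": {"prompt": True, "image": True},
}
print(format_workflow(workflow_map))
# Supported workflows:
#   - `text2image`: requires `prompt`
#   - `image2image`: requires `prompt`, `image`
```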
- sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. latents (`Tensor`): The initial random noised latents for the denoising process. Can be generated in prepare latents step. @@ -644,7 +644,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): Inputs: num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. @@ -725,7 +725,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): Inputs: num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. latents (`Tensor`): The latents to use for the denoising process. Can be generated in prepare latents step. @@ -842,7 +842,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the images latents, used for RoPE calculation """ @@ -917,7 +917,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the images latents, used for RoPE calculation """ @@ -995,9 +995,9 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): batch_size (`int`, *optional*, defaults to 1): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step. - image_height (`List`): + image_height (`list`): The heights of the reference images. Can be generated in input step. - image_width (`List`): + image_width (`list`): The widths of the reference images. Can be generated in input step. height (`int`): The height in pixels of the generated image. @@ -1009,11 +1009,11 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the image latents, used for RoPE calculation - txt_seq_lens (`List`): + txt_seq_lens (`list`): The sequence lengths of the prompt embeds, used for RoPE calculation - negative_txt_seq_lens (`List`): + negative_txt_seq_lens (`list`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ @@ -1123,11 +1123,11 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the image latents, used for RoPE calculation - txt_seq_lens (`List`): + txt_seq_lens (`list`): The sequence lengths of the prompt embeds, used for RoPE calculation - negative_txt_seq_lens (`List`): + negative_txt_seq_lens (`list`): The sequence lengths of the negative prompt embeds, used for RoPE calculation additional_t_cond (`Tensor`): The additional t cond, used for RoPE calculation @@ -1238,7 +1238,7 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
Outputs: - controlnet_keep (`List`): + controlnet_keep (`list`): The controlnet keep values """ diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 49183eed9cda..e4ccb6b8e047 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -191,7 +191,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks): step. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -268,7 +268,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ @@ -366,7 +366,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ @@ -436,12 +436,12 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): the generated image tensor from decoders step output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 6724612361aa..de8ea05c5047 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -518,11 +518,11 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): The number of denoising steps. latents (`Tensor`): The initial latents to use for the denoising process. Can be generated in prepare_latent step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. Outputs: @@ -576,11 +576,11 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): The number of denoising steps. latents (`Tensor`): The initial latents to use for the denoising process. Can be generated in prepare_latent step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. mask (`Tensor`): The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. @@ -645,13 +645,13 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) - controlnet_keep (`List`): + controlnet_keep (`list`): The controlnet keep values. 
Can be generated in prepare_controlnet_inputs step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. Outputs: @@ -711,13 +711,13 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) - controlnet_keep (`List`): + controlnet_keep (`list`): The controlnet keep values. Can be generated in prepare_controlnet_inputs step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. mask (`Tensor`): The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. @@ -787,11 +787,11 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): The initial latents to use for the denoising process. Can be generated in prepare_latent step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. Outputs: @@ -846,11 +846,11 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): The initial latents to use for the denoising process. Can be generated in prepare_latent step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. mask (`Tensor`): The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. @@ -910,11 +910,11 @@ class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): The initial latents to use for the denoising process. Can be generated in prepare_latent step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. Outputs: diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 6abcf7ce215a..527267dc0d6e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -285,11 +285,11 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): image_resize_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images """ @@ -359,13 +359,13 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): image_resize_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 Outputs: - resized_image (`List`): + resized_image (`list`): The resized images """ @@ -452,13 +452,13 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): image_resize_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. Outputs: - resized_image (`List`): + resized_image (`list`): Images resized to 1024x1024 target area for VAE encoding - resized_cond_image (`List`): + resized_cond_image (`list`): Images resized to 384x384 target area for VL text encoding """ @@ -1058,7 +1058,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): Inputs: mask_image (`Image`): Mask image for inpainting. - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -1072,7 +1072,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay """ @@ -1177,7 +1177,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay """ @@ -1256,7 +1256,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): image_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -1340,7 +1340,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): image_processor (`VaeImageProcessor`) Inputs: - resized_image (`List`): + resized_image (`list`): The resized image. should be generated using a resize step Outputs: @@ -1412,7 +1412,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): image_processor (`VaeImageProcessor`) Inputs: - resized_image (`List`): + resized_image (`list`): The resized image. 
should be generated using a resize step Outputs: diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index ebe53940a4e5..faec7db245df 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -496,9 +496,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: - image_height (`List`): + image_height (`list`): The image heights calculated from the image latents dimension - image_width (`List`): + image_width (`list`): The image widths calculated from the image latents dimension height (`int`): if not provided, updated to image height diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 7503e0c7684b..bf87028b2f90 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -119,7 +119,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Inputs: mask_image (`Image`): Mask image for inpainting. - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -135,7 +135,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): The latent representation of the input image. @@ -164,7 +164,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -476,9 +476,9 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -553,11 +553,11 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
@@ -632,11 +632,11 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -712,7 +712,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. @@ -720,7 +720,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -802,7 +802,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. @@ -812,7 +812,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -894,7 +894,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. @@ -904,7 +904,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -1032,7 +1032,7 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. 
Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -1061,12 +1061,12 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -1113,10 +1113,14 @@ def description(self): class QwenImageAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`. - - to run the controlnet workflow, you need to provide `control_image` - - for text-to-image generation, all you need to provide is `prompt` + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `prompt`, `image` + - `inpainting`: requires `prompt`, `mask_image`, `image` + - `controlnet_text2image`: requires `prompt`, `control_image` + - `controlnet_image2image`: requires `prompt`, `image`, `control_image` + - `controlnet_inpainting`: requires `prompt`, `mask_image`, `image`, `control_image` Components: text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): @@ -1134,7 +1138,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Maximum sequence length for prompt encoding. mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Union[Image, List]`, *optional*): + image (`Image | list`, *optional*): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -1160,9 +1164,9 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Pre-generated noisy latents for image generation. num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -1183,12 +1187,12 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Scale for ControlNet conditioning. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. """ @@ -1197,15 +1201,23 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): block_classes = AUTO_BLOCKS.values() block_names = AUTO_BLOCKS.keys() + # Workflow map defines the trigger conditions for each workflow. 
+ # How to define: + # - Only include required inputs and trigger inputs (inputs that determine which blocks run) + # - currently, only supports `True` means the workflow triggers when the input is not None + + _workflow_map = { + "text2image": {"prompt": True}, + "image2image": {"prompt": True, "image": True}, + "inpainting": {"prompt": True, "mask_image": True, "image": True}, + "controlnet_text2image": {"prompt": True, "control_image": True}, + "controlnet_image2image": {"prompt": True, "image": True, "control_image": True}, + "controlnet_inpainting": {"prompt": True, "mask_image": True, "image": True, "control_image": True}, + } + @property def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" - + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n" - + "- to run the controlnet workflow, you need to provide `control_image`\n" - + "- for text-to-image generation, all you need to provide is `prompt`" - ) + return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage." @property def outputs(self): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 21a7044c9f6e..37b80b69ec7e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -67,7 +67,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -75,7 +75,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images prompt_embeds (`Tensor`): The prompt embeddings. @@ -115,13 +115,13 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images processed_image (`Tensor`): The processed image @@ -156,7 +156,7 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. mask_image (`Image`): Mask image for inpainting. @@ -166,13 +166,13 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images processed_image (`Tensor`): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): The latent representation of the input image. 
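A hedged usage sketch of the `_workflow_map` helpers defined above (`available_workflows` and `get_workflow`); the import path is an assumption based on this refactor.

```py
from diffusers.modular_pipelines import QwenImageAutoBlocks  # assumed import path

blocks = QwenImageAutoBlocks()
print(blocks.available_workflows)
# ['text2image', 'image2image', 'inpainting', 'controlnet_text2image',
#  'controlnet_image2image', 'controlnet_inpainting']

# get_workflow() feeds the workflow's trigger inputs into get_execution_blocks()
# and returns the SequentialPipelineBlocks that the workflow would run.
inpainting_blocks = blocks.get_workflow("inpainting")
print(inpainting_blocks)
```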
@@ -450,9 +450,9 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -526,11 +526,11 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -627,7 +627,7 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -656,12 +656,12 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -718,6 +718,11 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + + Supported workflows: + - `image_conditioned`: requires `prompt`, `image` + - `image_conditioned_inpainting`: requires `prompt`, `mask_image`, `image` + Components: image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae @@ -725,7 +730,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -751,28 +756,32 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Pre-generated noisy latents for image generation. num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. """ model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() + _workflow_map = { + "image_conditioned": {"prompt": True, "image": True}, + "image_conditioned_inpainting": {"prompt": True, "mask_image": True, "image": True}, + } @property def description(self): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 56652c94c4b0..4a1f418d7b45 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -58,7 +58,7 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -66,9 +66,9 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): + resized_image (`list`): Images resized to 1024x1024 target area for VAE encoding - resized_cond_image (`List`): + resized_cond_image (`list`): Images resized to 384x384 target area for VL text encoding prompt_embeds (`Tensor`): The prompt embeddings. @@ -108,15 +108,15 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): Images resized to 1024x1024 target area for VAE encoding - resized_cond_image (`List`): + resized_cond_image (`list`): Images resized to 384x384 target area for VL text encoding processed_image (`Tensor`): The processed image @@ -189,9 +189,9 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): The negative prompt embeddings. (batch-expanded) negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. (batch-expanded) - image_height (`List`): + image_height (`list`): The image heights calculated from the image latents dimension - image_width (`List`): + image_width (`list`): The image widths calculated from the image latents dimension height (`int`): if not provided, updated to image height @@ -253,9 +253,9 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
@@ -315,7 +315,7 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -357,7 +357,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): transformer (`QwenImageTransformer2DModel`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -375,9 +375,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Pre-generated noisy latents for image generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -385,7 +385,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 43cefa5eb658..a10454f1fb0c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -60,7 +60,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 @@ -74,7 +74,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Maximum sequence length for prompt encoding. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images prompt (`str`): The prompt or prompts to guide image generation. If not provided, updated using image caption @@ -117,7 +117,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 @@ -125,7 +125,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images processed_image (`Tensor`): The processed image @@ -250,9 +250,9 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. 
**denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -317,7 +317,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 @@ -339,9 +339,9 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Number of layers to extract from the image num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -349,7 +349,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py index 59ec46dc6d36..44f1c555cef3 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py @@ -21,21 +21,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = ["StableDiffusionXLTextEncoderStep"] - _import_structure["modular_blocks"] = [ - "ALL_BLOCKS", - "AUTO_BLOCKS", - "CONTROLNET_BLOCKS", - "IMAGE2IMAGE_BLOCKS", - "INPAINT_BLOCKS", - "IP_ADAPTER_BLOCKS", - "TEXT2IMAGE_BLOCKS", - "StableDiffusionXLAutoBlocks", - "StableDiffusionXLAutoControlnetStep", - "StableDiffusionXLAutoDecodeStep", - "StableDiffusionXLAutoIPAdapterStep", - "StableDiffusionXLAutoVaeEncoderStep", - ] + _import_structure["modular_blocks_stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks"] _import_structure["modular_pipeline"] = ["StableDiffusionXLModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -45,23 +31,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .encoders import ( - StableDiffusionXLTextEncoderStep, - ) - from .modular_blocks import ( - ALL_BLOCKS, - AUTO_BLOCKS, - CONTROLNET_BLOCKS, - IMAGE2IMAGE_BLOCKS, - INPAINT_BLOCKS, - IP_ADAPTER_BLOCKS, - TEXT2IMAGE_BLOCKS, - StableDiffusionXLAutoBlocks, - StableDiffusionXLAutoControlnetStep, - StableDiffusionXLAutoDecodeStep, - StableDiffusionXLAutoIPAdapterStep, - StableDiffusionXLAutoVaeEncoderStep, - ) + from .modular_blocks_stable_diffusion_xl import StableDiffusionXLAutoBlocks from .modular_pipeline import StableDiffusionXLModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py similarity index 55% rename from src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py rename to src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py index 68b5e33755b5..a7a18e514777 100644 --- 
a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py @@ -14,7 +14,7 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( StableDiffusionXLControlNetInputStep, StableDiffusionXLControlNetUnionInputStep, @@ -277,7 +277,161 @@ def description(self): # ip-adapter, controlnet, text2img, img2img, inpainting +# auto_docstring class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion + XL. + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `image`, `prompt` + - `inpainting`: requires `mask_image`, `image`, `prompt` + - `controlnet_text2image`: requires `control_image`, `prompt` + - `controlnet_image2image`: requires `control_image`, `image`, `prompt` + - `controlnet_inpainting`: requires `control_image`, `mask_image`, `image`, `prompt` + - `controlnet_union_text2image`: requires `control_image`, `control_mode`, `prompt` + - `controlnet_union_image2image`: requires `control_image`, `control_mode`, `image`, `prompt` + - `controlnet_union_inpainting`: requires `control_image`, `control_mode`, `mask_image`, `image`, `prompt` + - `ip_adapter_text2image`: requires `ip_adapter_image`, `prompt` + - `ip_adapter_image2image`: requires `ip_adapter_image`, `image`, `prompt` + - `ip_adapter_inpainting`: requires `ip_adapter_image`, `mask_image`, `image`, `prompt` + - `ip_adapter_controlnet_text2image`: requires `ip_adapter_image`, `control_image`, `prompt` + - `ip_adapter_controlnet_image2image`: requires `ip_adapter_image`, `control_image`, `image`, `prompt` + - `ip_adapter_controlnet_inpainting`: requires `ip_adapter_image`, `control_image`, `mask_image`, `image`, + `prompt` + - `ip_adapter_controlnet_union_text2image`: requires `ip_adapter_image`, `control_image`, `control_mode`, + `prompt` + - `ip_adapter_controlnet_union_image2image`: requires `ip_adapter_image`, `control_image`, `control_mode`, + `image`, `prompt` + - `ip_adapter_controlnet_union_inpainting`: requires `ip_adapter_image`, `control_image`, `control_mode`, + `mask_image`, `image`, `prompt` + + Components: + text_encoder (`CLIPTextModel`) text_encoder_2 (`CLIPTextModelWithProjection`) tokenizer (`CLIPTokenizer`) + tokenizer_2 (`CLIPTokenizer`) guider (`ClassifierFreeGuidance`) image_encoder + (`CLIPVisionModelWithProjection`) feature_extractor (`CLIPImageProcessor`) unet (`UNet2DConditionModel`) vae + (`AutoencoderKL`) image_processor (`VaeImageProcessor`) mask_processor (`VaeImageProcessor`) scheduler + (`EulerDiscreteScheduler`) controlnet (`ControlNetUnionModel`) control_image_processor (`VaeImageProcessor`) + + Configs: + force_zeros_for_empty_prompt (default: True) requires_aesthetics_score (default: False) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + prompt_2 (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + negative_prompt_2 (`None`, *optional*): + TODO: Add description. + cross_attention_kwargs (`None`, *optional*): + TODO: Add description. + clip_skip (`None`, *optional*): + TODO: Add description. 
+ ip_adapter_image (`Image | ndarray | Tensor | list | list | list`, *optional*): + The image(s) to be used as ip adapter + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + mask_image (`None`, *optional*): + TODO: Add description. + padding_mask_crop (`None`, *optional*): + TODO: Add description. + dtype (`dtype`, *optional*): + The dtype of the model inputs + generator (`None`, *optional*): + TODO: Add description. + preprocess_kwargs (`dict | NoneType`, *optional*): + A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under + `self.image_processor` in [diffusers.image_processor.VaeImageProcessor] + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + ip_adapter_embeds (`list`, *optional*): + Pre-generated image embeddings for IP-Adapter. Can be generated from ip_adapter step. + negative_ip_adapter_embeds (`list`, *optional*): + Pre-generated negative image embeddings for IP-Adapter. Can be generated from ip_adapter step. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + denoising_end (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.3): + TODO: Add description. + denoising_start (`None`, *optional*): + TODO: Add description. + latents (`None`): + TODO: Add description. + image_latents (`Tensor`, *optional*): + The latents representing the reference image for image-to-image/inpainting generation. Can be generated + in vae_encode step. + mask (`Tensor`, *optional*): + The mask for the inpainting generation. Can be generated in vae_encode step. + masked_image_latents (`Tensor`, *optional*): + The masked image latents for the inpainting generation (only for inpainting-specific unet). Can be + generated in vae_encode step. + original_size (`None`, *optional*): + TODO: Add description. + target_size (`None`, *optional*): + TODO: Add description. + negative_original_size (`None`, *optional*): + TODO: Add description. + negative_target_size (`None`, *optional*): + TODO: Add description. + crops_coords_top_left (`None`, *optional*, defaults to (0, 0)): + TODO: Add description. + negative_crops_coords_top_left (`None`, *optional*, defaults to (0, 0)): + TODO: Add description. + aesthetic_score (`None`, *optional*, defaults to 6.0): + TODO: Add description. + negative_aesthetic_score (`None`, *optional*, defaults to 2.0): + TODO: Add description. + control_image (`None`, *optional*): + TODO: Add description. + control_mode (`None`, *optional*): + TODO: Add description. + control_guidance_start (`None`, *optional*, defaults to 0.0): + TODO: Add description. + control_guidance_end (`None`, *optional*, defaults to 1.0): + TODO: Add description. + controlnet_conditioning_scale (`None`, *optional*, defaults to 1.0): + TODO: Add description. + guess_mode (`None`, *optional*, defaults to False): + TODO: Add description. + crops_coords (`tuple | NoneType`, *optional*): + The crop coordinates to use for preprocess/postprocess the image and mask, for inpainting task only. Can + be generated in vae_encode step. + controlnet_cond (`Tensor`, *optional*): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. 
+ conditioning_scale (`float`, *optional*): + The controlnet conditioning scale value to use for the denoising process. Can be generated in + prepare_controlnet_inputs step. + controlnet_keep (`list`, *optional*): + The controlnet keep values to use for the denoising process. Can be generated in + prepare_controlnet_inputs step. + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs that need to be prepared with guider. It should contain + prompt_embeds/negative_prompt_embeds, add_time_ids/negative_add_time_ids, + pooled_prompt_embeds/negative_pooled_prompt_embeds, and ip_adapter_embeds/negative_ip_adapter_embeds + (optional).please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when + they are created and added to the pipeline state + eta (`None`, *optional*, defaults to 0.0): + TODO: Add description. + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + block_classes = [ StableDiffusionXLTextEncoderStep, StableDiffusionXLAutoIPAdapterStep, @@ -293,103 +447,66 @@ class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): "decode", ] - @property - def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL.\n" - + "- for image-to-image generation, you need to provide either `image` or `image_latents`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" - + "- to run the controlnet workflow, you need to provide `control_image`\n" - + "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n" - + "- to run the ip_adapter workflow, you need to provide `ip_adapter_image`\n" - + "- for text-to-image generation, all you need to provide is `prompt`" - ) - - -# controlnet (input + denoise step) -class StableDiffusionXLAutoControlnetStep(SequentialPipelineBlocks): - block_classes = [ - StableDiffusionXLAutoControlNetInputStep, - StableDiffusionXLAutoControlNetDenoiseStep, - ] - block_names = ["controlnet_input", "controlnet_denoise"] + _workflow_map = { + "text2image": {"prompt": True}, + "image2image": {"image": True, "prompt": True}, + "inpainting": {"mask_image": True, "image": True, "prompt": True}, + "controlnet_text2image": {"control_image": True, "prompt": True}, + "controlnet_image2image": {"control_image": True, "image": True, "prompt": True}, + "controlnet_inpainting": {"control_image": True, "mask_image": True, "image": True, "prompt": True}, + "controlnet_union_text2image": {"control_image": True, "control_mode": True, "prompt": True}, + "controlnet_union_image2image": {"control_image": True, "control_mode": True, "image": True, "prompt": True}, + "controlnet_union_inpainting": { + "control_image": True, + "control_mode": True, + "mask_image": True, + "image": True, + "prompt": True, + }, + "ip_adapter_text2image": {"ip_adapter_image": True, "prompt": True}, + "ip_adapter_image2image": {"ip_adapter_image": True, "image": True, "prompt": True}, + "ip_adapter_inpainting": {"ip_adapter_image": True, "mask_image": True, "image": True, "prompt": True}, + "ip_adapter_controlnet_text2image": {"ip_adapter_image": True, "control_image": True, "prompt": True}, + "ip_adapter_controlnet_image2image": { + "ip_adapter_image": True, + "control_image": True, + "image": True, + "prompt": True, + }, + "ip_adapter_controlnet_inpainting": { + "ip_adapter_image": 
True, + "control_image": True, + "mask_image": True, + "image": True, + "prompt": True, + }, + "ip_adapter_controlnet_union_text2image": { + "ip_adapter_image": True, + "control_image": True, + "control_mode": True, + "prompt": True, + }, + "ip_adapter_controlnet_union_image2image": { + "ip_adapter_image": True, + "control_image": True, + "control_mode": True, + "image": True, + "prompt": True, + }, + "ip_adapter_controlnet_union_inpainting": { + "ip_adapter_image": True, + "control_image": True, + "control_mode": True, + "mask_image": True, + "image": True, + "prompt": True, + }, + } @property def description(self): - return ( - "Controlnet auto step that prepare the controlnet input and denoise the latents. " - + "It works for both controlnet and controlnet_union and supports text2img, img2img and inpainting tasks." - + " (it should be replace at 'denoise' step)" - ) - - -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("input", StableDiffusionXLInputStep), - ("set_timesteps", StableDiffusionXLSetTimestepsStep), - ("prepare_latents", StableDiffusionXLPrepareLatentsStep), - ("prepare_add_cond", StableDiffusionXLPrepareAdditionalConditioningStep), - ("denoise", StableDiffusionXLDenoiseStep), - ("decode", StableDiffusionXLDecodeStep), - ] -) + return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL." -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("vae_encoder", StableDiffusionXLVaeEncoderStep), - ("input", StableDiffusionXLInputStep), - ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), - ("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep), - ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep), - ("denoise", StableDiffusionXLDenoiseStep), - ("decode", StableDiffusionXLDecodeStep), - ] -) - -INPAINT_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("vae_encoder", StableDiffusionXLInpaintVaeEncoderStep), - ("input", StableDiffusionXLInputStep), - ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), - ("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep), - ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep), - ("denoise", StableDiffusionXLInpaintDenoiseStep), - ("decode", StableDiffusionXLInpaintDecodeStep), - ] -) - -CONTROLNET_BLOCKS = InsertableDict( - [ - ("denoise", StableDiffusionXLAutoControlnetStep), - ] -) - - -IP_ADAPTER_BLOCKS = InsertableDict( - [ - ("ip_adapter", StableDiffusionXLAutoIPAdapterStep), - ] -) - -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("ip_adapter", StableDiffusionXLAutoIPAdapterStep), - ("vae_encoder", StableDiffusionXLAutoVaeEncoderStep), - ("denoise", StableDiffusionXLCoreDenoiseStep), - ("decode", StableDiffusionXLAutoDecodeStep), - ] -) - - -ALL_BLOCKS = { - "text2img": TEXT2IMAGE_BLOCKS, - "img2img": IMAGE2IMAGE_BLOCKS, - "inpaint": INPAINT_BLOCKS, - "controlnet": CONTROLNET_BLOCKS, - "ip_adapter": IP_ADAPTER_BLOCKS, - "auto": AUTO_BLOCKS, -} + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py index d01a86ca09b5..b641c6cd7fcc 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py @@ 
-14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanPrepareLatentsStep, WanSetTimestepsStep, @@ -37,7 +38,45 @@ # inputs(text) -> set_timesteps -> prepare_latents -> denoise +# auto_docstring class WanCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "wan" block_classes = [ WanTextInputStep, @@ -49,14 +88,11 @@ class WanCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `WanDenoiseStep` is used to denoise the latents\n" - ) + return "denoise block that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] # ==================== @@ -64,7 +100,51 @@ def description(self): # ==================== +# auto_docstring class WanBlocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for Wan2.1. + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer + (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) vae (`AutoencoderKLWan`) video_processor + (`VideoProcessor`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. 
+ attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan" block_classes = [ WanTextEncoderStep, @@ -75,9 +155,8 @@ class WanBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline blocks for Wan2.1.\n" - + "- `WanTextEncoderStep` is used to encode the text\n" - + "- `WanCoreDenoiseStep` is used to denoise the latents\n" - + "- `WanVaeDecoderStep` is used to decode the latents to images" - ) + return "Modular pipeline blocks for Wan2.1." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py index 21164422f3d9..9f602c24713b 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanPrepareLatentsStep, WanSetTimestepsStep, @@ -38,7 +39,50 @@ # inputs(text) -> set_timesteps -> prepare_latents -> denoise +# auto_docstring class Wan22CoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "wan" block_classes = [ WanTextInputStep, @@ -50,14 +94,11 @@ class Wan22CoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `Wan22DenoiseStep` is used to denoise the latents in wan2.2\n" - ) + return "denoise block that takes encoded conditions and runs the denoising process." 
+ + @property + def outputs(self): + return [OutputParam.template("latents")] # ==================== @@ -65,7 +106,55 @@ def description(self): # ==================== +# auto_docstring class Wan22Blocks(SequentialPipelineBlocks): + """ + Modular pipeline for text-to-video using Wan2.2. + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer + (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider_2 (`ClassifierFreeGuidance`) + transformer_2 (`WanTransformer3DModel`) vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan" block_classes = [ WanTextEncoderStep, @@ -80,9 +169,8 @@ class Wan22Blocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline for text-to-video using Wan2.2.\n" - + " - `WanTextEncoderStep` encodes the text\n" - + " - `Wan22CoreDenoiseStep` denoes the latents\n" - + " - `WanVaeDecoderStep` decodes the latents to video frames\n" - ) + return "Modular pipeline for text-to-video using Wan2.2." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py index 3db1c8fa837b..8e55b7a50f08 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanAdditionalInputsStep, WanPrepareLatentsStep, @@ -40,7 +41,36 @@ # ==================== +# auto_docstring class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks): + """ + Image2Video Vae Image Encoder step that resize the image and encode the first frame image to its latent + representation + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. 
+ + Outputs: + resized_image (`Image`): + TODO: Add description. + first_frame_latents (`Tensor`): + video latent representation with the first frame image condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep] block_names = ["image_resize", "vae_encoder", "prepare_first_frame_latents"] @@ -56,7 +86,52 @@ def description(self): # inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents) +# auto_docstring class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded text and image latent conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + num_frames (`None`, *optional*): + TODO: Add description. + image_condition_latents (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "wan-i2v" block_classes = [ WanTextInputStep, @@ -75,15 +150,11 @@ class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `Wan22Image2VideoDenoiseStep` is used to denoise the latents in wan2.2\n" - ) + return "denoise block that takes encoded text and image latent conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] # ==================== @@ -91,7 +162,57 @@ def description(self): # ==================== +# auto_docstring class Wan22Image2VideoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline for image-to-video using Wan2.2. 
+ + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) vae + (`AutoencoderKLWan`) video_processor (`VideoProcessor`) transformer (`WanTransformer3DModel`) scheduler + (`UniPCMultistepScheduler`) guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan-i2v" block_classes = [ WanTextEncoderStep, @@ -108,10 +229,8 @@ class Wan22Image2VideoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline for image-to-video using Wan2.2.\n" - + " - `WanTextEncoderStep` encodes the text\n" - + " - `WanImage2VideoVaeEncoderStep` encodes the image\n" - + " - `Wan22Image2VideoCoreDenoiseStep` denoes the latents\n" - + " - `WanVaeDecoderStep` decodes the latents to video frames\n" - ) + return "Modular pipeline for image-to-video using Wan2.2." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py index d07ab8ecf473..c08db62c469a 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanAdditionalInputsStep, WanPrepareLatentsStep, @@ -45,7 +46,29 @@ # wan2.1 I2V (first frame only) +# auto_docstring class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks): + """ + Image2Video Image Encoder step that resize the image and encode the image to generate the image embeddings + + Components: + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. 
+ image_embeds (`Tensor`): + The image embeddings + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanImageEncoderStep] block_names = ["image_resize", "image_encoder"] @@ -56,7 +79,34 @@ def description(self): # wan2.1 FLF2V (first and last frame) +# auto_docstring class WanFLF2VImageEncoderStep(SequentialPipelineBlocks): + """ + FLF2V Image Encoder step that resizes and encodes the first and last frame images to generate the image + embeddings + + Components: + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`): + The last frame image + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + image_embeds (`Tensor`): + The image embeddings + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameImageEncoderStep] block_names = ["image_resize", "last_image_resize", "image_encoder"] @@ -67,7 +117,36 @@ def description(self): # wan2.1 Auto Image Encoder +# auto_docstring class WanAutoImageEncoderStep(AutoPipelineBlocks): + """ + Image Encoder step that encodes the image to generate the image embeddings. This is an auto pipeline block that works + for image2video tasks. - `WanFLF2VImageEncoderStep` (flf2v) is used when `last_image` is provided. - + `WanImage2VideoImageEncoderStep` (image2video) is used when `image` is provided. - if `last_image` or `image` is + not provided, step will be skipped. + + Components: + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) + + Inputs: + image (`Image`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`, *optional*): + The last frame image + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + image_embeds (`Tensor`): + The image embeddings + """ + block_classes = [WanFLF2VImageEncoderStep, WanImage2VideoImageEncoderStep] block_names = ["flf2v_image_encoder", "image2video_image_encoder"] block_trigger_inputs = ["last_image", "image"] @@ -90,7 +169,36 @@ def description(self): # wan2.1 I2V (first frame only) +# auto_docstring class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks): + """ + Image2Video Vae Image Encoder step that resizes the image and encodes the first frame image to its latent + representation + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + first_frame_latents (`Tensor`): + video latent representation with the first frame image condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description.
+ """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep] block_names = ["image_resize", "vae_encoder", "prepare_first_frame_latents"] @@ -101,7 +209,40 @@ def description(self): # wan2.1 FLF2V (first and last frame) +# auto_docstring class WanFLF2VVaeEncoderStep(SequentialPipelineBlocks): + """ + FLF2V Vae Image Encoder step that resize and encode and encode the first and last frame images to generate the + latent conditions + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`): + The last frameimage + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + first_last_frame_latents (`Tensor`): + video latent representation with the first and last frame images condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + """ + model_name = "wan-i2v" block_classes = [ WanImageResizeStep, @@ -117,7 +258,44 @@ def description(self): # wan2.1 Auto Vae Encoder +# auto_docstring class WanAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Vae Image Encoder step that encode the image to generate the image latentsThis is an auto pipeline block that works + for image2video tasks. - `WanFLF2VVaeEncoderStep` (flf2v) is used when `last_image` is provided. - + `WanImage2VideoVaeEncoderStep` (image2video) is used when `image` is provided. - if `last_image` or `image` is not + provided, step will be skipped. + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`, *optional*): + The last frameimage + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + first_last_frame_latents (`Tensor`): + video latent representation with the first and last frame images condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + first_frame_latents (`Tensor`): + video latent representation with the first frame image condition + """ + model_name = "wan-i2v" block_classes = [WanFLF2VVaeEncoderStep, WanImage2VideoVaeEncoderStep] block_names = ["flf2v_vae_encoder", "image2video_vae_encoder"] @@ -141,7 +319,53 @@ def description(self): # wan2.1 I2V core denoise (support both I2V and FLF2V) # inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents) +# auto_docstring class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded text and image latent conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. 
+ prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + num_frames (`None`, *optional*): + TODO: Add description. + image_condition_latents (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + image_embeds (`Tensor`): + TODO: Add description. + + Outputs: + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `transformer.dtype`) + latents (`Tensor`): + The initial latents to use for the denoising process + """ + model_name = "wan-i2v" block_classes = [ WanTextInputStep, @@ -160,15 +384,7 @@ class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `WanImage2VideoDenoiseStep` is used to denoise the latents\n" - ) + return "denoise block that takes encoded text and image latent conditions and runs the denoising process." # ==================== @@ -177,7 +393,64 @@ def description(self): # wan2.1 Image2Video Auto Blocks +# auto_docstring class WanImage2VideoAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for image-to-video using Wan. + + Supported workflows: + - `image2video`: requires `image`, `prompt` + - `flf2v`: requires `last_image`, `image`, `prompt` + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) vae (`AutoencoderKLWan`) + video_processor (`VideoProcessor`) transformer (`WanTransformer3DModel`) scheduler + (`UniPCMultistepScheduler`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + image (`Image`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`, *optional*): + The last frameimage + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_condition_latents (`None`, *optional*): + TODO: Add description. 
+ num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + image_embeds (`Tensor`): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan-i2v" block_classes = [ WanTextEncoderStep, @@ -194,10 +467,15 @@ class WanImage2VideoAutoBlocks(SequentialPipelineBlocks): "decode", ] + _workflow_map = { + "image2video": {"image": True, "prompt": True}, + "flf2v": {"last_image": True, "image": True, "prompt": True}, + } + @property def description(self): - return ( - "Auto Modular pipeline for image-to-video using Wan.\n" - + "- for I2V workflow, all you need to provide is `image`" - + "- for FLF2V workflow, all you need to provide is `last_image` and `image`" - ) + return "Auto Modular pipeline for image-to-video using Wan." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/z_image/__init__.py b/src/diffusers/modular_pipelines/z_image/__init__.py index c8a8c14396c0..5c04008d3305 100644 --- a/src/diffusers/modular_pipelines/z_image/__init__.py +++ b/src/diffusers/modular_pipelines/z_image/__init__.py @@ -21,12 +21,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["decoders"] = ["ZImageVaeDecoderStep"] - _import_structure["encoders"] = ["ZImageTextEncoderStep", "ZImageVaeImageEncoderStep"] - _import_structure["modular_blocks"] = [ - "ALL_BLOCKS", - "ZImageAutoBlocks", - ] + _import_structure["modular_blocks_z_image"] = ["ZImageAutoBlocks"] _import_structure["modular_pipeline"] = ["ZImageModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -36,12 +31,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .decoders import ZImageVaeDecoderStep - from .encoders import ZImageTextEncoderStep - from .modular_blocks import ( - ALL_BLOCKS, - ZImageAutoBlocks, - ) + from .modular_blocks_z_image import ZImageAutoBlocks from .modular_pipeline import ZImageModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks.py b/src/diffusers/modular_pipelines/z_image/modular_blocks.py deleted file mode 100644 index a54baeccaf0c..000000000000 --- a/src/diffusers/modular_pipelines/z_image/modular_blocks.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict -from .before_denoise import ( - ZImageAdditionalInputsStep, - ZImagePrepareLatentsStep, - ZImagePrepareLatentswithImageStep, - ZImageSetTimestepsStep, - ZImageSetTimestepsWithStrengthStep, - ZImageTextInputStep, -) -from .decoders import ZImageVaeDecoderStep -from .denoise import ( - ZImageDenoiseStep, -) -from .encoders import ( - ZImageTextEncoderStep, - ZImageVaeImageEncoderStep, -) - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# z-image -# text2image -class ZImageCoreDenoiseStep(SequentialPipelineBlocks): - block_classes = [ - ZImageTextInputStep, - ZImagePrepareLatentsStep, - ZImageSetTimestepsStep, - ZImageDenoiseStep, - ] - block_names = ["input", "prepare_latents", "set_timesteps", "denoise"] - - @property - def description(self): - return ( - "denoise block that takes encoded conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n" - + " - `ZImageSetTimestepsStep` is used to set the timesteps\n" - + " - `ZImageDenoiseStep` is used to denoise the latents\n" - ) - - -# z-image: image2image -## denoise -class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks): - block_classes = [ - ZImageTextInputStep, - ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]), - ZImagePrepareLatentsStep, - ZImageSetTimestepsStep, - ZImageSetTimestepsWithStrengthStep, - ZImagePrepareLatentswithImageStep, - ZImageDenoiseStep, - ] - block_names = [ - "input", - "additional_inputs", - "prepare_latents", - "set_timesteps", - "set_timesteps_with_strength", - "prepare_latents_with_image", - "denoise", - ] - - @property - def description(self): - return ( - "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `ZImageAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n" - + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n" - + " - `ZImageSetTimestepsStep` is used to set the timesteps\n" - + " - `ZImageSetTimestepsWithStrengthStep` is used to set the timesteps with strength\n" - + " - `ZImagePrepareLatentswithImageStep` is used to prepare the latents with image\n" - + " - `ZImageDenoiseStep` is used to denoise the latents\n" - ) - - -## auto blocks -class ZImageAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [ - ZImageImage2ImageCoreDenoiseStep, - ZImageCoreDenoiseStep, - ] - block_names = ["image2image", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self) -> str: - return ( - "Denoise step that iteratively denoise the latents. " - "This is a auto pipeline block that works for text2image and image2image tasks." - " - `ZImageCoreDenoiseStep` (text2image) for text2image tasks." - " - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks." 
- + " - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.\n" - + " - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.\n" - ) - - -class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks): - block_classes = [ZImageVaeImageEncoderStep] - block_names = ["vae_encoder"] - block_trigger_inputs = ["image"] - - @property - def description(self) -> str: - return "Vae Image Encoder step that encode the image to generate the image latents" - +"This is an auto pipeline block that works for image2image tasks." - +" - `ZImageVaeImageEncoderStep` is used when `image` is provided." - +" - if `image` is not provided, step will be skipped." - - -class ZImageAutoBlocks(SequentialPipelineBlocks): - block_classes = [ - ZImageTextEncoderStep, - ZImageAutoVaeImageEncoderStep, - ZImageAutoDenoiseStep, - ZImageVaeDecoderStep, - ] - block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] - - @property - def description(self) -> str: - return "Auto Modular pipeline for text-to-image and image-to-image using ZImage.\n" - +" - for text-to-image generation, all you need to provide is `prompt`\n" - +" - for image-to-image generation, you need to provide `image`\n" - +" - if `image` is not provided, step will be skipped." - - -# presets -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", ZImageTextEncoderStep), - ("input", ZImageTextInputStep), - ("prepare_latents", ZImagePrepareLatentsStep), - ("set_timesteps", ZImageSetTimestepsStep), - ("denoise", ZImageDenoiseStep), - ("decode", ZImageVaeDecoderStep), - ] -) - -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", ZImageTextEncoderStep), - ("vae_encoder", ZImageVaeImageEncoderStep), - ("input", ZImageTextInputStep), - ("additional_inputs", ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"])), - ("prepare_latents", ZImagePrepareLatentsStep), - ("set_timesteps", ZImageSetTimestepsStep), - ("set_timesteps_with_strength", ZImageSetTimestepsWithStrengthStep), - ("prepare_latents_with_image", ZImagePrepareLatentswithImageStep), - ("denoise", ZImageDenoiseStep), - ("decode", ZImageVaeDecoderStep), - ] -) - - -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", ZImageTextEncoderStep), - ("vae_encoder", ZImageAutoVaeImageEncoderStep), - ("denoise", ZImageAutoDenoiseStep), - ("decode", ZImageVaeDecoderStep), - ] -) - -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "image2image": IMAGE2IMAGE_BLOCKS, - "auto": AUTO_BLOCKS, -} diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py new file mode 100644 index 000000000000..23e20d55fb1e --- /dev/null +++ b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py @@ -0,0 +1,334 @@ +# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam +from .before_denoise import ( + ZImageAdditionalInputsStep, + ZImagePrepareLatentsStep, + ZImagePrepareLatentswithImageStep, + ZImageSetTimestepsStep, + ZImageSetTimestepsWithStrengthStep, + ZImageTextInputStep, +) +from .decoders import ZImageVaeDecoderStep +from .denoise import ( + ZImageDenoiseStep, +) +from .encoders import ( + ZImageTextEncoderStep, + ZImageVaeImageEncoderStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# ==================== +# 1. DENOISE +# ==================== + + +# text2image: inputs(text) -> set_timesteps -> prepare_latents -> denoise +# auto_docstring +class ZImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded conditions and runs the denoising process. + + Components: + transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`list`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`list`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 9): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + **denoiser_input_fields (`None`, *optional*): + The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + block_classes = [ + ZImageTextInputStep, + ZImagePrepareLatentsStep, + ZImageSetTimestepsStep, + ZImageDenoiseStep, + ] + block_names = ["input", "prepare_latents", "set_timesteps", "denoise"] + + @property + def description(self): + return "denoise block that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + +# image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise +# auto_docstring +class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded text and image latent conditions and runs the denoising process. + + Components: + transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`list`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`list`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. 
+        num_inference_steps (`None`, *optional*, defaults to 9):
+            TODO: Add description.
+        sigmas (`None`, *optional*):
+            TODO: Add description.
+        strength (`None`, *optional*, defaults to 0.6):
+            TODO: Add description.
+        **denoiser_input_fields (`None`, *optional*):
+            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
+    block_classes = [
+        ZImageTextInputStep,
+        ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        ZImagePrepareLatentsStep,
+        ZImageSetTimestepsStep,
+        ZImageSetTimestepsWithStrengthStep,
+        ZImagePrepareLatentswithImageStep,
+        ZImageDenoiseStep,
+    ]
+    block_names = [
+        "input",
+        "additional_inputs",
+        "prepare_latents",
+        "set_timesteps",
+        "set_timesteps_with_strength",
+        "prepare_latents_with_image",
+        "denoise",
+    ]
+
+    @property
+    def description(self):
+        return "denoise block that takes encoded text and image latent conditions and runs the denoising process."
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("latents")]
+
+
+# auto_docstring
+class ZImageAutoDenoiseStep(AutoPipelineBlocks):
+    """
+    Denoise step that iteratively denoises the latents. This is an auto pipeline block that works for text2image
+    and image2image tasks.
+    - `ZImageCoreDenoiseStep` (text2image) for text2image tasks.
+    - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks.
+    - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.
+    - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.
+
+    Components:
+        transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`)
+
+    Inputs:
+        num_images_per_prompt (`None`, *optional*, defaults to 1):
+            TODO: Add description.
+        prompt_embeds (`list`):
+            Pre-generated text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`list`, *optional*):
+            Pre-generated negative text embeddings. Can be generated from text_encoder step.
+        height (`None`, *optional*):
+            TODO: Add description.
+        width (`None`, *optional*):
+            TODO: Add description.
+        image_latents (`None`, *optional*):
+            TODO: Add description.
+        latents (`Tensor | NoneType`):
+            TODO: Add description.
+        generator (`None`, *optional*):
+            TODO: Add description.
+        num_inference_steps (`None`):
+            TODO: Add description.
+        sigmas (`None`, *optional*):
+            TODO: Add description.
+        strength (`None`, *optional*, defaults to 0.6):
+            TODO: Add description.
+        **denoiser_input_fields (`None`, *optional*):
+            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
+    block_classes = [
+        ZImageImage2ImageCoreDenoiseStep,
+        ZImageCoreDenoiseStep,
+    ]
+    block_names = ["image2image", "text2image"]
+    block_trigger_inputs = ["image_latents", None]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoises the latents. "
+            "This is an auto pipeline block that works for text2image and image2image tasks.\n"
+            " - `ZImageCoreDenoiseStep` (text2image) for text2image tasks.\n"
+            " - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks.\n"
+            " - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.\n"
+            " - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.\n"
+        )
+
+
+# auto_docstring
+class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
+    """
+    Vae Image Encoder step that encodes the image to generate the image latents.
+    This is an auto pipeline block that works for image2image tasks.
+    - `ZImageVaeImageEncoderStep` is used when `image` is provided.
+    - if `image` is not provided, this step will be skipped.
+
+    Components:
+        vae (`AutoencoderKL`) image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        image (`Image`, *optional*):
+            TODO: Add description.
+        height (`None`, *optional*):
+            TODO: Add description.
+        width (`None`, *optional*):
+            TODO: Add description.
+        generator (`None`, *optional*):
+            TODO: Add description.
+
+    Outputs:
+        image_latents (`Tensor`):
+            latent representation of the encoded input image
+    """
+
+    block_classes = [ZImageVaeImageEncoderStep]
+    block_names = ["vae_encoder"]
+    block_trigger_inputs = ["image"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Vae Image Encoder step that encodes the image to generate the image latents.\n"
+            "This is an auto pipeline block that works for image2image tasks.\n"
+            " - `ZImageVaeImageEncoderStep` is used when `image` is provided.\n"
+            " - if `image` is not provided, this step will be skipped."
+        )
+
+
+# auto_docstring
+class ZImageAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for text-to-image and image-to-image using ZImage.
+
+    Supported workflows:
+    - `text2image`: requires `prompt`
+    - `image2image`: requires `image`, `prompt`
+
+    Components:
+        text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) guider (`ClassifierFreeGuidance`) vae
+        (`AutoencoderKL`) image_processor (`VaeImageProcessor`) transformer (`ZImageTransformer2DModel`) scheduler
+        (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        prompt (`None`, *optional*):
+            TODO: Add description.
+        negative_prompt (`None`, *optional*):
+            TODO: Add description.
+        max_sequence_length (`None`, *optional*, defaults to 512):
+            TODO: Add description.
+        image (`Image`, *optional*):
+            TODO: Add description.
+        height (`None`, *optional*):
+            TODO: Add description.
+        width (`None`, *optional*):
+            TODO: Add description.
+        generator (`None`, *optional*):
+            TODO: Add description.
+        num_images_per_prompt (`None`, *optional*, defaults to 1):
+            TODO: Add description.
+        image_latents (`None`, *optional*):
+            TODO: Add description.
+        latents (`Tensor | NoneType`):
+            TODO: Add description.
+        num_inference_steps (`None`):
+            TODO: Add description.
+        sigmas (`None`, *optional*):
+            TODO: Add description.
+        strength (`None`, *optional*, defaults to 0.6):
+            TODO: Add description.
+        **denoiser_input_fields (`None`, *optional*):
+            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        output_type (`str`, *optional*, defaults to pil):
+            The type of the output images, can be 'pil', 'np', 'pt'
+
+    Outputs:
+        images (`list`):
+            Generated images.
+    """
+
+    block_classes = [
+        ZImageTextEncoderStep,
+        ZImageAutoVaeImageEncoderStep,
+        ZImageAutoDenoiseStep,
+        ZImageVaeDecoderStep,
+    ]
+    block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
+    _workflow_map = {
+        "text2image": {"prompt": True},
+        "image2image": {"image": True, "prompt": True},
+    }
+
+    @property
+    def description(self) -> str:
+        return "Auto Modular pipeline for text-to-image and image-to-image using ZImage."
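+
+    # Usage sketch: the presets declared in `_workflow_map` above can be retrieved with
+    # `get_workflow`, and at runtime the auto denoise step switches to the image2image path
+    # whenever `image` (and therefore `image_latents`) is passed. The repo id below is a
+    # placeholder, not a real checkpoint.
+    #
+    #   blocks = ZImageAutoBlocks()
+    #   text2image_blocks = blocks.get_workflow("text2image")    # prompt-only preset
+    #   image2image_blocks = blocks.get_workflow("image2image")  # preset that also expects `image`
+    #   pipe = blocks.init_pipeline("<org>/<z-image-modular-repo>")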
+ + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py index 854b5218c617..9a6b4b9b6fb4 100644 --- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py +++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py @@ -33,6 +33,19 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin +FLUX_TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "FluxTextEncoderStep"), + ("denoise.input", "FluxTextInputStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxDenoiseStep"), + ("decode", "FluxDecodeStep"), + ] +} + + class TestFluxModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxModularPipeline pipeline_blocks_class = FluxAutoBlocks @@ -40,6 +53,7 @@ class TestFluxModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX_TEXT2IMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -59,6 +73,23 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) +FLUX_IMAGE2IMAGE_WORKFLOWS = { + "image2image": [ + ("text_encoder", "FluxTextEncoderStep"), + ("vae_encoder.preprocess", "FluxProcessImagesInputStep"), + ("vae_encoder.encode", "FluxVaeEncoderStep"), + ("denoise.input.text_inputs", "FluxTextInputStep"), + ("denoise.input.additional_inputs", "FluxAdditionalInputsStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_img2img_latents", "FluxImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxDenoiseStep"), + ("decode", "FluxDecodeStep"), + ] +} + + class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxModularPipeline pipeline_blocks_class = FluxAutoBlocks @@ -66,6 +97,7 @@ class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX_IMAGE2IMAGE_WORKFLOWS def get_pipeline(self, components_manager=None, torch_dtype=torch.float32): pipeline = super().get_pipeline(components_manager, torch_dtype) @@ -125,6 +157,32 @@ def test_float16_inference(self): super().test_float16_inference(8e-2) +FLUX_KONTEXT_WORKFLOWS = { + "text2image": [ + ("text_encoder", "FluxTextEncoderStep"), + ("denoise.input", "FluxTextInputStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxKontextDenoiseStep"), + ("decode", "FluxDecodeStep"), + ], + "image_conditioned": [ + ("text_encoder", "FluxTextEncoderStep"), + ("vae_encoder.preprocess", "FluxKontextProcessImagesInputStep"), + ("vae_encoder.encode", "FluxVaeEncoderStep"), + ("denoise.input.set_resolution", "FluxKontextSetResolutionStep"), + ("denoise.input.text_inputs", "FluxTextInputStep"), + 
("denoise.input.additional_inputs", "FluxKontextAdditionalInputsStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxKontextRoPEInputsStep"), + ("denoise.denoise", "FluxKontextDenoiseStep"), + ("decode", "FluxDecodeStep"), + ], +} + + class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxKontextModularPipeline pipeline_blocks_class = FluxKontextAutoBlocks @@ -132,6 +190,7 @@ class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX_KONTEXT_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py index 8fd529e97e71..3045af636841 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py @@ -28,6 +28,21 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin +FLUX2_TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "Flux2TextEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_guidance", "Flux2PrepareGuidanceStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2DenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2ModularPipeline pipeline_blocks_class = Flux2AutoBlocks @@ -35,6 +50,7 @@ class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX2_TEXT2IMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -56,6 +72,24 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) +FLUX2_IMAGE_CONDITIONED_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder", "Flux2TextEncoderStep"), + ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"), + ("vae_encoder.encode", "Flux2VaeEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_guidance", "Flux2PrepareGuidanceStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2DenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2ModularPipeline pipeline_blocks_class = Flux2AutoBlocks @@ -63,6 +97,7 @@ class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX2_IMAGE_CONDITIONED_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = 
self.get_generator(seed) diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py index 26653b20f8c4..ad295a961357 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py @@ -28,13 +28,28 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin -class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): +FLUX2_KLEIN_WORKFLOWS = { + "text2image": [ + ("text_encoder", "Flux2KleinTextEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2KleinDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2KleinModularPipeline pipeline_blocks_class = Flux2KleinAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular" params = frozenset(["prompt", "height", "width"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX2_KLEIN_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -55,13 +70,31 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) -class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): +FLUX2_KLEIN_IMAGE_CONDITIONED_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder", "Flux2KleinTextEncoderStep"), + ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"), + ("vae_encoder.encode", "Flux2VaeEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2KleinDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinImageConditionedModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2KleinModularPipeline pipeline_blocks_class = Flux2KleinAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular" params = frozenset(["prompt", "height", "width", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX2_KLEIN_IMAGE_CONDITIONED_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py index 701dd0fed896..b3aa79040317 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py @@ -21,20 +21,35 @@ from diffusers.modular_pipelines import ( Flux2KleinBaseAutoBlocks, - Flux2KleinModularPipeline, + Flux2KleinBaseModularPipeline, ) from ...testing_utils import floats_tensor, torch_device from ..test_modular_pipelines_common import ModularPipelineTesterMixin -class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): - pipeline_class = Flux2KleinModularPipeline +FLUX2_KLEIN_BASE_WORKFLOWS = { + "text2image": [ + 
("text_encoder", "Flux2KleinBaseTextEncoderStep"), + ("denoise.input", "Flux2KleinBaseTextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2KleinBaseRoPEInputsStep"), + ("denoise.denoise", "Flux2KleinBaseDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinBaseModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinBaseModularPipeline pipeline_blocks_class = Flux2KleinBaseAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular" params = frozenset(["prompt", "height", "width"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX2_KLEIN_BASE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -55,13 +70,31 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) -class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): - pipeline_class = Flux2KleinModularPipeline +FLUX2_KLEIN_BASE_IMAGE_CONDITIONED_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder", "Flux2KleinBaseTextEncoderStep"), + ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"), + ("vae_encoder.encode", "Flux2VaeEncoderStep"), + ("denoise.input", "Flux2KleinBaseTextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2KleinBaseRoPEInputsStep"), + ("denoise.denoise", "Flux2KleinBaseDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinBaseImageConditionedModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinBaseModularPipeline pipeline_blocks_class = Flux2KleinBaseAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular" params = frozenset(["prompt", "height", "width", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX2_KLEIN_BASE_IMAGE_CONDITIONED_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py index f4bd27b7ea47..92573c202e49 100644 --- a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py +++ b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py @@ -30,6 +30,103 @@ from ..test_modular_pipelines_common import ModularGuiderTesterMixin, ModularPipelineTesterMixin +QWEN_IMAGE_TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("denoise.input", "QwenImageTextInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.denoise", "QwenImageDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "image2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", 
"QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_img2img_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.denoise", "QwenImageDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "inpainting": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageInpaintProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.denoise", "QwenImageInpaintDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"), + ], + "controlnet_text2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"), + ("denoise.input", "QwenImageTextInputsStep"), + ("denoise.controlnet_input", "QwenImageControlNetInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"), + ("denoise.controlnet_denoise", "QwenImageControlNetDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "controlnet_image2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.controlnet_input", "QwenImageControlNetInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_img2img_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"), + ("denoise.controlnet_denoise", "QwenImageControlNetDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "controlnet_inpainting": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageInpaintProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), 
+ ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.controlnet_input", "QwenImageControlNetInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"), + ("denoise.controlnet_denoise", "QwenImageInpaintControlNetDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"), + ], +} + + class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin): pipeline_class = QwenImageModularPipeline pipeline_blocks_class = QwenImageAutoBlocks @@ -37,6 +134,7 @@ class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuider params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"]) batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"]) + expected_workflow_blocks = QWEN_IMAGE_TEXT2IMAGE_WORKFLOWS def get_dummy_inputs(self): generator = self.get_generator() @@ -56,6 +154,44 @@ def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=5e-4) +QWEN_IMAGE_EDIT_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder.resize", "QwenImageEditResizeStep"), + ("text_encoder.encode", "QwenImageEditTextEncoderStep"), + ("vae_encoder.resize", "QwenImageEditResizeStep"), + ("vae_encoder.preprocess", "QwenImageEditProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsStep"), + ("denoise.prepare_rope_inputs", "QwenImageEditRoPEInputsStep"), + ("denoise.denoise", "QwenImageEditDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "image_conditioned_inpainting": [ + ("text_encoder.resize", "QwenImageEditResizeStep"), + ("text_encoder.encode", "QwenImageEditTextEncoderStep"), + ("vae_encoder.resize", "QwenImageEditResizeStep"), + ("vae_encoder.preprocess", "QwenImageEditInpaintProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"), + ("denoise.prepare_rope_inputs", "QwenImageEditRoPEInputsStep"), + ("denoise.denoise", "QwenImageEditInpaintDenoiseStep"), + ("denoise.after_denoise", 
"QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"), + ], +} + + class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin): pipeline_class = QwenImageEditModularPipeline pipeline_blocks_class = QwenImageEditAutoBlocks @@ -63,6 +199,7 @@ class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGu params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"]) batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"]) + expected_workflow_blocks = QWEN_IMAGE_EDIT_WORKFLOWS def get_dummy_inputs(self): generator = self.get_generator() diff --git a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py index 7b55933e4caf..f640f0ec83f2 100644 --- a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py +++ b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py @@ -267,6 +267,60 @@ def test_controlnet_cfg(self): assert max_diff > 1e-2, "Output with CFG must be different from normal inference" +TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_union_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), + ("decode", 
"StableDiffusionXLDecodeStep"), + ], + "ip_adapter_controlnet_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], +} + + class TestSDXLModularPipelineFast( SDXLModularTesterMixin, SDXLModularIPAdapterTesterMixin, @@ -291,6 +345,8 @@ class TestSDXLModularPipelineFast( batch_params = frozenset(["prompt", "negative_prompt"]) expected_image_output_shape = (1, 3, 64, 64) + expected_workflow_blocks = TEXT2IMAGE_WORKFLOWS + def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) inputs = { @@ -314,6 +370,65 @@ def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) +IMAGE2IMAGE_WORKFLOWS = { + "image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_union_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", 
"StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_controlnet_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], +} + + class TestSDXLImg2ImgModularPipelineFast( SDXLModularTesterMixin, SDXLModularIPAdapterTesterMixin, @@ -338,6 +453,7 @@ class TestSDXLImg2ImgModularPipelineFast( ) batch_params = frozenset(["prompt", "negative_prompt", "image"]) expected_image_output_shape = (1, 3, 64, 64) + expected_workflow_blocks = IMAGE2IMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -367,6 +483,65 @@ def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) +INPAINTING_WORKFLOWS = { + "inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLInpaintDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "controlnet_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "controlnet_union_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "ip_adapter_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", 
"StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLInpaintDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "ip_adapter_controlnet_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], +} + + class SDXLInpaintingModularPipelineFastTests( SDXLModularTesterMixin, SDXLModularIPAdapterTesterMixin, @@ -392,6 +567,7 @@ class SDXLInpaintingModularPipelineFastTests( ) batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"]) expected_image_output_shape = (1, 3, 64, 64) + expected_workflow_blocks = INPAINTING_WORKFLOWS def get_dummy_inputs(self, device, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py index 937a9ccec880..e97b543ff85d 100644 --- a/tests/modular_pipelines/test_modular_pipelines_common.py +++ b/tests/modular_pipelines/test_modular_pipelines_common.py @@ -100,6 +100,14 @@ def batch_params(self) -> frozenset: "See existing pipeline tests for reference." ) + @property + def expected_workflow_blocks(self) -> dict: + raise NotImplementedError( + "You need to set the attribute `expected_workflow_blocks` in the child test class. " + "`expected_workflow_blocks` is a dictionary that maps workflow names to list of block names. " + "See existing pipeline tests for reference." 
+ ) + def setup_method(self): # clean up the VRAM before each test torch.compiler.reset() @@ -341,6 +349,34 @@ def test_save_from_pretrained(self): assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3 + def test_workflow_map(self): + blocks = self.pipeline_blocks_class() + if blocks._workflow_map is None: + pytest.skip("Skipping test as _workflow_map is not set") + + assert hasattr(self, "expected_workflow_blocks") and self.expected_workflow_blocks, ( + "expected_workflow_blocks must be defined in the test class" + ) + + for workflow_name, expected_blocks in self.expected_workflow_blocks.items(): + workflow_blocks = blocks.get_workflow(workflow_name) + actual_blocks = list(workflow_blocks.sub_blocks.items()) + + # Check that the number of blocks matches + assert len(actual_blocks) == len(expected_blocks), ( + f"Workflow '{workflow_name}' has {len(actual_blocks)} blocks, expected {len(expected_blocks)}" + ) + + # Check that each block name and type matches + for i, ((actual_name, actual_block), (expected_name, expected_class_name)) in enumerate( + zip(actual_blocks, expected_blocks) + ): + assert actual_name == expected_name + assert actual_block.__class__.__name__ == expected_class_name, ( + f"Workflow '{workflow_name}': block '{actual_name}' has type " + f"{actual_block.__class__.__name__}, expected {expected_class_name}" + ) + class ModularGuiderTesterMixin: def test_guider_cfg(self, expected_max_diff=1e-2): diff --git a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py index 29da18fce61b..ab45def3ef30 100644 --- a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py +++ b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py @@ -19,6 +19,30 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin +ZIMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "ZImageTextEncoderStep"), + ("denoise.input", "ZImageTextInputStep"), + ("denoise.prepare_latents", "ZImagePrepareLatentsStep"), + ("denoise.set_timesteps", "ZImageSetTimestepsStep"), + ("denoise.denoise", "ZImageDenoiseStep"), + ("decode", "ZImageVaeDecoderStep"), + ], + "image2image": [ + ("text_encoder", "ZImageTextEncoderStep"), + ("vae_encoder", "ZImageVaeImageEncoderStep"), + ("denoise.input", "ZImageTextInputStep"), + ("denoise.additional_inputs", "ZImageAdditionalInputsStep"), + ("denoise.prepare_latents", "ZImagePrepareLatentsStep"), + ("denoise.set_timesteps", "ZImageSetTimestepsStep"), + ("denoise.set_timesteps_with_strength", "ZImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_latents_with_image", "ZImagePrepareLatentswithImageStep"), + ("denoise.denoise", "ZImageDenoiseStep"), + ("decode", "ZImageVaeDecoderStep"), + ], +} + + class TestZImageModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = ZImageModularPipeline pipeline_blocks_class = ZImageAutoBlocks @@ -26,6 +50,7 @@ class TestZImageModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = ZIMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed)