diff --git a/docs/source/en/modular_diffusers/guiders.md b/docs/source/en/modular_diffusers/guiders.md
index 6abe4fad2736..ffe039f41556 100644
--- a/docs/source/en/modular_diffusers/guiders.md
+++ b/docs/source/en/modular_diffusers/guiders.md
@@ -89,10 +89,8 @@ t2i_pipeline.guider
## Changing guider parameters
-The guider parameters can be adjusted with either the [`~ComponentSpec.create`] method or with [`~ModularPipeline.update_components`]. The example below changes the `guidance_scale` value.
+The guider parameters can be adjusted by creating a new guider with the [`~ComponentSpec.create`] method and passing it to [`~ModularPipeline.update_components`]. The example below changes the `guidance_scale` value.
-
-
```py
guider_spec = t2i_pipeline.get_component_spec("guider")
@@ -100,18 +98,6 @@ guider = guider_spec.create(guidance_scale=10)
t2i_pipeline.update_components(guider=guider)
```
-
-
-
-```py
-guider_spec = t2i_pipeline.get_component_spec("guider")
-guider_spec.config["guidance_scale"] = 10
-t2i_pipeline.update_components(guider=guider_spec)
-```
-
-
-
-
## Uploading custom guiders
Call the [`~utils.PushToHubMixin.push_to_hub`] method on a custom guider to share it to the Hub.
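
For example, a minimal sketch (assuming `guider` is a custom guider instance; the repo id is a placeholder for your own namespace):

```py
# `guider` is a previously created custom guider; replace the repo id with your own.
guider.push_to_hub("your-username/custom-guider")
```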
diff --git a/docs/source/zh/modular_diffusers/guiders.md b/docs/source/zh/modular_diffusers/guiders.md
index 50436f90c4a5..2315625a197a 100644
--- a/docs/source/zh/modular_diffusers/guiders.md
+++ b/docs/source/zh/modular_diffusers/guiders.md
@@ -86,10 +86,7 @@ t2i_pipeline.guider
## 更改引导器参数
-引导器参数可以通过 [`~ComponentSpec.create`] 方法或 [`~ModularPipeline.update_components`] 方法进行调整。下面的示例更改了 `guidance_scale` 值。
-
-
-
+引导器参数可以通过 [`~ComponentSpec.create`] 方法以及 [`~ModularPipeline.update_components`] 方法进行调整。下面的示例更改了 `guidance_scale` 值。
```py
guider_spec = t2i_pipeline.get_component_spec("guider")
@@ -97,18 +94,6 @@ guider = guider_spec.create(guidance_scale=10)
t2i_pipeline.update_components(guider=guider)
```
-
-
-
-```py
-guider_spec = t2i_pipeline.get_component_spec("guider")
-guider_spec.config["guidance_scale"] = 10
-t2i_pipeline.update_components(guider=guider_spec)
-```
-
-
-
-
## 上传自定义引导器
在自定义引导器上调用 [`~utils.PushToHubMixin.push_to_hub`] 方法,将其分享到 Hub。
diff --git a/src/diffusers/modular_pipelines/flux/__init__.py b/src/diffusers/modular_pipelines/flux/__init__.py
index ec00986611c8..4754ed01ce6a 100644
--- a/src/diffusers/modular_pipelines/flux/__init__.py
+++ b/src/diffusers/modular_pipelines/flux/__init__.py
@@ -21,21 +21,8 @@
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
- _import_structure["encoders"] = ["FluxTextEncoderStep"]
- _import_structure["modular_blocks"] = [
- "ALL_BLOCKS",
- "AUTO_BLOCKS",
- "AUTO_BLOCKS_KONTEXT",
- "FLUX_KONTEXT_BLOCKS",
- "TEXT2IMAGE_BLOCKS",
- "FluxAutoBeforeDenoiseStep",
- "FluxAutoBlocks",
- "FluxAutoDecodeStep",
- "FluxAutoDenoiseStep",
- "FluxKontextAutoBlocks",
- "FluxKontextAutoDenoiseStep",
- "FluxKontextBeforeDenoiseStep",
- ]
+ _import_structure["modular_blocks_flux"] = ["FluxAutoBlocks"]
+ _import_structure["modular_blocks_flux_kontext"] = ["FluxKontextAutoBlocks"]
_import_structure["modular_pipeline"] = ["FluxKontextModularPipeline", "FluxModularPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -45,21 +32,8 @@
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
- from .encoders import FluxTextEncoderStep
- from .modular_blocks import (
- ALL_BLOCKS,
- AUTO_BLOCKS,
- AUTO_BLOCKS_KONTEXT,
- FLUX_KONTEXT_BLOCKS,
- TEXT2IMAGE_BLOCKS,
- FluxAutoBeforeDenoiseStep,
- FluxAutoBlocks,
- FluxAutoDecodeStep,
- FluxAutoDenoiseStep,
- FluxKontextAutoBlocks,
- FluxKontextAutoDenoiseStep,
- FluxKontextBeforeDenoiseStep,
- )
+ from .modular_blocks_flux import FluxAutoBlocks
+ from .modular_blocks_flux_kontext import FluxKontextAutoBlocks
from .modular_pipeline import FluxKontextModularPipeline, FluxModularPipeline
else:
import sys
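
For reference, the tail of this `__init__.py` typically wires the registry into diffusers' lazy-import helper. The sketch below assumes the standard `_LazyModule` pattern (simplified, without the optional-dependency fallback) and is not part of this hunk:

```py
import sys

from ...utils import _LazyModule

# Submodule -> exported names; the submodule is only imported on first attribute access,
# so `from diffusers.modular_pipelines.flux import FluxAutoBlocks` stays cheap.
_import_structure = {
    "modular_blocks_flux": ["FluxAutoBlocks"],
    "modular_blocks_flux_kontext": ["FluxKontextAutoBlocks"],
    "modular_pipeline": ["FluxKontextModularPipeline", "FluxModularPipeline"],
}

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
```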
diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py
index 4f94a17d88eb..583c139ff22e 100644
--- a/src/diffusers/modular_pipelines/flux/encoders.py
+++ b/src/diffusers/modular_pipelines/flux/encoders.py
@@ -205,7 +205,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState):
return components, state
-class FluxVaeEncoderDynamicStep(ModularPipelineBlocks):
+class FluxVaeEncoderStep(ModularPipelineBlocks):
model_name = "flux"
def __init__(
diff --git a/src/diffusers/modular_pipelines/flux/inputs.py b/src/diffusers/modular_pipelines/flux/inputs.py
index dbf42e0c6df4..9d2f69dbe26f 100644
--- a/src/diffusers/modular_pipelines/flux/inputs.py
+++ b/src/diffusers/modular_pipelines/flux/inputs.py
@@ -121,7 +121,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
# Adapted from `QwenImageAdditionalInputsStep`
-class FluxInputsDynamicStep(ModularPipelineBlocks):
+class FluxAdditionalInputsStep(ModularPipelineBlocks):
model_name = "flux"
def __init__(
@@ -243,7 +243,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
return components, state
-class FluxKontextInputsDynamicStep(FluxInputsDynamicStep):
+class FluxKontextAdditionalInputsStep(FluxAdditionalInputsStep):
model_name = "flux-kontext"
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
@@ -256,7 +256,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
continue
# 1. Calculate height/width from latents
- # Unlike the `FluxInputsDynamicStep`, we don't overwrite the `block.height` and `block.width`
+ # Unlike the `FluxAdditionalInputsStep`, we don't overwrite the `block.height` and `block.width`
height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
if not hasattr(block_state, "image_height"):
block_state.image_height = height
@@ -303,6 +303,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
class FluxKontextSetResolutionStep(ModularPipelineBlocks):
model_name = "flux-kontext"
+ @property
def description(self):
return (
"Determines the height and width to be used during the subsequent computations.\n"
diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py
deleted file mode 100644
index bd9b2d1b40c9..000000000000
--- a/src/diffusers/modular_pipelines/flux/modular_blocks.py
+++ /dev/null
@@ -1,446 +0,0 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ...utils import logging
-from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict
-from .before_denoise import (
- FluxImg2ImgPrepareLatentsStep,
- FluxImg2ImgSetTimestepsStep,
- FluxKontextRoPEInputsStep,
- FluxPrepareLatentsStep,
- FluxRoPEInputsStep,
- FluxSetTimestepsStep,
-)
-from .decoders import FluxDecodeStep
-from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep
-from .encoders import (
- FluxKontextProcessImagesInputStep,
- FluxProcessImagesInputStep,
- FluxTextEncoderStep,
- FluxVaeEncoderDynamicStep,
-)
-from .inputs import (
- FluxInputsDynamicStep,
- FluxKontextInputsDynamicStep,
- FluxKontextSetResolutionStep,
- FluxTextInputStep,
-)
-
-
-logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
-
-# vae encoder (run before before_denoise)
-FluxImg2ImgVaeEncoderBlocks = InsertableDict(
- [("preprocess", FluxProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep())]
-)
-
-
-class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
- model_name = "flux"
-
- block_classes = FluxImg2ImgVaeEncoderBlocks.values()
- block_names = FluxImg2ImgVaeEncoderBlocks.keys()
-
- @property
- def description(self) -> str:
- return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
-
-
-class FluxAutoVaeEncoderStep(AutoPipelineBlocks):
- block_classes = [FluxImg2ImgVaeEncoderStep]
- block_names = ["img2img"]
- block_trigger_inputs = ["image"]
-
- @property
- def description(self):
- return (
- "Vae encoder step that encode the image inputs into their latent representations.\n"
- + "This is an auto pipeline block that works for img2img tasks.\n"
- + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided."
- + " - if `image` is not provided, step will be skipped."
- )
-
-
-# Flux Kontext vae encoder (run before before_denoise)
-
-FluxKontextVaeEncoderBlocks = InsertableDict(
- [("preprocess", FluxKontextProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep(sample_mode="argmax"))]
-)
-
-
-class FluxKontextVaeEncoderStep(SequentialPipelineBlocks):
- model_name = "flux-kontext"
-
- block_classes = FluxKontextVaeEncoderBlocks.values()
- block_names = FluxKontextVaeEncoderBlocks.keys()
-
- @property
- def description(self) -> str:
- return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
-
-
-class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks):
- block_classes = [FluxKontextVaeEncoderStep]
- block_names = ["img2img"]
- block_trigger_inputs = ["image"]
-
- @property
- def description(self):
- return (
- "Vae encoder step that encode the image inputs into their latent representations.\n"
- + "This is an auto pipeline block that works for img2img tasks.\n"
- + " - `FluxKontextVaeEncoderStep` (img2img) is used when only `image` is provided."
- + " - if `image` is not provided, step will be skipped."
- )
-
-
-# before_denoise: text2img
-FluxBeforeDenoiseBlocks = InsertableDict(
- [
- ("prepare_latents", FluxPrepareLatentsStep()),
- ("set_timesteps", FluxSetTimestepsStep()),
- ("prepare_rope_inputs", FluxRoPEInputsStep()),
- ]
-)
-
-
-class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
- block_classes = FluxBeforeDenoiseBlocks.values()
- block_names = FluxBeforeDenoiseBlocks.keys()
-
- @property
- def description(self):
- return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation."
-
-
-# before_denoise: img2img
-FluxImg2ImgBeforeDenoiseBlocks = InsertableDict(
- [
- ("prepare_latents", FluxPrepareLatentsStep()),
- ("set_timesteps", FluxImg2ImgSetTimestepsStep()),
- ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
- ("prepare_rope_inputs", FluxRoPEInputsStep()),
- ]
-)
-
-
-class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
- block_classes = FluxImg2ImgBeforeDenoiseBlocks.values()
- block_names = FluxImg2ImgBeforeDenoiseBlocks.keys()
-
- @property
- def description(self):
- return "Before denoise step that prepare the inputs for the denoise step for img2img task."
-
-
-# before_denoise: all task (text2img, img2img)
-class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
- model_name = "flux-kontext"
- block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep]
- block_names = ["img2img", "text2image"]
- block_trigger_inputs = ["image_latents", None]
-
- @property
- def description(self):
- return (
- "Before denoise step that prepare the inputs for the denoise step.\n"
- + "This is an auto pipeline block that works for text2image.\n"
- + " - `FluxBeforeDenoiseStep` (text2image) is used.\n"
- + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n"
- )
-
-
-# before_denoise: FluxKontext
-
-FluxKontextBeforeDenoiseBlocks = InsertableDict(
- [
- ("prepare_latents", FluxPrepareLatentsStep()),
- ("set_timesteps", FluxSetTimestepsStep()),
- ("prepare_rope_inputs", FluxKontextRoPEInputsStep()),
- ]
-)
-
-
-class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks):
- block_classes = FluxKontextBeforeDenoiseBlocks.values()
- block_names = FluxKontextBeforeDenoiseBlocks.keys()
-
- @property
- def description(self):
- return (
- "Before denoise step that prepare the inputs for the denoise step\n"
- "for img2img/text2img task for Flux Kontext."
- )
-
-
-class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks):
- block_classes = [FluxKontextBeforeDenoiseStep, FluxBeforeDenoiseStep]
- block_names = ["img2img", "text2image"]
- block_trigger_inputs = ["image_latents", None]
-
- @property
- def description(self):
- return (
- "Before denoise step that prepare the inputs for the denoise step.\n"
- + "This is an auto pipeline block that works for text2image.\n"
- + " - `FluxBeforeDenoiseStep` (text2image) is used.\n"
- + " - `FluxKontextBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n"
- )
-
-
-# denoise: text2image
-class FluxAutoDenoiseStep(AutoPipelineBlocks):
- block_classes = [FluxDenoiseStep]
- block_names = ["denoise"]
- block_trigger_inputs = [None]
-
- @property
- def description(self) -> str:
- return (
- "Denoise step that iteratively denoise the latents. "
- "This is a auto pipeline block that works for text2image and img2img tasks."
- " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks."
- )
-
-
-# denoise: Flux Kontext
-
-
-class FluxKontextAutoDenoiseStep(AutoPipelineBlocks):
- block_classes = [FluxKontextDenoiseStep]
- block_names = ["denoise"]
- block_trigger_inputs = [None]
-
- @property
- def description(self) -> str:
- return (
- "Denoise step that iteratively denoise the latents for Flux Kontext. "
- "This is a auto pipeline block that works for text2image and img2img tasks."
- " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks."
- )
-
-
-# decode: all task (text2img, img2img)
-class FluxAutoDecodeStep(AutoPipelineBlocks):
- block_classes = [FluxDecodeStep]
- block_names = ["non-inpaint"]
- block_trigger_inputs = [None]
-
- @property
- def description(self):
- return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`"
-
-
-# inputs: text2image/img2img
-FluxImg2ImgBlocks = InsertableDict(
- [("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())]
-)
-
-
-class FluxImg2ImgInputStep(SequentialPipelineBlocks):
- model_name = "flux"
- block_classes = FluxImg2ImgBlocks.values()
- block_names = FluxImg2ImgBlocks.keys()
-
- @property
- def description(self):
- return "Input step that prepares the inputs for the img2img denoising step. It:\n"
- " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
- " - update height/width based `image_latents`, patchify `image_latents`."
-
-
-class FluxAutoInputStep(AutoPipelineBlocks):
- block_classes = [FluxImg2ImgInputStep, FluxTextInputStep]
- block_names = ["img2img", "text2image"]
- block_trigger_inputs = ["image_latents", None]
-
- @property
- def description(self):
- return (
- "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
- " This is an auto pipeline block that works for text2image/img2img tasks.\n"
- + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n"
- + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n"
- )
-
-
-# inputs: Flux Kontext
-
-FluxKontextBlocks = InsertableDict(
- [
- ("set_resolution", FluxKontextSetResolutionStep()),
- ("text_inputs", FluxTextInputStep()),
- ("additional_inputs", FluxKontextInputsDynamicStep()),
- ]
-)
-
-
-class FluxKontextInputStep(SequentialPipelineBlocks):
- model_name = "flux-kontext"
- block_classes = FluxKontextBlocks.values()
- block_names = FluxKontextBlocks.keys()
-
- @property
- def description(self):
- return (
- "Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n"
- " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
- " - update height/width based `image_latents`, patchify `image_latents`."
- )
-
-
-class FluxKontextAutoInputStep(AutoPipelineBlocks):
- block_classes = [FluxKontextInputStep, FluxTextInputStep]
- # block_classes = [FluxKontextInputStep]
- block_names = ["img2img", "text2img"]
- # block_names = ["img2img"]
- block_trigger_inputs = ["image_latents", None]
- # block_trigger_inputs = ["image_latents"]
-
- @property
- def description(self):
- return (
- "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
- " This is an auto pipeline block that works for text2image/img2img tasks.\n"
- + " - `FluxKontextInputStep` (img2img) is used when `image_latents` is provided.\n"
- + " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present."
- )
-
-
-class FluxCoreDenoiseStep(SequentialPipelineBlocks):
- model_name = "flux"
- block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
- block_names = ["input", "before_denoise", "denoise"]
-
- @property
- def description(self):
- return (
- "Core step that performs the denoising process. \n"
- + " - `FluxAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
- + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
- + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
- + "This step supports text-to-image and image-to-image tasks for Flux:\n"
- + " - for image-to-image generation, you need to provide `image_latents`\n"
- + " - for text-to-image generation, all you need to provide is prompt embeddings."
- )
-
-
-class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks):
- model_name = "flux-kontext"
- block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextAutoDenoiseStep]
- block_names = ["input", "before_denoise", "denoise"]
-
- @property
- def description(self):
- return (
- "Core step that performs the denoising process. \n"
- + " - `FluxKontextAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
- + " - `FluxKontextAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
- + " - `FluxKontextAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
- + "This step supports text-to-image and image-to-image tasks for Flux:\n"
- + " - for image-to-image generation, you need to provide `image_latents`\n"
- + " - for text-to-image generation, all you need to provide is prompt embeddings."
- )
-
-
-# Auto blocks (text2image and img2img)
-AUTO_BLOCKS = InsertableDict(
- [
- ("text_encoder", FluxTextEncoderStep()),
- ("vae_encoder", FluxAutoVaeEncoderStep()),
- ("denoise", FluxCoreDenoiseStep()),
- ("decode", FluxDecodeStep()),
- ]
-)
-
-AUTO_BLOCKS_KONTEXT = InsertableDict(
- [
- ("text_encoder", FluxTextEncoderStep()),
- ("vae_encoder", FluxKontextAutoVaeEncoderStep()),
- ("denoise", FluxKontextCoreDenoiseStep()),
- ("decode", FluxDecodeStep()),
- ]
-)
-
-
-class FluxAutoBlocks(SequentialPipelineBlocks):
- model_name = "flux"
-
- block_classes = AUTO_BLOCKS.values()
- block_names = AUTO_BLOCKS.keys()
-
- @property
- def description(self):
- return (
- "Auto Modular pipeline for text-to-image and image-to-image using Flux.\n"
- + "- for text-to-image generation, all you need to provide is `prompt`\n"
- + "- for image-to-image generation, you need to provide either `image` or `image_latents`"
- )
-
-
-class FluxKontextAutoBlocks(FluxAutoBlocks):
- model_name = "flux-kontext"
-
- block_classes = AUTO_BLOCKS_KONTEXT.values()
- block_names = AUTO_BLOCKS_KONTEXT.keys()
-
-
-TEXT2IMAGE_BLOCKS = InsertableDict(
- [
- ("text_encoder", FluxTextEncoderStep()),
- ("input", FluxTextInputStep()),
- ("prepare_latents", FluxPrepareLatentsStep()),
- ("set_timesteps", FluxSetTimestepsStep()),
- ("prepare_rope_inputs", FluxRoPEInputsStep()),
- ("denoise", FluxDenoiseStep()),
- ("decode", FluxDecodeStep()),
- ]
-)
-
-IMAGE2IMAGE_BLOCKS = InsertableDict(
- [
- ("text_encoder", FluxTextEncoderStep()),
- ("vae_encoder", FluxVaeEncoderDynamicStep()),
- ("input", FluxImg2ImgInputStep()),
- ("prepare_latents", FluxPrepareLatentsStep()),
- ("set_timesteps", FluxImg2ImgSetTimestepsStep()),
- ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
- ("prepare_rope_inputs", FluxRoPEInputsStep()),
- ("denoise", FluxDenoiseStep()),
- ("decode", FluxDecodeStep()),
- ]
-)
-
-FLUX_KONTEXT_BLOCKS = InsertableDict(
- [
- ("text_encoder", FluxTextEncoderStep()),
- ("vae_encoder", FluxVaeEncoderDynamicStep(sample_mode="argmax")),
- ("input", FluxKontextInputStep()),
- ("prepare_latents", FluxPrepareLatentsStep()),
- ("set_timesteps", FluxSetTimestepsStep()),
- ("prepare_rope_inputs", FluxKontextRoPEInputsStep()),
- ("denoise", FluxKontextDenoiseStep()),
- ("decode", FluxDecodeStep()),
- ]
-)
-
-ALL_BLOCKS = {
- "text2image": TEXT2IMAGE_BLOCKS,
- "img2img": IMAGE2IMAGE_BLOCKS,
- "auto": AUTO_BLOCKS,
- "auto_kontext": AUTO_BLOCKS_KONTEXT,
- "kontext": FLUX_KONTEXT_BLOCKS,
-}
diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py
new file mode 100644
index 000000000000..f2e78e933448
--- /dev/null
+++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py
@@ -0,0 +1,586 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils import logging
+from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
+from ..modular_pipeline_utils import InsertableDict, OutputParam
+from .before_denoise import (
+ FluxImg2ImgPrepareLatentsStep,
+ FluxImg2ImgSetTimestepsStep,
+ FluxPrepareLatentsStep,
+ FluxRoPEInputsStep,
+ FluxSetTimestepsStep,
+)
+from .decoders import FluxDecodeStep
+from .denoise import FluxDenoiseStep
+from .encoders import (
+ FluxProcessImagesInputStep,
+ FluxTextEncoderStep,
+ FluxVaeEncoderStep,
+)
+from .inputs import (
+ FluxAdditionalInputsStep,
+ FluxTextInputStep,
+)
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+# vae encoder (run before before_denoise)
+
+
+# auto_docstring
+class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
+ """
+    Vae encoder step that preprocesses and encodes the image inputs into their latent representations.
+
+ Components:
+ image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`)
+
+ Inputs:
+ resized_image (`None`, *optional*):
+ TODO: Add description.
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ processed_image (`None`):
+ TODO: Add description.
+ image_latents (`Tensor`):
+ The latents representing the reference image
+ """
+
+ model_name = "flux"
+
+ block_classes = [FluxProcessImagesInputStep(), FluxVaeEncoderStep()]
+ block_names = ["preprocess", "encode"]
+
+ @property
+ def description(self) -> str:
+ return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
+
+
+# auto_docstring
+class FluxAutoVaeEncoderStep(AutoPipelineBlocks):
+ """
+    Vae encoder step that encodes the image inputs into their latent representations.
+    This is an auto pipeline block that works for img2img tasks.
+    - `FluxImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.
+    - if `image` is not provided, this step is skipped.
+
+ Components:
+ image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`)
+
+ Inputs:
+ resized_image (`None`, *optional*):
+ TODO: Add description.
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ processed_image (`None`):
+ TODO: Add description.
+ image_latents (`Tensor`):
+ The latents representing the reference image
+ """
+
+ model_name = "flux"
+ block_classes = [FluxImg2ImgVaeEncoderStep]
+ block_names = ["img2img"]
+ block_trigger_inputs = ["image"]
+
+ @property
+ def description(self):
+ return (
+ "Vae encoder step that encode the image inputs into their latent representations.\n"
+ + "This is an auto pipeline block that works for img2img tasks.\n"
+ + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided."
+ + " - if `image` is not provided, step will be skipped."
+ )
+
+
+# before_denoise: text2img
+# auto_docstring
+class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
+ """
+ Before denoise step that prepares the inputs for the denoise step in text-to-image generation.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.
+ Can be generated in input step.
+ dtype (`dtype`, *optional*):
+ The dtype of the model inputs
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ prompt_embeds (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ The initial latents to use for the denoising process
+ timesteps (`Tensor`):
+ The timesteps to use for inference
+ num_inference_steps (`int`):
+ The number of denoising steps to perform at inference time
+ guidance (`Tensor`):
+ Optional guidance to be used.
+ txt_ids (`list`):
+ The sequence lengths of the prompt embeds, used for RoPE calculation.
+ img_ids (`list`):
+ The sequence lengths of the image latents, used for RoPE calculation.
+ """
+
+ model_name = "flux"
+ block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()]
+ block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"]
+
+ @property
+ def description(self):
+ return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation."
+
+
+# before_denoise: img2img
+# auto_docstring
+class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
+ """
+    Before denoise step that prepares the inputs for the denoise step for the img2img task.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.
+ Can be generated in input step.
+ dtype (`dtype`, *optional*):
+ The dtype of the model inputs
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ strength (`None`, *optional*, defaults to 0.6):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ image_latents (`Tensor`):
+ The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
+ step.
+ prompt_embeds (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ The initial latents to use for the denoising process
+ timesteps (`Tensor`):
+ The timesteps to use for inference
+ num_inference_steps (`int`):
+ The number of denoising steps to perform at inference time
+ guidance (`Tensor`):
+ Optional guidance to be used.
+ initial_noise (`Tensor`):
+ The initial random noised used for inpainting denoising.
+ txt_ids (`list`):
+ The sequence lengths of the prompt embeds, used for RoPE calculation.
+ img_ids (`list`):
+ The sequence lengths of the image latents, used for RoPE calculation.
+ """
+
+ model_name = "flux"
+ block_classes = [
+ FluxPrepareLatentsStep(),
+ FluxImg2ImgSetTimestepsStep(),
+ FluxImg2ImgPrepareLatentsStep(),
+ FluxRoPEInputsStep(),
+ ]
+ block_names = ["prepare_latents", "set_timesteps", "prepare_img2img_latents", "prepare_rope_inputs"]
+
+ @property
+ def description(self):
+ return "Before denoise step that prepare the inputs for the denoise step for img2img task."
+
+
+# before_denoise: all task (text2img, img2img)
+# auto_docstring
+class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
+ """
+    Before denoise step that prepares the inputs for the denoise step.
+    This is an auto pipeline block that works for text2image and img2img tasks.
+ - `FluxBeforeDenoiseStep` (text2image) is used.
+ - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ height (`int`):
+ TODO: Add description.
+ width (`int`):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.
+ Can be generated in input step.
+ dtype (`dtype`, *optional*):
+ The dtype of the model inputs
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ strength (`None`, *optional*, defaults to 0.6):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ image_latents (`Tensor`, *optional*):
+ The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
+ step.
+ prompt_embeds (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ The initial latents to use for the denoising process
+ timesteps (`Tensor`):
+ The timesteps to use for inference
+ num_inference_steps (`int`):
+ The number of denoising steps to perform at inference time
+ guidance (`Tensor`):
+ Optional guidance to be used.
+ initial_noise (`Tensor`):
+ The initial random noised used for inpainting denoising.
+ txt_ids (`list`):
+ The sequence lengths of the prompt embeds, used for RoPE calculation.
+ img_ids (`list`):
+ The sequence lengths of the image latents, used for RoPE calculation.
+ """
+
+ model_name = "flux"
+ block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep]
+ block_names = ["img2img", "text2image"]
+ block_trigger_inputs = ["image_latents", None]
+
+ @property
+ def description(self):
+ return (
+ "Before denoise step that prepare the inputs for the denoise step.\n"
+ + "This is an auto pipeline block that works for text2image.\n"
+ + " - `FluxBeforeDenoiseStep` (text2image) is used.\n"
+ + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n"
+ )
+
+
+# inputs: text2image/img2img
+
+
+# auto_docstring
+class FluxImg2ImgInputStep(SequentialPipelineBlocks):
+ """
+    Input step that prepares the inputs for the img2img denoising step. It:
+    - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).
+    - update height/width based `image_latents`, patchify `image_latents`.
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ pooled_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be generated from text_encoder step.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
+ dtype (`dtype`):
+ Data type of model tensor inputs (determined by `prompt_embeds`)
+ prompt_embeds (`Tensor`):
+ text embeddings used to guide the image generation
+ pooled_prompt_embeds (`Tensor`):
+ pooled text embeddings used to guide the image generation
+ image_height (`int`):
+ The height of the image latents
+ image_width (`int`):
+ The width of the image latents
+ """
+
+ model_name = "flux"
+ block_classes = [FluxTextInputStep(), FluxAdditionalInputsStep()]
+ block_names = ["text_inputs", "additional_inputs"]
+
+ @property
+ def description(self):
+ return "Input step that prepares the inputs for the img2img denoising step. It:\n"
+ " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
+ " - update height/width based `image_latents`, patchify `image_latents`."
+
+
+# auto_docstring
+class FluxAutoInputStep(AutoPipelineBlocks):
+ """
+    Input step that standardizes the inputs for the denoising step, e.g. makes sure inputs have a consistent
+    batch size and are patchified.
+ This is an auto pipeline block that works for text2image/img2img tasks.
+ - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.
+ - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ pooled_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be generated from text_encoder step.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
+ dtype (`dtype`):
+ Data type of model tensor inputs (determined by `prompt_embeds`)
+ prompt_embeds (`Tensor`):
+ text embeddings used to guide the image generation
+ pooled_prompt_embeds (`Tensor`):
+ pooled text embeddings used to guide the image generation
+ image_height (`int`):
+ The height of the image latents
+ image_width (`int`):
+ The width of the image latents
+ """
+
+ model_name = "flux"
+
+ block_classes = [FluxImg2ImgInputStep, FluxTextInputStep]
+ block_names = ["img2img", "text2image"]
+ block_trigger_inputs = ["image_latents", None]
+
+ @property
+ def description(self):
+ return (
+ "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
+ " This is an auto pipeline block that works for text2image/img2img tasks.\n"
+ + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n"
+ + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n"
+ )
+
+
+# auto_docstring
+class FluxCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ Core step that performs the denoising process for Flux.
+ This step supports text-to-image and image-to-image tasks for Flux:
+ - for image-to-image generation, you need to provide `image_latents`
+ - for text-to-image generation, all you need to provide is prompt embeddings.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ pooled_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be generated from text_encoder step.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ strength (`None`, *optional*, defaults to 0.6):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
+ model_name = "flux"
+ block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxDenoiseStep]
+ block_names = ["input", "before_denoise", "denoise"]
+
+ @property
+ def description(self):
+ return (
+ "Core step that performs the denoising process for Flux.\n"
+ + "This step supports text-to-image and image-to-image tasks for Flux:\n"
+ + " - for image-to-image generation, you need to provide `image_latents`\n"
+ + " - for text-to-image generation, all you need to provide is prompt embeddings."
+ )
+
+ @property
+ def outputs(self):
+ return [
+ OutputParam.template("latents"),
+ ]
+
+
+# Auto blocks (text2image and img2img)
+AUTO_BLOCKS = InsertableDict(
+ [
+ ("text_encoder", FluxTextEncoderStep()),
+ ("vae_encoder", FluxAutoVaeEncoderStep()),
+ ("denoise", FluxCoreDenoiseStep()),
+ ("decode", FluxDecodeStep()),
+ ]
+)
+
+
+# auto_docstring
+class FluxAutoBlocks(SequentialPipelineBlocks):
+ """
+ Auto Modular pipeline for text-to-image and image-to-image using Flux.
+
+ Supported workflows:
+ - `text2image`: requires `prompt`
+ - `image2image`: requires `image`, `prompt`
+
+ Components:
+ text_encoder (`CLIPTextModel`) tokenizer (`CLIPTokenizer`) text_encoder_2 (`T5EncoderModel`) tokenizer_2
+ (`T5TokenizerFast`) image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`)
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ prompt_2 (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`int`, *optional*, defaults to 512):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ resized_image (`None`, *optional*):
+ TODO: Add description.
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ strength (`None`, *optional*, defaults to 0.6):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ output_type (`None`, *optional*, defaults to pil):
+ TODO: Add description.
+
+ Outputs:
+ images (`list`):
+ Generated images.
+ """
+
+ model_name = "flux"
+
+ block_classes = AUTO_BLOCKS.values()
+ block_names = AUTO_BLOCKS.keys()
+
+ _workflow_map = {
+ "text2image": {"prompt": True},
+ "image2image": {"image": True, "prompt": True},
+ }
+
+ @property
+ def description(self):
+ return "Auto Modular pipeline for text-to-image and image-to-image using Flux."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("images")]
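
A minimal usage sketch for the blocks defined above (the import path follows the updated `__init__.py`; the instantiation and inspection calls are assumptions based on the modular pipeline blocks API, not something this diff demonstrates):

```py
from diffusers.modular_pipelines.flux import FluxAutoBlocks

blocks = FluxAutoBlocks()

# The nested auto blocks dispatch on trigger inputs at call time: passing `image`
# activates the img2img VAE-encode path, and `image_latents` routes the core
# denoise step through the img2img branch; a text-only call uses the text2image branches.
print(blocks.description)
print(list(blocks.block_names))
```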
diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py
new file mode 100644
index 000000000000..b5a5dbf78c0e
--- /dev/null
+++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py
@@ -0,0 +1,585 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils import logging
+from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
+from ..modular_pipeline_utils import InsertableDict, OutputParam
+from .before_denoise import (
+ FluxKontextRoPEInputsStep,
+ FluxPrepareLatentsStep,
+ FluxRoPEInputsStep,
+ FluxSetTimestepsStep,
+)
+from .decoders import FluxDecodeStep
+from .denoise import FluxKontextDenoiseStep
+from .encoders import (
+ FluxKontextProcessImagesInputStep,
+ FluxTextEncoderStep,
+ FluxVaeEncoderStep,
+)
+from .inputs import (
+ FluxKontextAdditionalInputsStep,
+ FluxKontextSetResolutionStep,
+ FluxTextInputStep,
+)
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+# Flux Kontext vae encoder (run before before_denoise)
+# auto_docstring
+class FluxKontextVaeEncoderStep(SequentialPipelineBlocks):
+ """
+    Vae encoder step that preprocesses and encodes the image inputs into their latent representations.
+
+ Components:
+ image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`)
+
+ Inputs:
+ image (`None`, *optional*):
+ TODO: Add description.
+ _auto_resize (`bool`, *optional*, defaults to True):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ processed_image (`None`):
+ TODO: Add description.
+ image_latents (`Tensor`):
+ The latents representing the reference image
+ """
+
+ model_name = "flux-kontext"
+
+ block_classes = [FluxKontextProcessImagesInputStep(), FluxVaeEncoderStep(sample_mode="argmax")]
+ block_names = ["preprocess", "encode"]
+
+ @property
+ def description(self) -> str:
+ return "Vae encoder step that preprocess andencode the image inputs into their latent representations."
+
+
+# auto_docstring
+class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks):
+ """
+    Vae encoder step that encodes the image inputs into their latent representations.
+    This is an auto pipeline block that works for image-conditioned tasks.
+    - `FluxKontextVaeEncoderStep` (image_conditioned) is used when `image` is provided.
+    - if `image` is not provided, this step is skipped.
+
+ Components:
+ image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`)
+
+ Inputs:
+ image (`None`, *optional*):
+ TODO: Add description.
+ _auto_resize (`bool`, *optional*, defaults to True):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ processed_image (`None`):
+ TODO: Add description.
+ image_latents (`Tensor`):
+ The latents representing the reference image
+ """
+
+ model_name = "flux-kontext"
+
+ block_classes = [FluxKontextVaeEncoderStep]
+ block_names = ["image_conditioned"]
+ block_trigger_inputs = ["image"]
+
+ @property
+ def description(self):
+ return (
+ "Vae encoder step that encode the image inputs into their latent representations.\n"
+ + "This is an auto pipeline block that works for image-conditioned tasks.\n"
+ + " - `FluxKontextVaeEncoderStep` (image_conditioned) is used when only `image` is provided."
+ + " - if `image` is not provided, step will be skipped."
+ )
+
+
+# before_denoise: text2img
+# auto_docstring
+class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks):
+ """
+    Before denoise step that prepares the inputs for the denoise step for Flux Kontext for text-to-image tasks.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.
+ Can be generated in input step.
+ dtype (`dtype`, *optional*):
+ The dtype of the model inputs
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ prompt_embeds (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ The initial latents to use for the denoising process
+ timesteps (`Tensor`):
+ The timesteps to use for inference
+ num_inference_steps (`int`):
+ The number of denoising steps to perform at inference time
+ guidance (`Tensor`):
+ Optional guidance to be used.
+ txt_ids (`list`):
+ The sequence lengths of the prompt embeds, used for RoPE calculation.
+ img_ids (`list`):
+ The sequence lengths of the image latents, used for RoPE calculation.
+ """
+
+ model_name = "flux-kontext"
+
+ block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()]
+ block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"]
+
+ @property
+ def description(self):
+ return "Before denoise step that prepares the inputs for the denoise step for Flux Kontext\n"
+ "for text-to-image tasks."
+
+
+# before_denoise: image-conditioned
+# auto_docstring
+class FluxKontextImageConditionedBeforeDenoiseStep(SequentialPipelineBlocks):
+ """
+    Before denoise step that prepares the inputs for the denoise step for Flux Kontext
+ for image-conditioned tasks.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.
+ Can be generated in input step.
+ dtype (`dtype`, *optional*):
+ The dtype of the model inputs
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ image_height (`None`, *optional*):
+ TODO: Add description.
+ image_width (`None`, *optional*):
+ TODO: Add description.
+ prompt_embeds (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ The initial latents to use for the denoising process
+ timesteps (`Tensor`):
+ The timesteps to use for inference
+ num_inference_steps (`int`):
+ The number of denoising steps to perform at inference time
+ guidance (`Tensor`):
+ Optional guidance to be used.
+ txt_ids (`list`):
+ The sequence lengths of the prompt embeds, used for RoPE calculation.
+ img_ids (`list`):
+ The sequence lengths of the image latents, used for RoPE calculation.
+ """
+
+ model_name = "flux-kontext"
+
+ block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxKontextRoPEInputsStep()]
+ block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"]
+
+ @property
+ def description(self):
+ return (
+ "Before denoise step that prepare the inputs for the denoise step for Flux Kontext\n"
+ "for image-conditioned tasks."
+ )
+
+
+# auto_docstring
+class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks):
+ """
+    Before denoise step that prepares the inputs for the denoise step.
+    This is an auto pipeline block that works for text2image and image-conditioned tasks.
+ - `FluxKontextBeforeDenoiseStep` (text2image) is used.
+ - `FluxKontextImageConditionedBeforeDenoiseStep` (image_conditioned) is used when only `image_latents` is
+ provided.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.
+ Can be generated in input step.
+ dtype (`dtype`, *optional*):
+ The dtype of the model inputs
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ image_height (`None`, *optional*):
+ TODO: Add description.
+ image_width (`None`, *optional*):
+ TODO: Add description.
+ prompt_embeds (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ The initial latents to use for the denoising process
+ timesteps (`Tensor`):
+ The timesteps to use for inference
+ num_inference_steps (`int`):
+ The number of denoising steps to perform at inference time
+ guidance (`Tensor`):
+ Optional guidance to be used.
+ txt_ids (`list`):
+ The sequence lengths of the prompt embeds, used for RoPE calculation.
+ img_ids (`list`):
+ The sequence lengths of the image latents, used for RoPE calculation.
+ """
+
+ model_name = "flux-kontext"
+
+ block_classes = [FluxKontextImageConditionedBeforeDenoiseStep, FluxKontextBeforeDenoiseStep]
+ block_names = ["image_conditioned", "text2image"]
+ block_trigger_inputs = ["image_latents", None]
+
+ @property
+ def description(self):
+ return (
+ "Before denoise step that prepare the inputs for the denoise step.\n"
+ + "This is an auto pipeline block that works for text2image.\n"
+ + " - `FluxKontextBeforeDenoiseStep` (text2image) is used.\n"
+ + " - `FluxKontextImageConditionedBeforeDenoiseStep` (image_conditioned) is used when only `image_latents` is provided.\n"
+ )
+
+
+# inputs: Flux Kontext
+# auto_docstring
+class FluxKontextInputStep(SequentialPipelineBlocks):
+ """
+    Input step that prepares the inputs for both the text2img and img2img denoising steps. It:
+ - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).
+ - update height/width based `image_latents`, patchify `image_latents`.
+
+ Inputs:
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ max_area (`int`, *optional*, defaults to 1048576):
+ TODO: Add description.
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ pooled_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be generated from text_encoder step.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ height (`int`):
+ The height of the initial noisy latents
+ width (`int`):
+ The width of the initial noisy latents
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
+ dtype (`dtype`):
+ Data type of model tensor inputs (determined by `prompt_embeds`)
+ prompt_embeds (`Tensor`):
+ text embeddings used to guide the image generation
+ pooled_prompt_embeds (`Tensor`):
+ pooled text embeddings used to guide the image generation
+ image_height (`int`):
+ The height of the image latents
+ image_width (`int`):
+ The width of the image latents
+ """
+
+ model_name = "flux-kontext"
+ block_classes = [FluxKontextSetResolutionStep(), FluxTextInputStep(), FluxKontextAdditionalInputsStep()]
+ block_names = ["set_resolution", "text_inputs", "additional_inputs"]
+
+ @property
+ def description(self):
+ return (
+ "Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n"
+ " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
+ " - update height/width based `image_latents`, patchify `image_latents`."
+ )
+
+
+# auto_docstring
+class FluxKontextAutoInputStep(AutoPipelineBlocks):
+ """
+    Input step that standardizes the inputs for the denoising step, e.g. makes sure inputs have a consistent
+    batch size and are patchified.
+    This is an auto pipeline block that works for text2image/img2img tasks.
+    - `FluxKontextInputStep` (image_conditioned) is used when `image_latents` is provided.
+    - `FluxKontextInputStep` is also capable of handling the text2image task when `image_latents` isn't present.
+
+ Inputs:
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ max_area (`int`, *optional*, defaults to 1048576):
+ TODO: Add description.
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ pooled_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be generated from text_encoder step.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ height (`int`):
+ The height of the initial noisy latents
+ width (`int`):
+ The width of the initial noisy latents
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
+ dtype (`dtype`):
+ Data type of model tensor inputs (determined by `prompt_embeds`)
+ prompt_embeds (`Tensor`):
+ text embeddings used to guide the image generation
+ pooled_prompt_embeds (`Tensor`):
+ pooled text embeddings used to guide the image generation
+ image_height (`int`):
+ The height of the image latents
+ image_width (`int`):
+ The width of the image latents
+ """
+
+ model_name = "flux-kontext"
+ block_classes = [FluxKontextInputStep, FluxTextInputStep]
+ block_names = ["image_conditioned", "text2image"]
+ block_trigger_inputs = ["image_latents", None]
+
+ @property
+ def description(self):
+ return (
+ "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
+ " This is an auto pipeline block that works for text2image/img2img tasks.\n"
+ + " - `FluxKontextInputStep` (image_conditioned) is used when `image_latents` is provided.\n"
+ + " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present."
+ )
+
+
+# auto_docstring
+class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ Core step that performs the denoising process for Flux Kontext.
+ This step supports text-to-image and image-conditioned tasks for Flux Kontext:
+ - for image-conditioned generation, you need to provide `image_latents`
+ - for text-to-image generation, all you need to provide is prompt embeddings.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`)
+
+ Inputs:
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ max_area (`int`, *optional*, defaults to 1048576):
+ TODO: Add description.
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ pooled_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated pooled text embeddings. Can be generated from text_encoder step.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
+ model_name = "flux-kontext"
+ block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextDenoiseStep]
+ block_names = ["input", "before_denoise", "denoise"]
+
+ @property
+ def description(self):
+ return (
+ "Core step that performs the denoising process for Flux Kontext.\n"
+ + "This step supports text-to-image and image-conditioned tasks for Flux Kontext:\n"
+ + " - for image-conditioned generation, you need to provide `image_latents`\n"
+ + " - for text-to-image generation, all you need to provide is prompt embeddings."
+ )
+
+ @property
+ def outputs(self):
+ return [
+ OutputParam.template("latents"),
+ ]
+
+
+AUTO_BLOCKS_KONTEXT = InsertableDict(
+ [
+ ("text_encoder", FluxTextEncoderStep()),
+ ("vae_encoder", FluxKontextAutoVaeEncoderStep()),
+ ("denoise", FluxKontextCoreDenoiseStep()),
+ ("decode", FluxDecodeStep()),
+ ]
+)
+
+
+# auto_docstring
+class FluxKontextAutoBlocks(SequentialPipelineBlocks):
+ """
+ Modular pipeline for image-to-image using Flux Kontext.
+
+ Supported workflows:
+ - `image_conditioned`: requires `image`, `prompt`
+ - `text2image`: requires `prompt`
+
+ Components:
+ text_encoder (`CLIPTextModel`) tokenizer (`CLIPTokenizer`) text_encoder_2 (`T5EncoderModel`) tokenizer_2
+ (`T5TokenizerFast`) image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`)
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ prompt_2 (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`int`, *optional*, defaults to 512):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image (`None`, *optional*):
+ TODO: Add description.
+ _auto_resize (`bool`, *optional*, defaults to True):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ max_area (`int`, *optional*, defaults to 1048576):
+ TODO: Add description.
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 3.5):
+ TODO: Add description.
+ output_type (`None`, *optional*, defaults to pil):
+ TODO: Add description.
+
+ Outputs:
+ images (`list`):
+ Generated images.
+ """
+
+ model_name = "flux-kontext"
+
+ block_classes = AUTO_BLOCKS_KONTEXT.values()
+ block_names = AUTO_BLOCKS_KONTEXT.keys()
+ _workflow_map = {
+ "image_conditioned": {"image": True, "prompt": True},
+ "text2image": {"prompt": True},
+ }
+
+ @property
+ def description(self):
+ return "Modular pipeline for image-to-image using Flux Kontext."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("images")]
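+
+
+# Illustrative sketch (not part of the preset above): `AUTO_BLOCKS_KONTEXT` is an `InsertableDict`,
+# so the preset can be copied and individual steps swapped by name before assembling blocks with
+# `SequentialPipelineBlocks.from_blocks_dict` (assumed available, as in the modular pipelines guide):
+#
+#     blocks_dict = AUTO_BLOCKS_KONTEXT.copy()
+#     blocks_dict["decode"] = FluxDecodeStep()  # placeholder swap; replace or insert any step by name
+#     blocks = SequentialPipelineBlocks.from_blocks_dict(blocks_dict)
+#
+# The swapped step is only an example of the mechanism, not a recommended configuration.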
diff --git a/src/diffusers/modular_pipelines/flux2/__init__.py b/src/diffusers/modular_pipelines/flux2/__init__.py
index 74907a9af806..d7cc8badcaf7 100644
--- a/src/diffusers/modular_pipelines/flux2/__init__.py
+++ b/src/diffusers/modular_pipelines/flux2/__init__.py
@@ -21,44 +21,14 @@
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
- _import_structure["encoders"] = [
- "Flux2TextEncoderStep",
- "Flux2RemoteTextEncoderStep",
- "Flux2VaeEncoderStep",
- ]
- _import_structure["before_denoise"] = [
- "Flux2SetTimestepsStep",
- "Flux2PrepareLatentsStep",
- "Flux2RoPEInputsStep",
- "Flux2PrepareImageLatentsStep",
- ]
- _import_structure["denoise"] = [
- "Flux2LoopDenoiser",
- "Flux2LoopAfterDenoiser",
- "Flux2DenoiseLoopWrapper",
- "Flux2DenoiseStep",
- ]
- _import_structure["decoders"] = ["Flux2DecodeStep"]
- _import_structure["inputs"] = [
- "Flux2ProcessImagesInputStep",
- "Flux2TextInputStep",
- ]
- _import_structure["modular_blocks_flux2"] = [
- "ALL_BLOCKS",
- "AUTO_BLOCKS",
- "REMOTE_AUTO_BLOCKS",
- "TEXT2IMAGE_BLOCKS",
- "IMAGE_CONDITIONED_BLOCKS",
- "Flux2AutoBlocks",
- "Flux2AutoVaeEncoderStep",
- "Flux2CoreDenoiseStep",
- "Flux2VaeEncoderSequentialStep",
- ]
- _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks", "Flux2KleinBaseAutoBlocks"]
+ _import_structure["encoders"] = ["Flux2RemoteTextEncoderStep"]
+ _import_structure["modular_blocks_flux2"] = ["Flux2AutoBlocks"]
+ _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks"]
+ _import_structure["modular_blocks_flux2_klein_base"] = ["Flux2KleinBaseAutoBlocks"]
_import_structure["modular_pipeline"] = [
- "Flux2ModularPipeline",
- "Flux2KleinModularPipeline",
"Flux2KleinBaseModularPipeline",
+ "Flux2KleinModularPipeline",
+ "Flux2ModularPipeline",
]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -68,43 +38,10 @@
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
- from .before_denoise import (
- Flux2PrepareImageLatentsStep,
- Flux2PrepareLatentsStep,
- Flux2RoPEInputsStep,
- Flux2SetTimestepsStep,
- )
- from .decoders import Flux2DecodeStep
- from .denoise import (
- Flux2DenoiseLoopWrapper,
- Flux2DenoiseStep,
- Flux2LoopAfterDenoiser,
- Flux2LoopDenoiser,
- )
- from .encoders import (
- Flux2RemoteTextEncoderStep,
- Flux2TextEncoderStep,
- Flux2VaeEncoderStep,
- )
- from .inputs import (
- Flux2ProcessImagesInputStep,
- Flux2TextInputStep,
- )
- from .modular_blocks_flux2 import (
- ALL_BLOCKS,
- AUTO_BLOCKS,
- IMAGE_CONDITIONED_BLOCKS,
- REMOTE_AUTO_BLOCKS,
- TEXT2IMAGE_BLOCKS,
- Flux2AutoBlocks,
- Flux2AutoVaeEncoderStep,
- Flux2CoreDenoiseStep,
- Flux2VaeEncoderSequentialStep,
- )
- from .modular_blocks_flux2_klein import (
- Flux2KleinAutoBlocks,
- Flux2KleinBaseAutoBlocks,
- )
+ from .encoders import Flux2RemoteTextEncoderStep
+ from .modular_blocks_flux2 import Flux2AutoBlocks
+ from .modular_blocks_flux2_klein import Flux2KleinAutoBlocks
+ from .modular_blocks_flux2_klein_base import Flux2KleinBaseAutoBlocks
from .modular_pipeline import Flux2KleinBaseModularPipeline, Flux2KleinModularPipeline, Flux2ModularPipeline
else:
import sys
diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py
index 41a0ff7dee28..b1033a7dff9e 100644
--- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py
+++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py
@@ -12,10 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import List
-
-import PIL.Image
-import torch
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
@@ -30,7 +26,6 @@
from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep
from .denoise import Flux2DenoiseStep
from .encoders import (
- Flux2RemoteTextEncoderStep,
Flux2TextEncoderStep,
Flux2VaeEncoderStep,
)
@@ -43,26 +38,69 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-Flux2VaeEncoderBlocks = InsertableDict(
- [
- ("preprocess", Flux2ProcessImagesInputStep()),
- ("encode", Flux2VaeEncoderStep()),
- ]
-)
-
-
+# auto_docstring
class Flux2VaeEncoderSequentialStep(SequentialPipelineBlocks):
+ """
+ VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning.
+
+ Components:
+ image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`)
+
+ Inputs:
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ condition_images (`list`):
+ TODO: Add description.
+ image_latents (`list`):
+ List of latent representations for each reference image
+ """
+
model_name = "flux2"
- block_classes = Flux2VaeEncoderBlocks.values()
- block_names = Flux2VaeEncoderBlocks.keys()
+ block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()]
+ block_names = ["preprocess", "encode"]
@property
def description(self) -> str:
return "VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning."
+# auto_docstring
class Flux2AutoVaeEncoderStep(AutoPipelineBlocks):
+ """
+ VAE encoder step that encodes the image inputs into their latent representations.
+ This is an auto pipeline block that works for image conditioning tasks.
+ - `Flux2VaeEncoderSequentialStep` is used when `image` is provided.
+ - If `image` is not provided, step will be skipped.
+
+ Components:
+ image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`)
+
+ Inputs:
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ condition_images (`list`):
+ TODO: Add description.
+ image_latents (`list`):
+ List of latent representations for each reference image
+ """
+
block_classes = [Flux2VaeEncoderSequentialStep]
block_names = ["img_conditioning"]
block_trigger_inputs = ["image"]
@@ -80,7 +118,6 @@ def description(self):
Flux2CoreDenoiseBlocks = InsertableDict(
[
("input", Flux2TextInputStep()),
- ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
("prepare_latents", Flux2PrepareLatentsStep()),
("set_timesteps", Flux2SetTimestepsStep()),
("prepare_guidance", Flux2PrepareGuidanceStep()),
@@ -91,7 +128,47 @@ def description(self):
)
+# auto_docstring
class Flux2CoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ Core denoise step that performs the denoising process for Flux2-dev.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 4.0):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`Tensor`, *optional*):
+ Packed image latents for conditioning. Shape: (B, img_seq_len, C)
+ image_latent_ids (`Tensor`, *optional*):
+ Position IDs for image latents. Shape: (B, img_seq_len, 4)
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
model_name = "flux2"
block_classes = Flux2CoreDenoiseBlocks.values()
@@ -99,108 +176,181 @@ class Flux2CoreDenoiseStep(SequentialPipelineBlocks):
@property
def description(self):
- return (
- "Core denoise step that performs the denoising process for Flux2-dev.\n"
- " - `Flux2TextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n"
- " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n"
- " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n"
- " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n"
- " - `Flux2PrepareGuidanceStep` (prepare_guidance) prepares the guidance tensor for the denoising step.\n"
- " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n"
- " - `Flux2DenoiseStep` (denoise) iteratively denoises the latents.\n"
- " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n"
- )
+ return "Core denoise step that performs the denoising process for Flux2-dev."
@property
def outputs(self):
return [
- OutputParam(
- name="latents",
- type_hint=torch.Tensor,
- description="The latents from the denoising step.",
- )
+ OutputParam.template("latents"),
]
-AUTO_BLOCKS = InsertableDict(
+Flux2ImageConditionedCoreDenoiseBlocks = InsertableDict(
[
- ("text_encoder", Flux2TextEncoderStep()),
- ("vae_encoder", Flux2AutoVaeEncoderStep()),
- ("denoise", Flux2CoreDenoiseStep()),
- ("decode", Flux2DecodeStep()),
+ ("input", Flux2TextInputStep()),
+ ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
+ ("prepare_latents", Flux2PrepareLatentsStep()),
+ ("set_timesteps", Flux2SetTimestepsStep()),
+ ("prepare_guidance", Flux2PrepareGuidanceStep()),
+ ("prepare_rope_inputs", Flux2RoPEInputsStep()),
+ ("denoise", Flux2DenoiseStep()),
+ ("after_denoise", Flux2UnpackLatentsStep()),
]
)
-REMOTE_AUTO_BLOCKS = InsertableDict(
- [
- ("text_encoder", Flux2RemoteTextEncoderStep()),
- ("vae_encoder", Flux2AutoVaeEncoderStep()),
- ("denoise", Flux2CoreDenoiseStep()),
- ("decode", Flux2DecodeStep()),
- ]
-)
-
+# auto_docstring
+class Flux2ImageConditionedCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ Core denoise step that performs the denoising process for Flux2-dev with image conditioning.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ image_latents (`list`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 4.0):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
-class Flux2AutoBlocks(SequentialPipelineBlocks):
model_name = "flux2"
- block_classes = AUTO_BLOCKS.values()
- block_names = AUTO_BLOCKS.keys()
+ block_classes = Flux2ImageConditionedCoreDenoiseBlocks.values()
+ block_names = Flux2ImageConditionedCoreDenoiseBlocks.keys()
@property
def description(self):
- return (
- "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2.\n"
- "- For text-to-image generation, all you need to provide is `prompt`.\n"
- "- For image-conditioned generation, you need to provide `image` (list of PIL images)."
- )
+ return "Core denoise step that performs the denoising process for Flux2-dev with image conditioning."
@property
def outputs(self):
return [
- OutputParam(
- name="images",
- type_hint=List[PIL.Image.Image],
- description="The images from the decoding step.",
- )
+ OutputParam.template("latents"),
]
-TEXT2IMAGE_BLOCKS = InsertableDict(
- [
- ("text_encoder", Flux2TextEncoderStep()),
- ("text_input", Flux2TextInputStep()),
- ("prepare_latents", Flux2PrepareLatentsStep()),
- ("set_timesteps", Flux2SetTimestepsStep()),
- ("prepare_guidance", Flux2PrepareGuidanceStep()),
- ("prepare_rope_inputs", Flux2RoPEInputsStep()),
- ("denoise", Flux2DenoiseStep()),
- ("after_denoise", Flux2UnpackLatentsStep()),
- ("decode", Flux2DecodeStep()),
- ]
-)
+class Flux2AutoCoreDenoiseStep(AutoPipelineBlocks):
+ model_name = "flux2"
+
+ block_classes = [Flux2ImageConditionedCoreDenoiseStep, Flux2CoreDenoiseStep]
+ block_names = ["image_conditioned", "text2image"]
+ block_trigger_inputs = ["image_latents", None]
+
+ @property
+ def description(self):
+ return (
+ "Auto core denoise step that performs the denoising process for Flux2-dev.\n"
+ "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n"
+ " - `Flux2CoreDenoiseStep` is used for text-to-image generation.\n"
+ " - `Flux2ImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n"
+ )
+
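+# Routing note (illustrative): with `block_trigger_inputs = ["image_latents", None]`, the
+# `image_conditioned` branch is selected whenever `image_latents` is present in the state, and the
+# `None` trigger makes `text2image` the default branch. For example,
+# `Flux2AutoCoreDenoiseStep().get_execution_blocks(image_latents=image_latents)` is expected to
+# resolve to the blocks of `Flux2ImageConditionedCoreDenoiseStep`, while calling it without inputs
+# resolves to `Flux2CoreDenoiseStep`.
+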
-IMAGE_CONDITIONED_BLOCKS = InsertableDict(
+AUTO_BLOCKS = InsertableDict(
[
("text_encoder", Flux2TextEncoderStep()),
- ("text_input", Flux2TextInputStep()),
- ("preprocess_images", Flux2ProcessImagesInputStep()),
- ("vae_encoder", Flux2VaeEncoderStep()),
- ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
- ("prepare_latents", Flux2PrepareLatentsStep()),
- ("set_timesteps", Flux2SetTimestepsStep()),
- ("prepare_guidance", Flux2PrepareGuidanceStep()),
- ("prepare_rope_inputs", Flux2RoPEInputsStep()),
- ("denoise", Flux2DenoiseStep()),
- ("after_denoise", Flux2UnpackLatentsStep()),
+ ("vae_encoder", Flux2AutoVaeEncoderStep()),
+ ("denoise", Flux2AutoCoreDenoiseStep()),
("decode", Flux2DecodeStep()),
]
)
-ALL_BLOCKS = {
- "text2image": TEXT2IMAGE_BLOCKS,
- "image_conditioned": IMAGE_CONDITIONED_BLOCKS,
- "auto": AUTO_BLOCKS,
- "remote": REMOTE_AUTO_BLOCKS,
-}
+
+# auto_docstring
+class Flux2AutoBlocks(SequentialPipelineBlocks):
+ """
+ Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2.
+
+ Supported workflows:
+ - `text2image`: requires `prompt`
+ - `image_conditioned`: requires `image`, `prompt`
+
+ Components:
+ text_encoder (`Mistral3ForConditionalGeneration`) tokenizer (`AutoProcessor`) image_processor
+ (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer
+ (`Flux2Transformer2DModel`)
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`int`, *optional*, defaults to 512):
+ TODO: Add description.
+ text_encoder_out_layers (`tuple`, *optional*, defaults to (10, 20, 30)):
+ TODO: Add description.
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ image_latents (`list`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`):
+ TODO: Add description.
+ num_inference_steps (`None`):
+ TODO: Add description.
+ timesteps (`None`):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ guidance_scale (`None`, *optional*, defaults to 4.0):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_latent_ids (`Tensor`, *optional*):
+ Position IDs for image latents. Shape: (B, img_seq_len, 4)
+ output_type (`None`, *optional*, defaults to pil):
+ TODO: Add description.
+
+ Outputs:
+ images (`list`):
+ Generated images.
+ """
+
+ model_name = "flux2"
+
+ block_classes = AUTO_BLOCKS.values()
+ block_names = AUTO_BLOCKS.keys()
+ _workflow_map = {
+ "text2image": {"prompt": True},
+ "image_conditioned": {"image": True, "prompt": True},
+ }
+
+ @property
+ def description(self):
+ return "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2."
+
+ @property
+ def outputs(self):
+ return [
+ OutputParam.template("images"),
+ ]
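+
+
+# End-to-end usage sketch (illustrative; the repo id is a placeholder and the call pattern follows
+# the standard modular pipeline workflow, which is assumed here rather than defined in this file):
+#
+#     import torch
+#
+#     blocks = Flux2AutoBlocks()
+#     pipe = blocks.init_pipeline("<flux2-modular-repo>")  # hypothetical modular repo id
+#     pipe.load_components(torch_dtype=torch.bfloat16)
+#     images = pipe(prompt="a cat", output="images")
+#
+# Passing `image` (a list of PIL images) in addition to `prompt` triggers the `image_conditioned`
+# workflow via `Flux2AutoVaeEncoderStep` and `Flux2AutoCoreDenoiseStep`.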
diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py
index 984832d77be5..5dbae43a5a7f 100644
--- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py
+++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py
@@ -12,30 +12,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import List
-
-import PIL.Image
-import torch
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict, OutputParam
from .before_denoise import (
- Flux2KleinBaseRoPEInputsStep,
Flux2PrepareImageLatentsStep,
Flux2PrepareLatentsStep,
Flux2RoPEInputsStep,
Flux2SetTimestepsStep,
)
from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep
-from .denoise import Flux2KleinBaseDenoiseStep, Flux2KleinDenoiseStep
+from .denoise import Flux2KleinDenoiseStep
from .encoders import (
- Flux2KleinBaseTextEncoderStep,
Flux2KleinTextEncoderStep,
Flux2VaeEncoderStep,
)
from .inputs import (
- Flux2KleinBaseTextInputStep,
Flux2ProcessImagesInputStep,
Flux2TextInputStep,
)
@@ -47,26 +40,72 @@
# VAE encoder
################
-Flux2KleinVaeEncoderBlocks = InsertableDict(
- [
- ("preprocess", Flux2ProcessImagesInputStep()),
- ("encode", Flux2VaeEncoderStep()),
- ]
-)
-
+# auto_docstring
class Flux2KleinVaeEncoderSequentialStep(SequentialPipelineBlocks):
- model_name = "flux2"
+ """
+ VAE encoder step that preprocesses and encodes the image inputs into their latent representations.
+
+ Components:
+ image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`)
+
+ Inputs:
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ condition_images (`list`):
+ TODO: Add description.
+ image_latents (`list`):
+ List of latent representations for each reference image
+ """
+
+ model_name = "flux2-klein"
- block_classes = Flux2KleinVaeEncoderBlocks.values()
- block_names = Flux2KleinVaeEncoderBlocks.keys()
+ block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()]
+ block_names = ["preprocess", "encode"]
@property
def description(self) -> str:
return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations."
+# auto_docstring
class Flux2KleinAutoVaeEncoderStep(AutoPipelineBlocks):
+ """
+ VAE encoder step that encodes the image inputs into their latent representations.
+ This is an auto pipeline block that works for image conditioning tasks.
+ - `Flux2KleinVaeEncoderSequentialStep` is used when `image` is provided.
+ - If `image` is not provided, step will be skipped.
+
+ Components:
+ image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`)
+
+ Inputs:
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ condition_images (`list`):
+ TODO: Add description.
+ image_latents (`list`):
+ List of latent representations for each reference image
+ """
+
+ model_name = "flux2-klein"
+
block_classes = [Flux2KleinVaeEncoderSequentialStep]
block_names = ["img_conditioning"]
block_trigger_inputs = ["image"]
@@ -88,7 +127,6 @@ def description(self):
Flux2KleinCoreDenoiseBlocks = InsertableDict(
[
("input", Flux2TextInputStep()),
- ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
("prepare_latents", Flux2PrepareLatentsStep()),
("set_timesteps", Flux2SetTimestepsStep()),
("prepare_rope_inputs", Flux2RoPEInputsStep()),
@@ -98,7 +136,46 @@ def description(self):
)
+# auto_docstring
class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image
+ generation.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`Tensor`, *optional*):
+ Packed image latents for conditioning. Shape: (B, img_seq_len, C)
+ image_latent_ids (`Tensor`, *optional*):
+ Position IDs for image latents. Shape: (B, img_seq_len, 4)
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
model_name = "flux2-klein"
block_classes = Flux2KleinCoreDenoiseBlocks.values()
@@ -106,127 +183,218 @@ class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks):
@property
def description(self):
- return (
- "Core denoise step that performs the denoising process for Flux2-Klein (distilled model).\n"
- " - `Flux2KleinTextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n"
- " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n"
- " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n"
- " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n"
- " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n"
- " - `Flux2KleinDenoiseStep` (denoise) iteratively denoises the latents.\n"
- " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n"
- )
+ return "Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image generation."
@property
def outputs(self):
return [
- OutputParam(
- name="latents",
- type_hint=torch.Tensor,
- description="The latents from the denoising step.",
- )
+ OutputParam.template("latents"),
]
-Flux2KleinBaseCoreDenoiseBlocks = InsertableDict(
+Flux2KleinImageConditionedCoreDenoiseBlocks = InsertableDict(
[
- ("input", Flux2KleinBaseTextInputStep()),
- ("prepare_latents", Flux2PrepareLatentsStep()),
+ ("input", Flux2TextInputStep()),
("prepare_image_latents", Flux2PrepareImageLatentsStep()),
+ ("prepare_latents", Flux2PrepareLatentsStep()),
("set_timesteps", Flux2SetTimestepsStep()),
- ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()),
- ("denoise", Flux2KleinBaseDenoiseStep()),
+ ("prepare_rope_inputs", Flux2RoPEInputsStep()),
+ ("denoise", Flux2KleinDenoiseStep()),
("after_denoise", Flux2UnpackLatentsStep()),
]
)
-class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks):
+# auto_docstring
+class Flux2KleinImageConditionedCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ image_latents (`list`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
model_name = "flux2-klein"
- block_classes = Flux2KleinBaseCoreDenoiseBlocks.values()
- block_names = Flux2KleinBaseCoreDenoiseBlocks.keys()
+
+ block_classes = Flux2KleinImageConditionedCoreDenoiseBlocks.values()
+ block_names = Flux2KleinImageConditionedCoreDenoiseBlocks.keys()
@property
def description(self):
- return "Core denoise step that performs the denoising process for Flux2-Klein (base model)."
- return (
- "Core denoise step that performs the denoising process for Flux2-Klein (base model).\n"
- " - `Flux2KleinBaseTextInputStep` (input) standardizes the text inputs (prompt_embeds + negative_prompt_embeds) for the denoising step.\n"
- " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n"
- " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n"
- " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n"
- " - `Flux2KleinBaseRoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids + negative_txt_ids) for the denoising step.\n"
- " - `Flux2KleinBaseDenoiseStep` (denoise) iteratively denoises the latents using Classifier-Free Guidance.\n"
- " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n"
- )
+ return "Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning."
@property
def outputs(self):
return [
- OutputParam(
- name="latents",
- type_hint=torch.Tensor,
- description="The latents from the denoising step.",
- )
+ OutputParam.template("latents"),
]
-###
-### Auto blocks
-###
-class Flux2KleinAutoBlocks(SequentialPipelineBlocks):
+# auto_docstring
+class Flux2KleinAutoCoreDenoiseStep(AutoPipelineBlocks):
+ """
+ Auto core denoise step that performs the denoising process for Flux2-Klein.
+ This is an auto pipeline block that works for text-to-image and image-conditioned generation.
+ - `Flux2KleinCoreDenoiseStep` is used for text-to-image generation.
+ - `Flux2KleinImageConditionedCoreDenoiseStep` is used for image-conditioned generation.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ image_latents (`list`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`):
+ TODO: Add description.
+ timesteps (`None`):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_latent_ids (`Tensor`, *optional*):
+ Position IDs for image latents. Shape: (B, img_seq_len, 4)
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
model_name = "flux2-klein"
- block_classes = [
- Flux2KleinTextEncoderStep(),
- Flux2KleinAutoVaeEncoderStep(),
- Flux2KleinCoreDenoiseStep(),
- Flux2DecodeStep(),
- ]
- block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
+ block_classes = [Flux2KleinImageConditionedCoreDenoiseStep, Flux2KleinCoreDenoiseStep]
+ block_names = ["image_conditioned", "text2image"]
+ block_trigger_inputs = ["image_latents", None]
@property
def description(self):
return (
- "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein.\n"
- + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n"
- + " - for text-to-image generation, all you need to provide is `prompt`.\n"
+ "Auto core denoise step that performs the denoising process for Flux2-Klein.\n"
+ "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n"
+ " - `Flux2KleinCoreDenoiseStep` is used for text-to-image generation.\n"
+ " - `Flux2KleinImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n"
)
- @property
- def outputs(self):
- return [
- OutputParam(
- name="images",
- type_hint=List[PIL.Image.Image],
- description="The images from the decoding step.",
- )
- ]
+
+###
+### Auto blocks
+###
-class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks):
+# auto_docstring
+class Flux2KleinAutoBlocks(SequentialPipelineBlocks):
+ """
+ Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein.
+
+ Supported workflows:
+ - `text2image`: requires `prompt`
+ - `image_conditioned`: requires `image`, `prompt`
+
+ Components:
+ text_encoder (`Qwen3ForCausalLM`) tokenizer (`Qwen2TokenizerFast`) image_processor (`Flux2ImageProcessor`)
+ vae (`AutoencoderKLFlux2`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer
+ (`Flux2Transformer2DModel`)
+
+ Configs:
+ is_distilled (default: True)
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`int`, *optional*, defaults to 512):
+ TODO: Add description.
+ text_encoder_out_layers (`tuple`, *optional*, defaults to (9, 18, 27)):
+ TODO: Add description.
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ image_latents (`list`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`):
+ TODO: Add description.
+ num_inference_steps (`None`):
+ TODO: Add description.
+ timesteps (`None`):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_latent_ids (`Tensor`, *optional*):
+ Position IDs for image latents. Shape: (B, img_seq_len, 4)
+ output_type (`None`, *optional*, defaults to pil):
+ TODO: Add description.
+
+ Outputs:
+ images (`list`):
+ Generated images.
+ """
+
model_name = "flux2-klein"
block_classes = [
- Flux2KleinBaseTextEncoderStep(),
+ Flux2KleinTextEncoderStep(),
Flux2KleinAutoVaeEncoderStep(),
- Flux2KleinBaseCoreDenoiseStep(),
+ Flux2KleinAutoCoreDenoiseStep(),
Flux2DecodeStep(),
]
block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
+ _workflow_map = {
+ "text2image": {"prompt": True},
+ "image_conditioned": {"image": True, "prompt": True},
+ }
@property
def description(self):
- return (
- "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model).\n"
- + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n"
- + " - for text-to-image generation, all you need to provide is `prompt`.\n"
- )
+ return "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein."
@property
def outputs(self):
return [
- OutputParam(
- name="images",
- type_hint=List[PIL.Image.Image],
- description="The images from the decoding step.",
- )
+ OutputParam.template("images"),
]
diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py
new file mode 100644
index 000000000000..42e025c622b4
--- /dev/null
+++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py
@@ -0,0 +1,413 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...utils import logging
+from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
+from ..modular_pipeline_utils import InsertableDict, OutputParam
+from .before_denoise import (
+ Flux2KleinBaseRoPEInputsStep,
+ Flux2PrepareImageLatentsStep,
+ Flux2PrepareLatentsStep,
+ Flux2SetTimestepsStep,
+)
+from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep
+from .denoise import Flux2KleinBaseDenoiseStep
+from .encoders import (
+ Flux2KleinBaseTextEncoderStep,
+ Flux2VaeEncoderStep,
+)
+from .inputs import (
+ Flux2KleinBaseTextInputStep,
+ Flux2ProcessImagesInputStep,
+)
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+################
+# VAE encoder
+################
+
+
+# auto_docstring
+class Flux2KleinBaseVaeEncoderSequentialStep(SequentialPipelineBlocks):
+ """
+ VAE encoder step that preprocesses and encodes the image inputs into their latent representations.
+
+ Components:
+ image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`)
+
+ Inputs:
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ condition_images (`list`):
+ TODO: Add description.
+ image_latents (`list`):
+ List of latent representations for each reference image
+ """
+
+ model_name = "flux2"
+
+ block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()]
+ block_names = ["preprocess", "encode"]
+
+ @property
+ def description(self) -> str:
+ return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations."
+
+
+# auto_docstring
+class Flux2KleinBaseAutoVaeEncoderStep(AutoPipelineBlocks):
+ """
+ VAE encoder step that encodes the image inputs into their latent representations.
+ This is an auto pipeline block that works for image conditioning tasks.
+ - `Flux2KleinBaseVaeEncoderSequentialStep` is used when `image` is provided.
+ - If `image` is not provided, step will be skipped.
+
+ Components:
+ image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`)
+
+ Inputs:
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ condition_images (`list`):
+ TODO: Add description.
+ image_latents (`list`):
+ List of latent representations for each reference image
+ """
+
+ block_classes = [Flux2KleinBaseVaeEncoderSequentialStep]
+ block_names = ["img_conditioning"]
+ block_trigger_inputs = ["image"]
+
+ @property
+ def description(self):
+ return (
+ "VAE encoder step that encodes the image inputs into their latent representations.\n"
+ "This is an auto pipeline block that works for image conditioning tasks.\n"
+ " - `Flux2KleinBaseVaeEncoderSequentialStep` is used when `image` is provided.\n"
+ " - If `image` is not provided, step will be skipped."
+ )
+
+
+###
+### Core denoise
+###
+
+Flux2KleinBaseCoreDenoiseBlocks = InsertableDict(
+ [
+ ("input", Flux2KleinBaseTextInputStep()),
+ ("prepare_latents", Flux2PrepareLatentsStep()),
+ ("set_timesteps", Flux2SetTimestepsStep()),
+ ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()),
+ ("denoise", Flux2KleinBaseDenoiseStep()),
+ ("after_denoise", Flux2UnpackLatentsStep()),
+ ]
+)
+
+
+# auto_docstring
+class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider
+ (`ClassifierFreeGuidance`)
+
+ Configs:
+ is_distilled (default: False)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`Tensor`, *optional*):
+ Packed image latents for conditioning. Shape: (B, img_seq_len, C)
+ image_latent_ids (`Tensor`, *optional*):
+ Position IDs for image latents. Shape: (B, img_seq_len, 4)
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
+ model_name = "flux2-klein"
+ block_classes = Flux2KleinBaseCoreDenoiseBlocks.values()
+ block_names = Flux2KleinBaseCoreDenoiseBlocks.keys()
+
+ @property
+ def description(self):
+ return "Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation."
+
+ @property
+ def outputs(self):
+ return [
+ OutputParam.template("latents"),
+ ]
+
+
+Flux2KleinBaseImageConditionedCoreDenoiseBlocks = InsertableDict(
+ [
+ ("input", Flux2KleinBaseTextInputStep()),
+ ("prepare_latents", Flux2PrepareLatentsStep()),
+ ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
+ ("set_timesteps", Flux2SetTimestepsStep()),
+ ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()),
+ ("denoise", Flux2KleinBaseDenoiseStep()),
+ ("after_denoise", Flux2UnpackLatentsStep()),
+ ]
+)
+
+
+# auto_docstring
+class Flux2KleinBaseImageConditionedCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider
+ (`ClassifierFreeGuidance`)
+
+ Configs:
+ is_distilled (default: False)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`list`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
+ model_name = "flux2-klein"
+ block_classes = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.values()
+ block_names = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.keys()
+
+ @property
+ def description(self):
+ return "Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning."
+
+ @property
+ def outputs(self):
+ return [
+ OutputParam.template("latents"),
+ ]
+
+
+# auto_docstring
+class Flux2KleinBaseAutoCoreDenoiseStep(AutoPipelineBlocks):
+ """
+ Auto core denoise step that performs the denoising process for Flux2-Klein (base model).
+ This is an auto pipeline block that works for text-to-image and image-conditioned generation.
+ - `Flux2KleinBaseCoreDenoiseStep` is used for text-to-image generation.
+ - `Flux2KleinBaseImageConditionedCoreDenoiseStep` is used for image-conditioned generation.
+
+ Components:
+ scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider
+ (`ClassifierFreeGuidance`)
+
+ Configs:
+ is_distilled (default: False)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`list`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`):
+ TODO: Add description.
+ timesteps (`None`):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_latent_ids (`Tensor`, *optional*):
+ Position IDs for image latents. Shape: (B, img_seq_len, 4)
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
+ model_name = "flux2-klein"
+ block_classes = [Flux2KleinBaseImageConditionedCoreDenoiseStep, Flux2KleinBaseCoreDenoiseStep]
+ block_names = ["image_conditioned", "text2image"]
+ block_trigger_inputs = ["image_latents", None]
+
+ @property
+ def description(self):
+ return (
+ "Auto core denoise step that performs the denoising process for Flux2-Klein (base model).\n"
+ "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n"
+ " - `Flux2KleinBaseCoreDenoiseStep` is used for text-to-image generation.\n"
+ " - `Flux2KleinBaseImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n"
+ )
+
+
+###
+### Auto blocks
+###
+
+
+# auto_docstring
+class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks):
+ """
+ Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model).
+
+ Supported workflows:
+ - `text2image`: requires `prompt`
+ - `image_conditioned`: requires `image`, `prompt`
+
+ Components:
+ text_encoder (`Qwen3ForCausalLM`) tokenizer (`Qwen2TokenizerFast`) guider (`ClassifierFreeGuidance`)
+ image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`)
+
+ Configs:
+ is_distilled (default: False)
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`int`, *optional*, defaults to 512):
+ TODO: Add description.
+ text_encoder_out_layers (`tuple`, *optional*, defaults to (9, 18, 27)):
+ TODO: Add description.
+ image (`None`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ latents (`Tensor | NoneType`):
+ TODO: Add description.
+ image_latents (`list`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`):
+ TODO: Add description.
+ timesteps (`None`):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ joint_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_latent_ids (`Tensor`, *optional*):
+ Position IDs for image latents. Shape: (B, img_seq_len, 4)
+ output_type (`None`, *optional*, defaults to pil):
+ TODO: Add description.
+
+ Outputs:
+ images (`list`):
+ Generated images.
+ """
+
+ model_name = "flux2-klein"
+ block_classes = [
+ Flux2KleinBaseTextEncoderStep(),
+ Flux2KleinBaseAutoVaeEncoderStep(),
+ Flux2KleinBaseAutoCoreDenoiseStep(),
+ Flux2DecodeStep(),
+ ]
+ block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
+ _workflow_map = {
+ "text2image": {"prompt": True},
+ "image_conditioned": {"image": True, "prompt": True},
+ }
+
+ @property
+ def description(self):
+ return "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model)."
+
+ @property
+ def outputs(self):
+ return [
+ OutputParam.template("images"),
+ ]
diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py
index 0eff85926fc5..76a850b63c4e 100644
--- a/src/diffusers/modular_pipelines/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -40,8 +40,11 @@
InputParam,
InsertableDict,
OutputParam,
+ combine_inputs,
+ combine_outputs,
format_components,
format_configs,
+ format_workflow,
generate_modular_model_card_content,
make_doc_string,
)
@@ -287,6 +290,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
config_name = "modular_config.json"
model_name = None
+ _workflow_map = None
@classmethod
def _get_signature_keys(cls, obj):
@@ -342,6 +346,35 @@ def _get_outputs(self):
def outputs(self) -> list[OutputParam]:
return self._get_outputs()
+ # currently only ConditionalPipelineBlocks and SequentialPipelineBlocks support `get_execution_blocks`
+ def get_execution_blocks(self, **kwargs):
+ """
+ Get the block(s) that would execute given the inputs. Must be implemented by subclasses that support
+ conditional block selection.
+
+ Args:
+ **kwargs: Input names and values. Only trigger inputs affect block selection.
+ """
+ raise NotImplementedError(f"`get_execution_blocks` is not implemented for {self.__class__.__name__}")
+
+ # currently only SequentialPipelineBlocks support workflows
+ @property
+ def available_workflows(self):
+ """
+ Returns a list of available workflow names. Must be implemented by subclasses that define `_workflow_map`.
+ """
+ raise NotImplementedError(f"`available_workflows` is not implemented for {self.__class__.__name__}")
+
+ def get_workflow(self, workflow_name: str):
+ """
+ Get the execution blocks for a specific workflow. Must be implemented by subclasses that define
+ `_workflow_map`.
+
+ Args:
+ workflow_name: Name of the workflow to retrieve.
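+
+ Example (illustrative, for a `SequentialPipelineBlocks` subclass that defines `_workflow_map`,
+ such as the Flux auto blocks in this PR):
+
+ ```py
+ blocks = FluxKontextAutoBlocks()
+ blocks.available_workflows  # e.g. ["image_conditioned", "text2image"]
+ t2i_blocks = blocks.get_workflow("text2image")
+ ```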
+ """
+ raise NotImplementedError(f"`get_workflow` is not implemented for {self.__class__.__name__}")
+
@classmethod
def from_pretrained(
cls,
@@ -480,72 +513,6 @@ def set_block_state(self, state: PipelineState, block_state: BlockState):
if current_value is not param: # Using identity comparison to check if object was modified
state.set(param_name, param, input_param.kwargs_type)
- @staticmethod
- def combine_inputs(*named_input_lists: list[tuple[str, list[InputParam]]]) -> list[InputParam]:
- """
- Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if
- current default value is None and new default value is not None. Warns if multiple non-None default values
- exist for the same input.
-
- Args:
- named_input_lists: list of tuples containing (block_name, input_param_list) pairs
-
- Returns:
- list[InputParam]: Combined list of unique InputParam objects
- """
- combined_dict = {} # name -> InputParam
- value_sources = {} # name -> block_name
-
- for block_name, inputs in named_input_lists:
- for input_param in inputs:
- if input_param.name is None and input_param.kwargs_type is not None:
- input_name = "*_" + input_param.kwargs_type
- else:
- input_name = input_param.name
- if input_name in combined_dict:
- current_param = combined_dict[input_name]
- if (
- current_param.default is not None
- and input_param.default is not None
- and current_param.default != input_param.default
- ):
- warnings.warn(
- f"Multiple different default values found for input '{input_name}': "
- f"{current_param.default} (from block '{value_sources[input_name]}') and "
- f"{input_param.default} (from block '{block_name}'). Using {current_param.default}."
- )
- if current_param.default is None and input_param.default is not None:
- combined_dict[input_name] = input_param
- value_sources[input_name] = block_name
- else:
- combined_dict[input_name] = input_param
- value_sources[input_name] = block_name
-
- return list(combined_dict.values())
-
- @staticmethod
- def combine_outputs(*named_output_lists: list[tuple[str, list[OutputParam]]]) -> list[OutputParam]:
- """
- Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first
- occurrence of each output name.
-
- Args:
- named_output_lists: list of tuples containing (block_name, output_param_list) pairs
-
- Returns:
- list[OutputParam]: Combined list of unique OutputParam objects
- """
- combined_dict = {} # name -> OutputParam
-
- for block_name, outputs in named_output_lists:
- for output_param in outputs:
- if (output_param.name not in combined_dict) or (
- combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None
- ):
- combined_dict[output_param.name] = output_param
-
- return list(combined_dict.values())
-
@property
def input_names(self) -> list[str]:
return [input_param.name for input_param in self.inputs if input_param.name is not None]
@@ -577,7 +544,8 @@ def doc(self):
class ConditionalPipelineBlocks(ModularPipelineBlocks):
"""
A Pipeline Blocks that conditionally selects a block to run based on the inputs. Subclasses must implement the
- `select_block` method to define the logic for selecting the block.
+ `select_block` method to define the logic for selecting the block. Currently, we only support selection logic based
+ on the presence or absence of inputs (i.e., whether they are `None` or not).
This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the
library implements for all the pipeline blocks (such as loading or saving etc.)
@@ -585,15 +553,20 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks):
> [!WARNING] > This is an experimental feature and is likely to change in the future.
Attributes:
- block_classes: List of block classes to be used
- block_names: List of prefixes for each block
- block_trigger_inputs: List of input names that select_block() uses to determine which block to run
+ block_classes: List of block classes to be used. Must have the same length as `block_names`.
+ block_names: List of names for each block. Must have the same length as `block_classes`.
+ block_trigger_inputs: List of input names that `select_block()` uses to determine which block to run.
+ For `ConditionalPipelineBlocks`, this does not need to correspond to `block_names` and `block_classes`. For
+ `AutoPipelineBlocks`, this must have the same length as `block_names` and `block_classes`, where each
+ element specifies the trigger input for the corresponding block.
+ default_block_name: Name of the default block to run when no trigger inputs match.
+ If None, this block can be skipped entirely when no trigger inputs are provided.
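+
+ Example (minimal illustrative subclass; the step classes are placeholders):
+
+ ```py
+ class MyConditionalBlocks(ConditionalPipelineBlocks):
+     block_classes = [MyImageEncodeStep, MyTextEncodeStep]
+     block_names = ["image", "text"]
+     block_trigger_inputs = ["image"]
+     default_block_name = "text"
+
+     def select_block(self, image=None):
+         # only the presence/absence of the trigger input may drive the decision
+         return "image" if image is not None else None
+ ```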
"""
block_classes = []
block_names = []
block_trigger_inputs = []
- default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided
+ default_block_name = None
def __init__(self):
sub_blocks = InsertableDict()
@@ -657,7 +630,7 @@ def required_inputs(self) -> list[str]:
@property
def inputs(self) -> list[tuple[str, Any]]:
named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()]
- combined_inputs = self.combine_inputs(*named_inputs)
+ combined_inputs = combine_inputs(*named_inputs)
# mark Required inputs only if that input is required by all the blocks
for input_param in combined_inputs:
if input_param.name in self.required_inputs:
@@ -669,15 +642,16 @@ def inputs(self) -> list[tuple[str, Any]]:
@property
def intermediate_outputs(self) -> list[str]:
named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()]
- combined_outputs = self.combine_outputs(*named_outputs)
+ combined_outputs = combine_outputs(*named_outputs)
return combined_outputs
@property
def outputs(self) -> list[str]:
named_outputs = [(name, block.outputs) for name, block in self.sub_blocks.items()]
- combined_outputs = self.combine_outputs(*named_outputs)
+ combined_outputs = combine_outputs(*named_outputs)
return combined_outputs
+ # used for `__repr__`
def _get_trigger_inputs(self) -> set:
"""
Returns a set of all unique trigger input values found in this block and nested blocks.
@@ -706,16 +680,16 @@ def fn_recursive_get_trigger(blocks):
return all_triggers
- @property
- def trigger_inputs(self):
- """All trigger inputs including from nested blocks."""
- return self._get_trigger_inputs()
-
def select_block(self, **kwargs) -> str | None:
"""
Select the block to run based on the trigger inputs. Subclasses must implement this method to define the logic
for selecting the block.
+ Note: When trigger inputs include intermediate outputs from earlier blocks, the selection logic should only
+ depend on the presence or absence of the input (i.e., whether it is None or not), not on its actual value. This
+ is because `get_execution_blocks()` resolves conditions statically by propagating intermediate output names
+ without their runtime values.
+
Args:
**kwargs: Trigger input names and their values from the state.
@@ -750,6 +724,39 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState:
logger.error(error_msg)
raise
+ def get_execution_blocks(self, **kwargs) -> ModularPipelineBlocks | None:
+ """
+ Get the block(s) that would execute given the inputs.
+
+ Recursively resolves nested ConditionalPipelineBlocks until reaching either:
+        - A leaf block (a block without sub_blocks, or a `LoopSequentialPipelineBlocks`) → returns a single `ModularPipelineBlocks`
+ - A `SequentialPipelineBlocks` → delegates to its `get_execution_blocks()` which returns
+ a `SequentialPipelineBlocks` containing the resolved execution blocks
+
+ Args:
+ **kwargs: Input names and values. Only trigger inputs affect block selection.
+
+ Returns:
+ - `ModularPipelineBlocks`: A leaf block or resolved `SequentialPipelineBlocks`
+ - `None`: If this block would be skipped (no trigger matched and no default)
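+
+        Example:
+            An illustrative call (the `mask_image` trigger name is hypothetical):
+
+            ```python
+            selected = blocks.get_execution_blocks(mask_image=mask_image)
+            ```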
+ """
+ trigger_kwargs = {name: kwargs.get(name) for name in self.block_trigger_inputs if name is not None}
+ block_name = self.select_block(**trigger_kwargs)
+
+ if block_name is None:
+ block_name = self.default_block_name
+
+ if block_name is None:
+ return None
+
+ block = self.sub_blocks[block_name]
+
+ # Recursively resolve until we hit a leaf block
+ if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks):
+ return block.get_execution_blocks(**kwargs)
+
+ return block
+
def __repr__(self):
class_name = self.__class__.__name__
base_class = self.__class__.__bases__[0].__name__
@@ -757,11 +764,11 @@ def __repr__(self):
f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n"
)
- if self.trigger_inputs:
+ if self._get_trigger_inputs():
header += "\n"
header += " " + "=" * 100 + "\n"
header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
- header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n"
+ header += f" Trigger Inputs: {sorted(self._get_trigger_inputs())}\n"
header += " " + "=" * 100 + "\n\n"
# Format description with proper indentation
@@ -828,24 +835,56 @@ def doc(self):
class AutoPipelineBlocks(ConditionalPipelineBlocks):
"""
- A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs.
+ A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs.
+
+ This is a specialized version of `ConditionalPipelineBlocks` where:
+ - Each block has one corresponding trigger input (1:1 mapping)
+ - Block selection is automatic: the first block whose trigger input is present gets selected
+ - `block_trigger_inputs` must have the same length as `block_names` and `block_classes`
+    - Use `None` in `block_trigger_inputs` to specify the default block, i.e., the block that will run if no trigger
+      inputs are present
+
+ Attributes:
+ block_classes:
+ List of block classes to be used. Must have the same length as `block_names` and
+ `block_trigger_inputs`.
+ block_names:
+ List of names for each block. Must have the same length as `block_classes` and `block_trigger_inputs`.
+ block_trigger_inputs:
+ List of input names where each element specifies the trigger input for the corresponding block. Use
+ `None` to mark the default block.
+
+ Example:
+ ```python
+ class MyAutoBlock(AutoPipelineBlocks):
+ block_classes = [InpaintEncoderBlock, ImageEncoderBlock, TextEncoderBlock]
+ block_names = ["inpaint", "img2img", "text2img"]
+ block_trigger_inputs = ["mask_image", "image", None] # text2img is the default
+ ```
+
+ With this definition:
+    - If `mask_image` is provided, the "inpaint" block runs (regardless of whether `image` is also provided)
+    - If `mask_image` is not provided but `image` is, the "img2img" block runs
+    - Otherwise, the "text2img" block runs (it is the default because its trigger is `None`)
"""
def __init__(self):
super().__init__()
+ if self.default_block_name is not None:
+ raise ValueError(
+ f"In {self.__class__.__name__}, do not set `default_block_name` for AutoPipelineBlocks. "
+ f"Use `None` in `block_trigger_inputs` to specify the default block."
+ )
+
if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
raise ValueError(
f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same."
)
- @property
- def default_block_name(self) -> str | None:
- """Derive default_block_name from block_trigger_inputs (None entry)."""
if None in self.block_trigger_inputs:
idx = self.block_trigger_inputs.index(None)
- return self.block_names[idx]
- return None
+ self.default_block_name = self.block_names[idx]
def select_block(self, **kwargs) -> str | None:
"""Select block based on which trigger input is present (not None)."""
@@ -899,6 +938,29 @@ def expected_configs(self):
expected_configs.append(config)
return expected_configs
+ @property
+ def available_workflows(self):
+ if self._workflow_map is None:
+ raise NotImplementedError(
+ f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}"
+ )
+
+ return list(self._workflow_map.keys())
+
+ def get_workflow(self, workflow_name: str):
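+        """
+        Return the blocks that would execute for a named workflow defined in `_workflow_map`.
+
+        Example:
+            An illustrative call (the available names depend on the concrete `_workflow_map`):
+
+            ```python
+            blocks.available_workflows  # e.g. ["text2image", "image2image"]
+            t2i_blocks = blocks.get_workflow("text2image")
+            ```
+        """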
+ if self._workflow_map is None:
+ raise NotImplementedError(
+ f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}"
+ )
+
+ if workflow_name not in self._workflow_map:
+ raise ValueError(f"Workflow {workflow_name} not found in {self.__class__.__name__}")
+
+ trigger_inputs = self._workflow_map[workflow_name]
+ workflow_blocks = self.get_execution_blocks(**trigger_inputs)
+
+ return workflow_blocks
+
@classmethod
def from_blocks_dict(
cls, blocks_dict: dict[str, Any], description: str | None = None
@@ -994,7 +1056,7 @@ def intermediate_outputs(self) -> list[str]:
# filter out them here so they do not end up as intermediate_outputs
if name not in inp_names:
named_outputs.append((name, block.intermediate_outputs))
- combined_outputs = self.combine_outputs(*named_outputs)
+ combined_outputs = combine_outputs(*named_outputs)
return combined_outputs
# YiYi TODO: I think we can remove the outputs property
@@ -1018,6 +1080,7 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState:
raise
return pipeline, state
+ # used for `__repr__`
def _get_trigger_inputs(self):
"""
Returns a set of all unique trigger input values found in the blocks.
@@ -1041,89 +1104,56 @@ def fn_recursive_get_trigger(blocks):
return fn_recursive_get_trigger(self.sub_blocks)
- @property
- def trigger_inputs(self):
- return self._get_trigger_inputs()
-
- def _traverse_trigger_blocks(self, active_inputs):
+ def get_execution_blocks(self, **kwargs) -> "SequentialPipelineBlocks":
"""
- Traverse blocks and select which ones would run given the active inputs.
+ Get the blocks that would execute given the specified inputs.
+
+ As the traversal walks through sequential blocks, intermediate outputs from resolved blocks are added to the
+ active inputs. This means conditional blocks that depend on intermediates (e.g., "run img2img if image_latents
+ is present") will resolve correctly, as long as the condition is based on presence/absence (None or not None),
+ not on the actual value.
+
Args:
- active_inputs: Dict of input names to values that are "present"
+ **kwargs: Input names and values. Only trigger inputs affect block selection.
Returns:
- OrderedDict of block_name -> block that would execute
+ SequentialPipelineBlocks containing only the blocks that would execute
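+
+        Example:
+            Illustrative calls (the input names depend on the concrete blocks, e.g. the QwenImage auto blocks):
+
+            ```python
+            # blocks that would run for a text-to-image call
+            t2i_blocks = auto_blocks.get_execution_blocks(prompt="a cat")
+            # blocks that would run for an inpainting call
+            inpaint_blocks = auto_blocks.get_execution_blocks(prompt="a cat", image=image, mask_image=mask_image)
+            ```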
"""
+ # Copy kwargs so we can add outputs as we traverse
+ active_inputs = dict(kwargs)
def fn_recursive_traverse(block, block_name, active_inputs):
result_blocks = OrderedDict()
# ConditionalPipelineBlocks (includes AutoPipelineBlocks)
if isinstance(block, ConditionalPipelineBlocks):
- trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs}
- selected_block_name = block.select_block(**trigger_kwargs)
-
- if selected_block_name is None:
- selected_block_name = block.default_block_name
-
- if selected_block_name is None:
+ block = block.get_execution_blocks(**active_inputs)
+ if block is None:
return result_blocks
- selected_block = block.sub_blocks[selected_block_name]
-
- if selected_block.sub_blocks:
- result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs))
- else:
- result_blocks[block_name] = selected_block
- if hasattr(selected_block, "outputs"):
- for out in selected_block.outputs:
- active_inputs[out.name] = True
-
- return result_blocks
-
- # SequentialPipelineBlocks or LoopSequentialPipelineBlocks
- if block.sub_blocks:
+        # Has sub_blocks (e.g. a SequentialPipelineBlocks): recurse into its sub-blocks
+ if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks):
for sub_block_name, sub_block in block.sub_blocks.items():
- blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
- blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()}
- result_blocks.update(blocks_to_update)
+ nested_blocks = fn_recursive_traverse(sub_block, sub_block_name, active_inputs)
+ nested_blocks = {f"{block_name}.{k}": v for k, v in nested_blocks.items()}
+ result_blocks.update(nested_blocks)
else:
+ # Leaf block: single ModularPipelineBlocks or LoopSequentialPipelineBlocks
result_blocks[block_name] = block
- if hasattr(block, "outputs"):
- for out in block.outputs:
+ # Add outputs to active_inputs so subsequent blocks can use them as triggers
+ if hasattr(block, "intermediate_outputs"):
+ for out in block.intermediate_outputs:
active_inputs[out.name] = True
return result_blocks
all_blocks = OrderedDict()
for block_name, block in self.sub_blocks.items():
- blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs)
- all_blocks.update(blocks_to_update)
- return all_blocks
+ nested_blocks = fn_recursive_traverse(block, block_name, active_inputs)
+ all_blocks.update(nested_blocks)
- def get_execution_blocks(self, **kwargs):
- """
- Get the blocks that would execute given the specified inputs.
-
- Args:
- **kwargs: Input names and values. Only trigger inputs affect block selection.
- Pass any inputs that would be non-None at runtime.
-
- Returns:
- SequentialPipelineBlocks containing only the blocks that would execute
-
- Example:
- # Get blocks for inpainting workflow blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask,
- image=image)
-
- # Get blocks for text2image workflow blocks = pipeline.get_execution_blocks(prompt="a cat")
- """
- # Filter out None values
- active_inputs = {k: v for k, v in kwargs.items() if v is not None}
-
- blocks_triggered = self._traverse_trigger_blocks(active_inputs)
- return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered)
+ return SequentialPipelineBlocks.from_blocks_dict(all_blocks)
def __repr__(self):
class_name = self.__class__.__name__
@@ -1132,18 +1162,23 @@ def __repr__(self):
f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n"
)
- if self.trigger_inputs:
+ if self._workflow_map is None and self._get_trigger_inputs():
header += "\n"
header += " " + "=" * 100 + "\n"
header += " This pipeline contains blocks that are selected at runtime based on inputs.\n"
- header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n"
+ header += f" Trigger Inputs: {[inp for inp in self._get_trigger_inputs() if inp is not None]}\n"
# Get first trigger input as example
- example_input = next(t for t in self.trigger_inputs if t is not None)
+ example_input = next(t for t in self._get_trigger_inputs() if t is not None)
header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n"
header += " " + "=" * 100 + "\n\n"
+ description = self.description
+ if self._workflow_map is not None:
+ workflow_str = format_workflow(self._workflow_map)
+ description = f"{self.description}\n\n{workflow_str}"
+
# Format description with proper indentation
- desc_lines = self.description.split("\n")
+ desc_lines = description.split("\n")
desc = []
# First line with "Description:" label
desc.append(f" Description: {desc_lines[0]}")
@@ -1191,10 +1226,15 @@ def __repr__(self):
@property
def doc(self):
+ description = self.description
+ if self._workflow_map is not None:
+ workflow_str = format_workflow(self._workflow_map)
+ description = f"{self.description}\n\n{workflow_str}"
+
return make_doc_string(
self.inputs,
self.outputs,
- self.description,
+ description=description,
class_name=self.__class__.__name__,
expected_components=self.expected_components,
expected_configs=self.expected_configs,
@@ -1327,7 +1367,7 @@ def required_inputs(self) -> list[str]:
@property
def intermediate_outputs(self) -> list[str]:
named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()]
- combined_outputs = self.combine_outputs(*named_outputs)
+ combined_outputs = combine_outputs(*named_outputs)
for output in self.loop_intermediate_outputs:
if output.name not in {output.name for output in combined_outputs}:
combined_outputs.append(output)
diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py
index aa378f715974..cab17c2aed5c 100644
--- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py
@@ -14,6 +14,7 @@
import inspect
import re
+import warnings
from collections import OrderedDict
from dataclasses import dataclass, field
from types import UnionType
@@ -503,6 +504,10 @@ class ConfigSpec:
"type_hint": list[PIL.Image.Image],
"description": "Generated images.",
},
+ "videos": {
+ "type_hint": list[PIL.Image.Image],
+ "description": "The generated videos.",
+ },
"latents": {
"type_hint": torch.Tensor,
"description": "Denoised latents.",
@@ -887,6 +892,30 @@ def format_configs(configs, indent_level=4, max_line_length=115, add_empty_lines
return "\n".join(formatted_configs)
+def format_workflow(workflow_map):
+ """Format a workflow map into a readable string representation.
+
+ Args:
+ workflow_map: Dictionary mapping workflow names to trigger inputs
+
+ Returns:
+ A formatted string representing all workflows
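+
+    Example:
+        An illustrative input and the string it produces:
+
+        ```python
+        format_workflow({"text2image": {"prompt": True}, "image2image": {"prompt": True, "image": True}})
+        # Supported workflows:
+        #   - `text2image`: requires `prompt`
+        #   - `image2image`: requires `prompt`, `image`
+        ```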
+ """
+ if workflow_map is None:
+ return ""
+
+ lines = ["Supported workflows:"]
+ for workflow_name, trigger_inputs in workflow_map.items():
+ required_inputs = [k for k, v in trigger_inputs.items() if v]
+ if required_inputs:
+ inputs_str = ", ".join(f"`{t}`" for t in required_inputs)
+ lines.append(f" - `{workflow_name}`: requires {inputs_str}")
+ else:
+ lines.append(f" - `{workflow_name}`: default (no additional inputs required)")
+
+ return "\n".join(lines)
+
+
def make_doc_string(
inputs,
outputs,
@@ -943,6 +972,72 @@ def make_doc_string(
return output
+def combine_inputs(*named_input_lists: list[tuple[str, list[InputParam]]]) -> list[InputParam]:
+ """
+    Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, the stored entry is
+    replaced only if its current default value is None and the new default value is not None. Warns if multiple
+    different non-None default values exist for the same input.
+
+ Args:
+ named_input_lists: List of tuples containing (block_name, input_param_list) pairs
+
+ Returns:
+        list[InputParam]: Combined list of unique InputParam objects
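+
+    Example:
+        A minimal sketch (assumes `InputParam` accepts `name` and `default` keyword arguments):
+
+        ```python
+        combined = combine_inputs(
+            ("block_a", [InputParam(name="height"), InputParam(name="width", default=1024)]),
+            ("block_b", [InputParam(name="height", default=512)]),
+        )
+        # "height" picks up the non-None default 512 from block_b; "width" keeps 1024 from block_a
+        ```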
+ """
+ combined_dict = {} # name -> InputParam
+ value_sources = {} # name -> block_name
+
+ for block_name, inputs in named_input_lists:
+ for input_param in inputs:
+ if input_param.name is None and input_param.kwargs_type is not None:
+ input_name = "*_" + input_param.kwargs_type
+ else:
+ input_name = input_param.name
+ if input_name in combined_dict:
+ current_param = combined_dict[input_name]
+ if (
+ current_param.default is not None
+ and input_param.default is not None
+ and current_param.default != input_param.default
+ ):
+ warnings.warn(
+ f"Multiple different default values found for input '{input_name}': "
+ f"{current_param.default} (from block '{value_sources[input_name]}') and "
+ f"{input_param.default} (from block '{block_name}'). Using {current_param.default}."
+ )
+ if current_param.default is None and input_param.default is not None:
+ combined_dict[input_name] = input_param
+ value_sources[input_name] = block_name
+ else:
+ combined_dict[input_name] = input_param
+ value_sources[input_name] = block_name
+
+ return list(combined_dict.values())
+
+
+def combine_outputs(*named_output_lists: list[tuple[str, list[OutputParam]]]) -> list[OutputParam]:
+ """
+    Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first
+    occurrence of each output name, unless a later occurrence has a `kwargs_type` while the stored one does not.
+
+ Args:
+ named_output_lists: List of tuples containing (block_name, output_param_list) pairs
+
+ Returns:
+        list[OutputParam]: Combined list of unique OutputParam objects
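+
+    Example:
+        A minimal sketch (assumes `OutputParam` can be constructed with just a `name`):
+
+        ```python
+        combined = combine_outputs(
+            ("block_a", [OutputParam(name="latents")]),
+            ("block_b", [OutputParam(name="latents"), OutputParam(name="images")]),
+        )
+        # one "latents" entry (first occurrence kept) and one "images" entry
+        ```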
+ """
+ combined_dict = {} # name -> OutputParam
+
+ for block_name, outputs in named_output_lists:
+ for output_param in outputs:
+ if (output_param.name not in combined_dict) or (
+ combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None
+ ):
+ combined_dict[output_param.name] = output_param
+
+ return list(combined_dict.values())
+
+
def generate_modular_model_card_content(blocks) -> dict[str, Any]:
"""
Generate model card content for a modular pipeline.
diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py
index 2b01a5b5a4b5..2e6af4495b37 100644
--- a/src/diffusers/modular_pipelines/qwenimage/__init__.py
+++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py
@@ -21,27 +21,15 @@
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
- _import_structure["modular_blocks_qwenimage"] = [
- "AUTO_BLOCKS",
- "QwenImageAutoBlocks",
- ]
- _import_structure["modular_blocks_qwenimage_edit"] = [
- "EDIT_AUTO_BLOCKS",
- "QwenImageEditAutoBlocks",
- ]
- _import_structure["modular_blocks_qwenimage_edit_plus"] = [
- "EDIT_PLUS_AUTO_BLOCKS",
- "QwenImageEditPlusAutoBlocks",
- ]
- _import_structure["modular_blocks_qwenimage_layered"] = [
- "LAYERED_AUTO_BLOCKS",
- "QwenImageLayeredAutoBlocks",
- ]
+ _import_structure["modular_blocks_qwenimage"] = ["QwenImageAutoBlocks"]
+ _import_structure["modular_blocks_qwenimage_edit"] = ["QwenImageEditAutoBlocks"]
+ _import_structure["modular_blocks_qwenimage_edit_plus"] = ["QwenImageEditPlusAutoBlocks"]
+ _import_structure["modular_blocks_qwenimage_layered"] = ["QwenImageLayeredAutoBlocks"]
_import_structure["modular_pipeline"] = [
"QwenImageEditModularPipeline",
"QwenImageEditPlusModularPipeline",
- "QwenImageModularPipeline",
"QwenImageLayeredModularPipeline",
+ "QwenImageModularPipeline",
]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -51,22 +39,10 @@
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
- from .modular_blocks_qwenimage import (
- AUTO_BLOCKS,
- QwenImageAutoBlocks,
- )
- from .modular_blocks_qwenimage_edit import (
- EDIT_AUTO_BLOCKS,
- QwenImageEditAutoBlocks,
- )
- from .modular_blocks_qwenimage_edit_plus import (
- EDIT_PLUS_AUTO_BLOCKS,
- QwenImageEditPlusAutoBlocks,
- )
- from .modular_blocks_qwenimage_layered import (
- LAYERED_AUTO_BLOCKS,
- QwenImageLayeredAutoBlocks,
- )
+ from .modular_blocks_qwenimage import QwenImageAutoBlocks
+ from .modular_blocks_qwenimage_edit import QwenImageEditAutoBlocks
+ from .modular_blocks_qwenimage_edit_plus import QwenImageEditPlusAutoBlocks
+ from .modular_blocks_qwenimage_layered import QwenImageLayeredAutoBlocks
from .modular_pipeline import (
QwenImageEditModularPipeline,
QwenImageEditPlusModularPipeline,
diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
index c4e14566a795..51b5c6ac8c3d 100644
--- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
+++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py
@@ -558,7 +558,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
Inputs:
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
latents (`Tensor`):
The initial random noised latents for the denoising process. Can be generated in prepare latents step.
@@ -644,7 +644,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
Inputs:
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
@@ -725,7 +725,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
Inputs:
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
latents (`Tensor`):
The latents to use for the denoising process. Can be generated in prepare latents step.
@@ -842,7 +842,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
mask for the negative text embeddings. Can be generated from text_encoder step.
Outputs:
- img_shapes (`List`):
+ img_shapes (`list`):
The shapes of the images latents, used for RoPE calculation
"""
@@ -917,7 +917,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
mask for the negative text embeddings. Can be generated from text_encoder step.
Outputs:
- img_shapes (`List`):
+ img_shapes (`list`):
The shapes of the images latents, used for RoPE calculation
"""
@@ -995,9 +995,9 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
- image_height (`List`):
+ image_height (`list`):
The heights of the reference images. Can be generated in input step.
- image_width (`List`):
+ image_width (`list`):
The widths of the reference images. Can be generated in input step.
height (`int`):
The height in pixels of the generated image.
@@ -1009,11 +1009,11 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
mask for the negative text embeddings. Can be generated from text_encoder step.
Outputs:
- img_shapes (`List`):
+ img_shapes (`list`):
The shapes of the image latents, used for RoPE calculation
- txt_seq_lens (`List`):
+ txt_seq_lens (`list`):
The sequence lengths of the prompt embeds, used for RoPE calculation
- negative_txt_seq_lens (`List`):
+ negative_txt_seq_lens (`list`):
The sequence lengths of the negative prompt embeds, used for RoPE calculation
"""
@@ -1123,11 +1123,11 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
mask for the negative text embeddings. Can be generated from text_encoder step.
Outputs:
- img_shapes (`List`):
+ img_shapes (`list`):
The shapes of the image latents, used for RoPE calculation
- txt_seq_lens (`List`):
+ txt_seq_lens (`list`):
The sequence lengths of the prompt embeds, used for RoPE calculation
- negative_txt_seq_lens (`List`):
+ negative_txt_seq_lens (`list`):
The sequence lengths of the negative prompt embeds, used for RoPE calculation
additional_t_cond (`Tensor`):
The additional t cond, used for RoPE calculation
@@ -1238,7 +1238,7 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
Outputs:
- controlnet_keep (`List`):
+ controlnet_keep (`list`):
The controlnet keep values
"""
diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py
index 49183eed9cda..e4ccb6b8e047 100644
--- a/src/diffusers/modular_pipelines/qwenimage/decoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py
@@ -191,7 +191,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
step.
Outputs:
- images (`List`):
+ images (`list`):
Generated images. (tensor output of the vae decoder.)
"""
@@ -268,7 +268,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
Output format: 'pil', 'np', 'pt'.
Outputs:
- images (`List`):
+ images (`list`):
Generated images.
"""
@@ -366,7 +366,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
Output format: 'pil', 'np', 'pt'.
Outputs:
- images (`List`):
+ images (`list`):
Generated images.
"""
@@ -436,12 +436,12 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
the generated image tensor from decoders step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
- mask_overlay_kwargs (`Dict`, *optional*):
+ mask_overlay_kwargs (`dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
- images (`List`):
+ images (`list`):
Generated images.
"""
diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py
index 6724612361aa..de8ea05c5047 100644
--- a/src/diffusers/modular_pipelines/qwenimage/denoise.py
+++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py
@@ -518,11 +518,11 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
The number of denoising steps.
latents (`Tensor`):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
- img_shapes (`List`):
+ img_shapes (`list`):
The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
Outputs:
@@ -576,11 +576,11 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
The number of denoising steps.
latents (`Tensor`):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
- img_shapes (`List`):
+ img_shapes (`list`):
The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
mask (`Tensor`):
The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
@@ -645,13 +645,13 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
- controlnet_keep (`List`):
+ controlnet_keep (`list`):
The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
- img_shapes (`List`):
+ img_shapes (`list`):
The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
Outputs:
@@ -711,13 +711,13 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.)
- controlnet_keep (`List`):
+ controlnet_keep (`list`):
The controlnet keep values. Can be generated in prepare_controlnet_inputs step.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
- img_shapes (`List`):
+ img_shapes (`list`):
The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.
mask (`Tensor`):
The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
@@ -787,11 +787,11 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
- img_shapes (`List`):
+ img_shapes (`list`):
The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
Outputs:
@@ -846,11 +846,11 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
- img_shapes (`List`):
+ img_shapes (`list`):
The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
mask (`Tensor`):
The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.
@@ -910,11 +910,11 @@ class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper):
The initial latents to use for the denoising process. Can be generated in prepare_latent step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
- img_shapes (`List`):
+ img_shapes (`list`):
The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.
Outputs:
diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py
index 6abcf7ce215a..527267dc0d6e 100644
--- a/src/diffusers/modular_pipelines/qwenimage/encoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py
@@ -285,11 +285,11 @@ class QwenImageEditResizeStep(ModularPipelineBlocks):
image_resize_processor (`VaeImageProcessor`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
The resized images
"""
@@ -359,13 +359,13 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks):
image_resize_processor (`VaeImageProcessor`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
The resized images
"""
@@ -452,13 +452,13 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks):
image_resize_processor (`VaeImageProcessor`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
Images resized to 1024x1024 target area for VAE encoding
- resized_cond_image (`List`):
+ resized_cond_image (`list`):
Images resized to 384x384 target area for VL text encoding
"""
@@ -1058,7 +1058,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
Inputs:
mask_image (`Image`):
Mask image for inpainting.
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
height (`int`, *optional*):
The height in pixels of the generated image.
@@ -1072,7 +1072,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
The processed image
processed_mask_image (`Tensor`):
The processed mask image
- mask_overlay_kwargs (`Dict`):
+ mask_overlay_kwargs (`dict`):
The kwargs for the postprocess step to apply the mask overlay
"""
@@ -1177,7 +1177,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks):
The processed image
processed_mask_image (`Tensor`):
The processed mask image
- mask_overlay_kwargs (`Dict`):
+ mask_overlay_kwargs (`dict`):
The kwargs for the postprocess step to apply the mask overlay
"""
@@ -1256,7 +1256,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
image_processor (`VaeImageProcessor`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
height (`int`, *optional*):
The height in pixels of the generated image.
@@ -1340,7 +1340,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks):
image_processor (`VaeImageProcessor`)
Inputs:
- resized_image (`List`):
+ resized_image (`list`):
The resized image. should be generated using a resize step
Outputs:
@@ -1412,7 +1412,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
image_processor (`VaeImageProcessor`)
Inputs:
- resized_image (`List`):
+ resized_image (`list`):
The resized image. should be generated using a resize step
Outputs:
diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py
index ebe53940a4e5..faec7db245df 100644
--- a/src/diffusers/modular_pipelines/qwenimage/inputs.py
+++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py
@@ -496,9 +496,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
image latents used to guide the image generation. Can be generated from vae_encoder step.
Outputs:
- image_height (`List`):
+ image_height (`list`):
The image heights calculated from the image latents dimension
- image_width (`List`):
+ image_width (`list`):
The image widths calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
index 7503e0c7684b..bf87028b2f90 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -119,7 +119,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
Inputs:
mask_image (`Image`):
Mask image for inpainting.
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
height (`int`, *optional*):
The height in pixels of the generated image.
@@ -135,7 +135,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
The processed image
processed_mask_image (`Tensor`):
The processed mask image
- mask_overlay_kwargs (`Dict`):
+ mask_overlay_kwargs (`dict`):
The kwargs for the postprocess step to apply the mask overlay
image_latents (`Tensor`):
The latent representation of the input image.
@@ -164,7 +164,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
height (`int`, *optional*):
The height in pixels of the generated image.
@@ -476,9 +476,9 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -553,11 +553,11 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -632,11 +632,11 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -712,7 +712,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
control_guidance_start (`float`, *optional*, defaults to 0.0):
When to start applying ControlNet.
@@ -720,7 +720,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
When to stop applying ControlNet.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -802,7 +802,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
@@ -812,7 +812,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
When to stop applying ControlNet.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -894,7 +894,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
@@ -904,7 +904,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
When to stop applying ControlNet.
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -1032,7 +1032,7 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):
Output format: 'pil', 'np', 'pt'.
Outputs:
- images (`List`):
+ images (`list`):
Generated images. (tensor output of the vae decoder.)
"""
@@ -1061,12 +1061,12 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
- mask_overlay_kwargs (`Dict`, *optional*):
+ mask_overlay_kwargs (`dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
- images (`List`):
+ images (`list`):
Generated images. (tensor output of the vae decoder.)
"""
@@ -1113,10 +1113,14 @@ def description(self):
class QwenImageAutoBlocks(SequentialPipelineBlocks):
"""
Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.
- - for image-to-image generation, you need to provide `image`
- - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.
- - to run the controlnet workflow, you need to provide `control_image`
- - for text-to-image generation, all you need to provide is `prompt`
+
+ Supported workflows:
+ - `text2image`: requires `prompt`
+ - `image2image`: requires `prompt`, `image`
+ - `inpainting`: requires `prompt`, `mask_image`, `image`
+ - `controlnet_text2image`: requires `prompt`, `control_image`
+ - `controlnet_image2image`: requires `prompt`, `image`, `control_image`
+ - `controlnet_inpainting`: requires `prompt`, `mask_image`, `image`, `control_image`
Components:
text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
@@ -1134,7 +1138,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
Maximum sequence length for prompt encoding.
mask_image (`Image`, *optional*):
Mask image for inpainting.
- image (`Union[Image, List]`, *optional*):
+ image (`Image | list`, *optional*):
Reference image(s) for denoising. Can be a single image or list of images.
height (`int`, *optional*):
The height in pixels of the generated image.
@@ -1160,9 +1164,9 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -1183,12 +1187,12 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
Scale for ControlNet conditioning.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
- mask_overlay_kwargs (`Dict`, *optional*):
+ mask_overlay_kwargs (`dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
- images (`List`):
+ images (`list`):
Generated images.
"""
@@ -1197,15 +1201,23 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
block_classes = AUTO_BLOCKS.values()
block_names = AUTO_BLOCKS.keys()
+ # Workflow map defines the trigger conditions for each workflow.
+ # How to define:
+ # - Only include required inputs and trigger inputs (inputs that determine which blocks run)
+    # - Currently only `True` is supported, meaning the workflow triggers when that input is not None
+
+ _workflow_map = {
+ "text2image": {"prompt": True},
+ "image2image": {"prompt": True, "image": True},
+ "inpainting": {"prompt": True, "mask_image": True, "image": True},
+ "controlnet_text2image": {"prompt": True, "control_image": True},
+ "controlnet_image2image": {"prompt": True, "image": True, "control_image": True},
+ "controlnet_inpainting": {"prompt": True, "mask_image": True, "image": True, "control_image": True},
+ }
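+    # Illustrative usage (names come from `_workflow_map` above):
+    #   QwenImageAutoBlocks().get_workflow("text2image") returns the sequential blocks for the
+    #   text-to-image path, resolved via `get_execution_blocks(prompt=True)`.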
+
@property
def description(self):
- return (
- "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
- + "- for image-to-image generation, you need to provide `image`\n"
- + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n"
- + "- to run the controlnet workflow, you need to provide `control_image`\n"
- + "- for text-to-image generation, all you need to provide is `prompt`"
- )
+ return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage."
@property
def outputs(self):
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
index 21a7044c9f6e..37b80b69ec7e 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
@@ -67,7 +67,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
prompt (`str`):
The prompt or prompts to guide image generation.
@@ -75,7 +75,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
The prompt or prompts not to guide the image generation.
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings.
@@ -115,13 +115,13 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
(`AutoencoderKLQwenImage`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
The resized images
processed_image (`Tensor`):
The processed image
@@ -156,7 +156,7 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
(`AutoencoderKLQwenImage`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
mask_image (`Image`):
Mask image for inpainting.
@@ -166,13 +166,13 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
The resized images
processed_image (`Tensor`):
The processed image
processed_mask_image (`Tensor`):
The processed mask image
- mask_overlay_kwargs (`Dict`):
+ mask_overlay_kwargs (`dict`):
The kwargs for the postprocess step to apply the mask overlay
image_latents (`Tensor`):
The latent representation of the input image.
@@ -450,9 +450,9 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -526,11 +526,11 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -627,7 +627,7 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
Output format: 'pil', 'np', 'pt'.
Outputs:
- images (`List`):
+ images (`list`):
Generated images. (tensor output of the vae decoder.)
"""
@@ -656,12 +656,12 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
- mask_overlay_kwargs (`Dict`, *optional*):
+ mask_overlay_kwargs (`dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
- images (`List`):
+ images (`list`):
Generated images. (tensor output of the vae decoder.)
"""
@@ -718,6 +718,11 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
`padding_mask_crop`
+
+ Supported workflows:
+ - `image_conditioned`: requires `prompt`, `image`
+ - `image_conditioned_inpainting`: requires `prompt`, `mask_image`, `image`
+
Components:
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
@@ -725,7 +730,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
(`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
prompt (`str`):
The prompt or prompts to guide image generation.
@@ -751,28 +756,32 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
- mask_overlay_kwargs (`Dict`, *optional*):
+ mask_overlay_kwargs (`dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. generated in
InpaintProcessImagesInputStep.
Outputs:
- images (`List`):
+ images (`list`):
Generated images.
"""
model_name = "qwenimage-edit"
block_classes = EDIT_AUTO_BLOCKS.values()
block_names = EDIT_AUTO_BLOCKS.keys()
+ _workflow_map = {
+ "image_conditioned": {"prompt": True, "image": True},
+ "image_conditioned_inpainting": {"prompt": True, "mask_image": True, "image": True},
+ }
@property
def description(self):
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
index 56652c94c4b0..4a1f418d7b45 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
@@ -58,7 +58,7 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
prompt (`str`):
The prompt or prompts to guide image generation.
@@ -66,9 +66,9 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
The prompt or prompts not to guide the image generation.
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
Images resized to 1024x1024 target area for VAE encoding
- resized_cond_image (`List`):
+ resized_cond_image (`list`):
Images resized to 384x384 target area for VL text encoding
prompt_embeds (`Tensor`):
The prompt embeddings.
@@ -108,15 +108,15 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
(`AutoencoderKLQwenImage`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
Images resized to 1024x1024 target area for VAE encoding
- resized_cond_image (`List`):
+ resized_cond_image (`list`):
Images resized to 384x384 target area for VL text encoding
processed_image (`Tensor`):
The processed image
@@ -189,9 +189,9 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
The negative prompt embeddings. (batch-expanded)
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask. (batch-expanded)
- image_height (`List`):
+ image_height (`list`):
The image heights calculated from the image latents dimension
- image_width (`List`):
+ image_width (`list`):
The image widths calculated from the image latents dimension
height (`int`):
if not provided, updated to image height
@@ -253,9 +253,9 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -315,7 +315,7 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
Output format: 'pil', 'np', 'pt'.
Outputs:
- images (`List`):
+ images (`list`):
Generated images. (tensor output of the vae decoder.)
"""
@@ -357,7 +357,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
transformer (`QwenImageTransformer2DModel`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
prompt (`str`):
The prompt or prompts to guide image generation.
@@ -375,9 +375,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -385,7 +385,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
Output format: 'pil', 'np', 'pt'.
Outputs:
- images (`List`):
+ images (`list`):
Generated images.
"""
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
index 43cefa5eb658..a10454f1fb0c 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
@@ -60,7 +60,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
(`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
@@ -74,7 +74,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
Maximum sequence length for prompt encoding.
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
The resized images
prompt (`str`):
The prompt or prompts to guide image generation. If not provided, updated using image caption
@@ -117,7 +117,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
(`AutoencoderKLQwenImage`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
@@ -125,7 +125,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
Outputs:
- resized_image (`List`):
+ resized_image (`list`):
The resized images
processed_image (`Tensor`):
The processed image
@@ -250,9 +250,9 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -317,7 +317,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
Inputs:
- image (`Union[Image, List]`):
+ image (`Image | list`):
Reference image(s) for denoising. Can be a single image or list of images.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to, can be 1024 or 640
@@ -339,9 +339,9 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
Number of layers to extract from the image
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
- sigmas (`List`, *optional*):
+ sigmas (`list`, *optional*):
Custom sigmas for the denoising process.
- attention_kwargs (`Dict`, *optional*):
+ attention_kwargs (`dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`None`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
@@ -349,7 +349,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
Output format: 'pil', 'np', 'pt'.
Outputs:
- images (`List`):
+ images (`list`):
Generated images.
"""
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py
index 59ec46dc6d36..44f1c555cef3 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py
@@ -21,21 +21,7 @@
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
- _import_structure["encoders"] = ["StableDiffusionXLTextEncoderStep"]
- _import_structure["modular_blocks"] = [
- "ALL_BLOCKS",
- "AUTO_BLOCKS",
- "CONTROLNET_BLOCKS",
- "IMAGE2IMAGE_BLOCKS",
- "INPAINT_BLOCKS",
- "IP_ADAPTER_BLOCKS",
- "TEXT2IMAGE_BLOCKS",
- "StableDiffusionXLAutoBlocks",
- "StableDiffusionXLAutoControlnetStep",
- "StableDiffusionXLAutoDecodeStep",
- "StableDiffusionXLAutoIPAdapterStep",
- "StableDiffusionXLAutoVaeEncoderStep",
- ]
+ _import_structure["modular_blocks_stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks"]
_import_structure["modular_pipeline"] = ["StableDiffusionXLModularPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -45,23 +31,7 @@
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
- from .encoders import (
- StableDiffusionXLTextEncoderStep,
- )
- from .modular_blocks import (
- ALL_BLOCKS,
- AUTO_BLOCKS,
- CONTROLNET_BLOCKS,
- IMAGE2IMAGE_BLOCKS,
- INPAINT_BLOCKS,
- IP_ADAPTER_BLOCKS,
- TEXT2IMAGE_BLOCKS,
- StableDiffusionXLAutoBlocks,
- StableDiffusionXLAutoControlnetStep,
- StableDiffusionXLAutoDecodeStep,
- StableDiffusionXLAutoIPAdapterStep,
- StableDiffusionXLAutoVaeEncoderStep,
- )
+ from .modular_blocks_stable_diffusion_xl import StableDiffusionXLAutoBlocks
from .modular_pipeline import StableDiffusionXLModularPipeline
else:
import sys
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py
similarity index 55%
rename from src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py
rename to src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py
index 68b5e33755b5..a7a18e514777 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py
@@ -14,7 +14,7 @@
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict
+from ..modular_pipeline_utils import OutputParam
from .before_denoise import (
StableDiffusionXLControlNetInputStep,
StableDiffusionXLControlNetUnionInputStep,
@@ -277,7 +277,161 @@ def description(self):
# ip-adapter, controlnet, text2img, img2img, inpainting
+# auto_docstring
class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks):
+ """
+ Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion
+ XL.
+
+ Supported workflows:
+ - `text2image`: requires `prompt`
+ - `image2image`: requires `image`, `prompt`
+ - `inpainting`: requires `mask_image`, `image`, `prompt`
+ - `controlnet_text2image`: requires `control_image`, `prompt`
+ - `controlnet_image2image`: requires `control_image`, `image`, `prompt`
+ - `controlnet_inpainting`: requires `control_image`, `mask_image`, `image`, `prompt`
+ - `controlnet_union_text2image`: requires `control_image`, `control_mode`, `prompt`
+ - `controlnet_union_image2image`: requires `control_image`, `control_mode`, `image`, `prompt`
+ - `controlnet_union_inpainting`: requires `control_image`, `control_mode`, `mask_image`, `image`, `prompt`
+ - `ip_adapter_text2image`: requires `ip_adapter_image`, `prompt`
+ - `ip_adapter_image2image`: requires `ip_adapter_image`, `image`, `prompt`
+ - `ip_adapter_inpainting`: requires `ip_adapter_image`, `mask_image`, `image`, `prompt`
+ - `ip_adapter_controlnet_text2image`: requires `ip_adapter_image`, `control_image`, `prompt`
+ - `ip_adapter_controlnet_image2image`: requires `ip_adapter_image`, `control_image`, `image`, `prompt`
+ - `ip_adapter_controlnet_inpainting`: requires `ip_adapter_image`, `control_image`, `mask_image`, `image`,
+ `prompt`
+ - `ip_adapter_controlnet_union_text2image`: requires `ip_adapter_image`, `control_image`, `control_mode`,
+ `prompt`
+ - `ip_adapter_controlnet_union_image2image`: requires `ip_adapter_image`, `control_image`, `control_mode`,
+ `image`, `prompt`
+ - `ip_adapter_controlnet_union_inpainting`: requires `ip_adapter_image`, `control_image`, `control_mode`,
+ `mask_image`, `image`, `prompt`
+
+ Components:
+ text_encoder (`CLIPTextModel`) text_encoder_2 (`CLIPTextModelWithProjection`) tokenizer (`CLIPTokenizer`)
+ tokenizer_2 (`CLIPTokenizer`) guider (`ClassifierFreeGuidance`) image_encoder
+ (`CLIPVisionModelWithProjection`) feature_extractor (`CLIPImageProcessor`) unet (`UNet2DConditionModel`) vae
+ (`AutoencoderKL`) image_processor (`VaeImageProcessor`) mask_processor (`VaeImageProcessor`) scheduler
+ (`EulerDiscreteScheduler`) controlnet (`ControlNetUnionModel`) control_image_processor (`VaeImageProcessor`)
+
+ Configs:
+ force_zeros_for_empty_prompt (default: True) requires_aesthetics_score (default: False)
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ prompt_2 (`None`, *optional*):
+ TODO: Add description.
+ negative_prompt (`None`, *optional*):
+ TODO: Add description.
+ negative_prompt_2 (`None`, *optional*):
+ TODO: Add description.
+ cross_attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ clip_skip (`None`, *optional*):
+ TODO: Add description.
+        ip_adapter_image (`Image | ndarray | Tensor | list`, *optional*):
+ The image(s) to be used as ip adapter
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ image (`None`, *optional*):
+ TODO: Add description.
+ mask_image (`None`, *optional*):
+ TODO: Add description.
+ padding_mask_crop (`None`, *optional*):
+ TODO: Add description.
+ dtype (`dtype`, *optional*):
+ The dtype of the model inputs
+ generator (`None`, *optional*):
+ TODO: Add description.
+ preprocess_kwargs (`dict | NoneType`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under
+ `self.image_processor` in [diffusers.image_processor.VaeImageProcessor]
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ ip_adapter_embeds (`list`, *optional*):
+ Pre-generated image embeddings for IP-Adapter. Can be generated from ip_adapter step.
+ negative_ip_adapter_embeds (`list`, *optional*):
+ Pre-generated negative image embeddings for IP-Adapter. Can be generated from ip_adapter step.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ denoising_end (`None`, *optional*):
+ TODO: Add description.
+ strength (`None`, *optional*, defaults to 0.3):
+ TODO: Add description.
+ denoising_start (`None`, *optional*):
+ TODO: Add description.
+ latents (`None`):
+ TODO: Add description.
+ image_latents (`Tensor`, *optional*):
+ The latents representing the reference image for image-to-image/inpainting generation. Can be generated
+ in vae_encode step.
+ mask (`Tensor`, *optional*):
+ The mask for the inpainting generation. Can be generated in vae_encode step.
+ masked_image_latents (`Tensor`, *optional*):
+ The masked image latents for the inpainting generation (only for inpainting-specific unet). Can be
+ generated in vae_encode step.
+ original_size (`None`, *optional*):
+ TODO: Add description.
+ target_size (`None`, *optional*):
+ TODO: Add description.
+ negative_original_size (`None`, *optional*):
+ TODO: Add description.
+ negative_target_size (`None`, *optional*):
+ TODO: Add description.
+ crops_coords_top_left (`None`, *optional*, defaults to (0, 0)):
+ TODO: Add description.
+ negative_crops_coords_top_left (`None`, *optional*, defaults to (0, 0)):
+ TODO: Add description.
+ aesthetic_score (`None`, *optional*, defaults to 6.0):
+ TODO: Add description.
+ negative_aesthetic_score (`None`, *optional*, defaults to 2.0):
+ TODO: Add description.
+ control_image (`None`, *optional*):
+ TODO: Add description.
+ control_mode (`None`, *optional*):
+ TODO: Add description.
+ control_guidance_start (`None`, *optional*, defaults to 0.0):
+ TODO: Add description.
+ control_guidance_end (`None`, *optional*, defaults to 1.0):
+ TODO: Add description.
+ controlnet_conditioning_scale (`None`, *optional*, defaults to 1.0):
+ TODO: Add description.
+ guess_mode (`None`, *optional*, defaults to False):
+ TODO: Add description.
+ crops_coords (`tuple | NoneType`, *optional*):
+ The crop coordinates to use for preprocess/postprocess the image and mask, for inpainting task only. Can
+ be generated in vae_encode step.
+ controlnet_cond (`Tensor`, *optional*):
+ The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.
+ conditioning_scale (`float`, *optional*):
+ The controlnet conditioning scale value to use for the denoising process. Can be generated in
+ prepare_controlnet_inputs step.
+ controlnet_keep (`list`, *optional*):
+ The controlnet keep values to use for the denoising process. Can be generated in
+ prepare_controlnet_inputs step.
+ **denoiser_input_fields (`None`, *optional*):
+ All conditional model inputs that need to be prepared with guider. It should contain
+ prompt_embeds/negative_prompt_embeds, add_time_ids/negative_add_time_ids,
+ pooled_prompt_embeds/negative_pooled_prompt_embeds, and ip_adapter_embeds/negative_ip_adapter_embeds
+            (optional). Please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when
+ they are created and added to the pipeline state
+ eta (`None`, *optional*, defaults to 0.0):
+ TODO: Add description.
+ output_type (`None`, *optional*, defaults to pil):
+ TODO: Add description.
+
+ Outputs:
+ images (`list`):
+ Generated images.
+ """
+
block_classes = [
StableDiffusionXLTextEncoderStep,
StableDiffusionXLAutoIPAdapterStep,
@@ -293,103 +447,66 @@ class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks):
"decode",
]
- @property
- def description(self):
- return (
- "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL.\n"
- + "- for image-to-image generation, you need to provide either `image` or `image_latents`\n"
- + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
- + "- to run the controlnet workflow, you need to provide `control_image`\n"
- + "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n"
- + "- to run the ip_adapter workflow, you need to provide `ip_adapter_image`\n"
- + "- for text-to-image generation, all you need to provide is `prompt`"
- )
-
-
-# controlnet (input + denoise step)
-class StableDiffusionXLAutoControlnetStep(SequentialPipelineBlocks):
- block_classes = [
- StableDiffusionXLAutoControlNetInputStep,
- StableDiffusionXLAutoControlNetDenoiseStep,
- ]
- block_names = ["controlnet_input", "controlnet_denoise"]
+ _workflow_map = {
+ "text2image": {"prompt": True},
+ "image2image": {"image": True, "prompt": True},
+ "inpainting": {"mask_image": True, "image": True, "prompt": True},
+ "controlnet_text2image": {"control_image": True, "prompt": True},
+ "controlnet_image2image": {"control_image": True, "image": True, "prompt": True},
+ "controlnet_inpainting": {"control_image": True, "mask_image": True, "image": True, "prompt": True},
+ "controlnet_union_text2image": {"control_image": True, "control_mode": True, "prompt": True},
+ "controlnet_union_image2image": {"control_image": True, "control_mode": True, "image": True, "prompt": True},
+ "controlnet_union_inpainting": {
+ "control_image": True,
+ "control_mode": True,
+ "mask_image": True,
+ "image": True,
+ "prompt": True,
+ },
+ "ip_adapter_text2image": {"ip_adapter_image": True, "prompt": True},
+ "ip_adapter_image2image": {"ip_adapter_image": True, "image": True, "prompt": True},
+ "ip_adapter_inpainting": {"ip_adapter_image": True, "mask_image": True, "image": True, "prompt": True},
+ "ip_adapter_controlnet_text2image": {"ip_adapter_image": True, "control_image": True, "prompt": True},
+ "ip_adapter_controlnet_image2image": {
+ "ip_adapter_image": True,
+ "control_image": True,
+ "image": True,
+ "prompt": True,
+ },
+ "ip_adapter_controlnet_inpainting": {
+ "ip_adapter_image": True,
+ "control_image": True,
+ "mask_image": True,
+ "image": True,
+ "prompt": True,
+ },
+ "ip_adapter_controlnet_union_text2image": {
+ "ip_adapter_image": True,
+ "control_image": True,
+ "control_mode": True,
+ "prompt": True,
+ },
+ "ip_adapter_controlnet_union_image2image": {
+ "ip_adapter_image": True,
+ "control_image": True,
+ "control_mode": True,
+ "image": True,
+ "prompt": True,
+ },
+ "ip_adapter_controlnet_union_inpainting": {
+ "ip_adapter_image": True,
+ "control_image": True,
+ "control_mode": True,
+ "mask_image": True,
+ "image": True,
+ "prompt": True,
+ },
+ }
@property
def description(self):
- return (
- "Controlnet auto step that prepare the controlnet input and denoise the latents. "
- + "It works for both controlnet and controlnet_union and supports text2img, img2img and inpainting tasks."
- + " (it should be replace at 'denoise' step)"
- )
-
-
-TEXT2IMAGE_BLOCKS = InsertableDict(
- [
- ("text_encoder", StableDiffusionXLTextEncoderStep),
- ("input", StableDiffusionXLInputStep),
- ("set_timesteps", StableDiffusionXLSetTimestepsStep),
- ("prepare_latents", StableDiffusionXLPrepareLatentsStep),
- ("prepare_add_cond", StableDiffusionXLPrepareAdditionalConditioningStep),
- ("denoise", StableDiffusionXLDenoiseStep),
- ("decode", StableDiffusionXLDecodeStep),
- ]
-)
+ return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL."
-IMAGE2IMAGE_BLOCKS = InsertableDict(
- [
- ("text_encoder", StableDiffusionXLTextEncoderStep),
- ("vae_encoder", StableDiffusionXLVaeEncoderStep),
- ("input", StableDiffusionXLInputStep),
- ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
- ("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep),
- ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep),
- ("denoise", StableDiffusionXLDenoiseStep),
- ("decode", StableDiffusionXLDecodeStep),
- ]
-)
-
-INPAINT_BLOCKS = InsertableDict(
- [
- ("text_encoder", StableDiffusionXLTextEncoderStep),
- ("vae_encoder", StableDiffusionXLInpaintVaeEncoderStep),
- ("input", StableDiffusionXLInputStep),
- ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
- ("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep),
- ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep),
- ("denoise", StableDiffusionXLInpaintDenoiseStep),
- ("decode", StableDiffusionXLInpaintDecodeStep),
- ]
-)
-
-CONTROLNET_BLOCKS = InsertableDict(
- [
- ("denoise", StableDiffusionXLAutoControlnetStep),
- ]
-)
-
-
-IP_ADAPTER_BLOCKS = InsertableDict(
- [
- ("ip_adapter", StableDiffusionXLAutoIPAdapterStep),
- ]
-)
-
-AUTO_BLOCKS = InsertableDict(
- [
- ("text_encoder", StableDiffusionXLTextEncoderStep),
- ("ip_adapter", StableDiffusionXLAutoIPAdapterStep),
- ("vae_encoder", StableDiffusionXLAutoVaeEncoderStep),
- ("denoise", StableDiffusionXLCoreDenoiseStep),
- ("decode", StableDiffusionXLAutoDecodeStep),
- ]
-)
-
-
-ALL_BLOCKS = {
- "text2img": TEXT2IMAGE_BLOCKS,
- "img2img": IMAGE2IMAGE_BLOCKS,
- "inpaint": INPAINT_BLOCKS,
- "controlnet": CONTROLNET_BLOCKS,
- "ip_adapter": IP_ADAPTER_BLOCKS,
- "auto": AUTO_BLOCKS,
-}
+ @property
+ def outputs(self):
+ return [OutputParam.template("images")]
diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py
index d01a86ca09b5..b641c6cd7fcc 100644
--- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py
+++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py
@@ -14,6 +14,7 @@
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
+from ..modular_pipeline_utils import OutputParam
from .before_denoise import (
WanPrepareLatentsStep,
WanSetTimestepsStep,
@@ -37,7 +38,45 @@
# inputs(text) -> set_timesteps -> prepare_latents -> denoise
+# auto_docstring
class WanCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ denoise block that takes encoded conditions and runs the denoising process.
+
+ Components:
+ transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`)
+
+ Inputs:
+ num_videos_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ num_frames (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
model_name = "wan"
block_classes = [
WanTextInputStep,
@@ -49,14 +88,11 @@ class WanCoreDenoiseStep(SequentialPipelineBlocks):
@property
def description(self):
- return (
- "denoise block that takes encoded conditions and runs the denoising process.\n"
- + "This is a sequential pipeline blocks:\n"
- + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
- + " - `WanSetTimestepsStep` is used to set the timesteps\n"
- + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
- + " - `WanDenoiseStep` is used to denoise the latents\n"
- )
+ return "denoise block that takes encoded conditions and runs the denoising process."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("latents")]
# ====================
@@ -64,7 +100,51 @@ def description(self):
# ====================
+# auto_docstring
class WanBlocks(SequentialPipelineBlocks):
+ """
+ Modular pipeline blocks for Wan2.1.
+
+ Components:
+ text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer
+ (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) vae (`AutoencoderKLWan`) video_processor
+ (`VideoProcessor`)
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ negative_prompt (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`None`, *optional*, defaults to 512):
+ TODO: Add description.
+ num_videos_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ num_frames (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ output_type (`str`, *optional*, defaults to np):
+ The output type of the decoded videos
+
+ Outputs:
+ videos (`list`):
+ The generated videos.
+ """
+
model_name = "wan"
block_classes = [
WanTextEncoderStep,
@@ -75,9 +155,8 @@ class WanBlocks(SequentialPipelineBlocks):
@property
def description(self):
- return (
- "Modular pipeline blocks for Wan2.1.\n"
- + "- `WanTextEncoderStep` is used to encode the text\n"
- + "- `WanCoreDenoiseStep` is used to denoise the latents\n"
- + "- `WanVaeDecoderStep` is used to decode the latents to images"
- )
+ return "Modular pipeline blocks for Wan2.1."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("videos")]
diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py
index 21164422f3d9..9f602c24713b 100644
--- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py
+++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py
@@ -14,6 +14,7 @@
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
+from ..modular_pipeline_utils import OutputParam
from .before_denoise import (
WanPrepareLatentsStep,
WanSetTimestepsStep,
@@ -38,7 +39,50 @@
# inputs(text) -> set_timesteps -> prepare_latents -> denoise
+# auto_docstring
class Wan22CoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ denoise block that takes encoded conditions and runs the denoising process.
+
+ Components:
+ transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`)
+ guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`)
+
+ Configs:
+ boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low
+ noise stages.
+
+ Inputs:
+ num_videos_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ num_frames (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
model_name = "wan"
block_classes = [
WanTextInputStep,
@@ -50,14 +94,11 @@ class Wan22CoreDenoiseStep(SequentialPipelineBlocks):
@property
def description(self):
- return (
- "denoise block that takes encoded conditions and runs the denoising process.\n"
- + "This is a sequential pipeline blocks:\n"
- + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
- + " - `WanSetTimestepsStep` is used to set the timesteps\n"
- + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
- + " - `Wan22DenoiseStep` is used to denoise the latents in wan2.2\n"
- )
+ return "denoise block that takes encoded conditions and runs the denoising process."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("latents")]
# ====================
@@ -65,7 +106,55 @@ def description(self):
# ====================
+# auto_docstring
class Wan22Blocks(SequentialPipelineBlocks):
+ """
+ Modular pipeline for text-to-video using Wan2.2.
+
+ Components:
+ text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer
+ (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider_2 (`ClassifierFreeGuidance`)
+ transformer_2 (`WanTransformer3DModel`) vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`)
+
+ Configs:
+ boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low
+ noise stages.
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ negative_prompt (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`None`, *optional*, defaults to 512):
+ TODO: Add description.
+ num_videos_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ num_frames (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ output_type (`str`, *optional*, defaults to np):
+ The output type of the decoded videos
+
+ Outputs:
+ videos (`list`):
+ The generated videos.
+ """
+
model_name = "wan"
block_classes = [
WanTextEncoderStep,
@@ -80,9 +169,8 @@ class Wan22Blocks(SequentialPipelineBlocks):
@property
def description(self):
- return (
- "Modular pipeline for text-to-video using Wan2.2.\n"
- + " - `WanTextEncoderStep` encodes the text\n"
- + " - `Wan22CoreDenoiseStep` denoes the latents\n"
- + " - `WanVaeDecoderStep` decodes the latents to video frames\n"
- )
+ return "Modular pipeline for text-to-video using Wan2.2."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("videos")]
diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py
index 3db1c8fa837b..8e55b7a50f08 100644
--- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py
+++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py
@@ -14,6 +14,7 @@
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
+from ..modular_pipeline_utils import OutputParam
from .before_denoise import (
WanAdditionalInputsStep,
WanPrepareLatentsStep,
@@ -40,7 +41,36 @@
# ====================
+# auto_docstring
class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks):
+ """
+    Image2Video Vae Image Encoder step that resizes the image and encodes the first frame image to its latent
+    representation
+
+ Components:
+ vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`)
+
+ Inputs:
+ image (`Image`):
+ TODO: Add description.
+ height (`int`, *optional*, defaults to 480):
+ TODO: Add description.
+ width (`int`, *optional*, defaults to 832):
+ TODO: Add description.
+ num_frames (`int`, *optional*, defaults to 81):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ resized_image (`Image`):
+ TODO: Add description.
+ first_frame_latents (`Tensor`):
+ video latent representation with the first frame image condition
+ image_condition_latents (`Tensor | NoneType`):
+ TODO: Add description.
+ """
+
model_name = "wan-i2v"
block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep]
block_names = ["image_resize", "vae_encoder", "prepare_first_frame_latents"]
@@ -56,7 +86,52 @@ def description(self):
# inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents)
+# auto_docstring
class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ denoise block that takes encoded text and image latent conditions and runs the denoising process.
+
+ Components:
+ transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`)
+ guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`)
+
+ Configs:
+ boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low
+ noise stages.
+
+ Inputs:
+ num_videos_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ num_frames (`None`, *optional*):
+ TODO: Add description.
+ image_condition_latents (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
model_name = "wan-i2v"
block_classes = [
WanTextInputStep,
@@ -75,15 +150,11 @@ class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks):
@property
def description(self):
- return (
- "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
- + "This is a sequential pipeline blocks:\n"
- + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
- + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n"
- + " - `WanSetTimestepsStep` is used to set the timesteps\n"
- + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
- + " - `Wan22Image2VideoDenoiseStep` is used to denoise the latents in wan2.2\n"
- )
+ return "denoise block that takes encoded text and image latent conditions and runs the denoising process."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("latents")]
# ====================
@@ -91,7 +162,57 @@ def description(self):
# ====================
+# auto_docstring
class Wan22Image2VideoBlocks(SequentialPipelineBlocks):
+ """
+ Modular pipeline for image-to-video using Wan2.2.
+
+ Components:
+ text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) vae
+ (`AutoencoderKLWan`) video_processor (`VideoProcessor`) transformer (`WanTransformer3DModel`) scheduler
+ (`UniPCMultistepScheduler`) guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`)
+
+ Configs:
+ boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low
+ noise stages.
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ negative_prompt (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`None`, *optional*, defaults to 512):
+ TODO: Add description.
+ image (`Image`):
+ TODO: Add description.
+ height (`int`, *optional*, defaults to 480):
+ TODO: Add description.
+ width (`int`, *optional*, defaults to 832):
+ TODO: Add description.
+ num_frames (`int`, *optional*, defaults to 81):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_videos_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ output_type (`str`, *optional*, defaults to np):
+ The output type of the decoded videos
+
+ Outputs:
+ videos (`list`):
+ The generated videos.
+ """
+
model_name = "wan-i2v"
block_classes = [
WanTextEncoderStep,
@@ -108,10 +229,8 @@ class Wan22Image2VideoBlocks(SequentialPipelineBlocks):
@property
def description(self):
- return (
- "Modular pipeline for image-to-video using Wan2.2.\n"
- + " - `WanTextEncoderStep` encodes the text\n"
- + " - `WanImage2VideoVaeEncoderStep` encodes the image\n"
- + " - `Wan22Image2VideoCoreDenoiseStep` denoes the latents\n"
- + " - `WanVaeDecoderStep` decodes the latents to video frames\n"
- )
+ return "Modular pipeline for image-to-video using Wan2.2."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("videos")]
diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py
index d07ab8ecf473..c08db62c469a 100644
--- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py
+++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py
@@ -14,6 +14,7 @@
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
+from ..modular_pipeline_utils import OutputParam
from .before_denoise import (
WanAdditionalInputsStep,
WanPrepareLatentsStep,
@@ -45,7 +46,29 @@
# wan2.1 I2V (first frame only)
+# auto_docstring
class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks):
+ """
+    Image2Video Image Encoder step that resizes the image and encodes it to generate the image embeddings
+
+ Components:
+ image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`)
+
+ Inputs:
+ image (`Image`):
+ TODO: Add description.
+ height (`int`, *optional*, defaults to 480):
+ TODO: Add description.
+ width (`int`, *optional*, defaults to 832):
+ TODO: Add description.
+
+ Outputs:
+ resized_image (`Image`):
+ TODO: Add description.
+ image_embeds (`Tensor`):
+ The image embeddings
+ """
+
model_name = "wan-i2v"
block_classes = [WanImageResizeStep, WanImageEncoderStep]
block_names = ["image_resize", "image_encoder"]
@@ -56,7 +79,34 @@ def description(self):
# wan2.1 FLF2V (first and last frame)
+# auto_docstring
class WanFLF2VImageEncoderStep(SequentialPipelineBlocks):
+ """
+    FLF2V Image Encoder step that resizes and encodes the first and last frame images to generate the image
+    embeddings
+
+ Components:
+ image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`)
+
+ Inputs:
+ image (`Image`):
+ TODO: Add description.
+ height (`int`, *optional*, defaults to 480):
+ TODO: Add description.
+ width (`int`, *optional*, defaults to 832):
+ TODO: Add description.
+ last_image (`Image`):
+            The last frame image
+
+ Outputs:
+ resized_image (`Image`):
+ TODO: Add description.
+ resized_last_image (`Image`):
+ TODO: Add description.
+ image_embeds (`Tensor`):
+ The image embeddings
+ """
+
model_name = "wan-i2v"
block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameImageEncoderStep]
block_names = ["image_resize", "last_image_resize", "image_encoder"]
@@ -67,7 +117,36 @@ def description(self):
# wan2.1 Auto Image Encoder
+# auto_docstring
class WanAutoImageEncoderStep(AutoPipelineBlocks):
+ """
+    Image Encoder step that encodes the image to generate the image embeddings. This is an auto pipeline block that
+    works for image2video tasks. - `WanFLF2VImageEncoderStep` (flf2v) is used when `last_image` is provided. -
+    `WanImage2VideoImageEncoderStep` (image2video) is used when `image` is provided. - if `last_image` or `image` is
+    not provided, step will be skipped.
+
+ Components:
+ image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`)
+
+ Inputs:
+ image (`Image`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*, defaults to 480):
+ TODO: Add description.
+ width (`int`, *optional*, defaults to 832):
+ TODO: Add description.
+ last_image (`Image`, *optional*):
+            The last frame image
+
+ Outputs:
+ resized_image (`Image`):
+ TODO: Add description.
+ resized_last_image (`Image`):
+ TODO: Add description.
+ image_embeds (`Tensor`):
+ The image embeddings
+ """
+
block_classes = [WanFLF2VImageEncoderStep, WanImage2VideoImageEncoderStep]
block_names = ["flf2v_image_encoder", "image2video_image_encoder"]
block_trigger_inputs = ["last_image", "image"]
@@ -90,7 +169,36 @@ def description(self):
# wan2.1 I2V (first frame only)
+# auto_docstring
class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks):
+ """
+    Image2Video Vae Image Encoder step that resizes the image and encodes the first frame image to its latent
+    representation
+
+ Components:
+ vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`)
+
+ Inputs:
+ image (`Image`):
+ TODO: Add description.
+ height (`int`, *optional*, defaults to 480):
+ TODO: Add description.
+ width (`int`, *optional*, defaults to 832):
+ TODO: Add description.
+ num_frames (`int`, *optional*, defaults to 81):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ resized_image (`Image`):
+ TODO: Add description.
+ first_frame_latents (`Tensor`):
+ video latent representation with the first frame image condition
+ image_condition_latents (`Tensor | NoneType`):
+ TODO: Add description.
+ """
+
model_name = "wan-i2v"
block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep]
block_names = ["image_resize", "vae_encoder", "prepare_first_frame_latents"]
@@ -101,7 +209,40 @@ def description(self):
# wan2.1 FLF2V (first and last frame)
+# auto_docstring
class WanFLF2VVaeEncoderStep(SequentialPipelineBlocks):
+ """
+    FLF2V Vae Image Encoder step that resizes and encodes the first and last frame images to generate the latent
+    conditions
+
+ Components:
+ vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`)
+
+ Inputs:
+ image (`Image`):
+ TODO: Add description.
+ height (`int`, *optional*, defaults to 480):
+ TODO: Add description.
+ width (`int`, *optional*, defaults to 832):
+ TODO: Add description.
+ last_image (`Image`):
+            The last frame image
+ num_frames (`int`, *optional*, defaults to 81):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ resized_image (`Image`):
+ TODO: Add description.
+ resized_last_image (`Image`):
+ TODO: Add description.
+ first_last_frame_latents (`Tensor`):
+ video latent representation with the first and last frame images condition
+ image_condition_latents (`Tensor | NoneType`):
+ TODO: Add description.
+ """
+
model_name = "wan-i2v"
block_classes = [
WanImageResizeStep,
@@ -117,7 +258,44 @@ def description(self):
# wan2.1 Auto Vae Encoder
+# auto_docstring
class WanAutoVaeEncoderStep(AutoPipelineBlocks):
+ """
+    Vae Image Encoder step that encodes the image to generate the image latents. This is an auto pipeline block that
+    works for image2video tasks. - `WanFLF2VVaeEncoderStep` (flf2v) is used when `last_image` is provided. -
+    `WanImage2VideoVaeEncoderStep` (image2video) is used when `image` is provided. - if `last_image` or `image` is not
+    provided, step will be skipped.
+
+ Components:
+ vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`)
+
+ Inputs:
+ image (`Image`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*, defaults to 480):
+ TODO: Add description.
+ width (`int`, *optional*, defaults to 832):
+ TODO: Add description.
+ last_image (`Image`, *optional*):
+            The last frame image
+ num_frames (`int`, *optional*, defaults to 81):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ resized_image (`Image`):
+ TODO: Add description.
+ resized_last_image (`Image`):
+ TODO: Add description.
+ first_last_frame_latents (`Tensor`):
+ video latent representation with the first and last frame images condition
+ image_condition_latents (`Tensor | NoneType`):
+ TODO: Add description.
+ first_frame_latents (`Tensor`):
+ video latent representation with the first frame image condition
+ """
+
model_name = "wan-i2v"
block_classes = [WanFLF2VVaeEncoderStep, WanImage2VideoVaeEncoderStep]
block_names = ["flf2v_vae_encoder", "image2video_vae_encoder"]
@@ -141,7 +319,53 @@ def description(self):
# wan2.1 I2V core denoise (support both I2V and FLF2V)
# inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents)
+# auto_docstring
class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ denoise block that takes encoded text and image latent conditions and runs the denoising process.
+
+ Components:
+ transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`)
+
+ Inputs:
+ num_videos_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`Tensor`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ num_frames (`None`, *optional*):
+ TODO: Add description.
+ image_condition_latents (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_embeds (`Tensor`):
+ TODO: Add description.
+
+ Outputs:
+ batch_size (`int`):
+ Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt
+ dtype (`dtype`):
+ Data type of model tensor inputs (determined by `transformer.dtype`)
+ latents (`Tensor`):
+ The initial latents to use for the denoising process
+ """
+
model_name = "wan-i2v"
block_classes = [
WanTextInputStep,
@@ -160,15 +384,7 @@ class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks):
@property
def description(self):
- return (
- "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
- + "This is a sequential pipeline blocks:\n"
- + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n"
- + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n"
- + " - `WanSetTimestepsStep` is used to set the timesteps\n"
- + " - `WanPrepareLatentsStep` is used to prepare the latents\n"
- + " - `WanImage2VideoDenoiseStep` is used to denoise the latents\n"
- )
+ return "denoise block that takes encoded text and image latent conditions and runs the denoising process."
# ====================
@@ -177,7 +393,64 @@ def description(self):
# wan2.1 Image2Video Auto Blocks
+# auto_docstring
class WanImage2VideoAutoBlocks(SequentialPipelineBlocks):
+ """
+ Auto Modular pipeline for image-to-video using Wan.
+
+ Supported workflows:
+ - `image2video`: requires `image`, `prompt`
+ - `flf2v`: requires `last_image`, `image`, `prompt`
+
+ Components:
+ text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`)
+ image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) vae (`AutoencoderKLWan`)
+ video_processor (`VideoProcessor`) transformer (`WanTransformer3DModel`) scheduler
+ (`UniPCMultistepScheduler`)
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ negative_prompt (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`None`, *optional*, defaults to 512):
+ TODO: Add description.
+ image (`Image`, *optional*):
+ TODO: Add description.
+ height (`int`, *optional*, defaults to 480):
+ TODO: Add description.
+ width (`int`, *optional*, defaults to 832):
+ TODO: Add description.
+ last_image (`Image`, *optional*):
+            The last frame image
+ num_frames (`int`, *optional*, defaults to 81):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_videos_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ image_condition_latents (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 50):
+ TODO: Add description.
+ timesteps (`None`, *optional*):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ attention_kwargs (`None`, *optional*):
+ TODO: Add description.
+ image_embeds (`Tensor`):
+ TODO: Add description.
+ output_type (`str`, *optional*, defaults to np):
+ The output type of the decoded videos
+
+ Outputs:
+ videos (`list`):
+ The generated videos.
+ """
+
model_name = "wan-i2v"
block_classes = [
WanTextEncoderStep,
@@ -194,10 +467,15 @@ class WanImage2VideoAutoBlocks(SequentialPipelineBlocks):
"decode",
]
+ _workflow_map = {
+ "image2video": {"image": True, "prompt": True},
+ "flf2v": {"last_image": True, "image": True, "prompt": True},
+ }
+
@property
def description(self):
- return (
- "Auto Modular pipeline for image-to-video using Wan.\n"
- + "- for I2V workflow, all you need to provide is `image`"
- + "- for FLF2V workflow, all you need to provide is `last_image` and `image`"
- )
+ return "Auto Modular pipeline for image-to-video using Wan."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("videos")]
diff --git a/src/diffusers/modular_pipelines/z_image/__init__.py b/src/diffusers/modular_pipelines/z_image/__init__.py
index c8a8c14396c0..5c04008d3305 100644
--- a/src/diffusers/modular_pipelines/z_image/__init__.py
+++ b/src/diffusers/modular_pipelines/z_image/__init__.py
@@ -21,12 +21,7 @@
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
- _import_structure["decoders"] = ["ZImageVaeDecoderStep"]
- _import_structure["encoders"] = ["ZImageTextEncoderStep", "ZImageVaeImageEncoderStep"]
- _import_structure["modular_blocks"] = [
- "ALL_BLOCKS",
- "ZImageAutoBlocks",
- ]
+ _import_structure["modular_blocks_z_image"] = ["ZImageAutoBlocks"]
_import_structure["modular_pipeline"] = ["ZImageModularPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -36,12 +31,7 @@
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
- from .decoders import ZImageVaeDecoderStep
- from .encoders import ZImageTextEncoderStep
- from .modular_blocks import (
- ALL_BLOCKS,
- ZImageAutoBlocks,
- )
+ from .modular_blocks_z_image import ZImageAutoBlocks
from .modular_pipeline import ZImageModularPipeline
else:
import sys
diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks.py b/src/diffusers/modular_pipelines/z_image/modular_blocks.py
deleted file mode 100644
index a54baeccaf0c..000000000000
--- a/src/diffusers/modular_pipelines/z_image/modular_blocks.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ...utils import logging
-from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict
-from .before_denoise import (
- ZImageAdditionalInputsStep,
- ZImagePrepareLatentsStep,
- ZImagePrepareLatentswithImageStep,
- ZImageSetTimestepsStep,
- ZImageSetTimestepsWithStrengthStep,
- ZImageTextInputStep,
-)
-from .decoders import ZImageVaeDecoderStep
-from .denoise import (
- ZImageDenoiseStep,
-)
-from .encoders import (
- ZImageTextEncoderStep,
- ZImageVaeImageEncoderStep,
-)
-
-
-logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
-
-# z-image
-# text2image
-class ZImageCoreDenoiseStep(SequentialPipelineBlocks):
- block_classes = [
- ZImageTextInputStep,
- ZImagePrepareLatentsStep,
- ZImageSetTimestepsStep,
- ZImageDenoiseStep,
- ]
- block_names = ["input", "prepare_latents", "set_timesteps", "denoise"]
-
- @property
- def description(self):
- return (
- "denoise block that takes encoded conditions and runs the denoising process.\n"
- + "This is a sequential pipeline blocks:\n"
- + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n"
- + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n"
- + " - `ZImageSetTimestepsStep` is used to set the timesteps\n"
- + " - `ZImageDenoiseStep` is used to denoise the latents\n"
- )
-
-
-# z-image: image2image
-## denoise
-class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks):
- block_classes = [
- ZImageTextInputStep,
- ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
- ZImagePrepareLatentsStep,
- ZImageSetTimestepsStep,
- ZImageSetTimestepsWithStrengthStep,
- ZImagePrepareLatentswithImageStep,
- ZImageDenoiseStep,
- ]
- block_names = [
- "input",
- "additional_inputs",
- "prepare_latents",
- "set_timesteps",
- "set_timesteps_with_strength",
- "prepare_latents_with_image",
- "denoise",
- ]
-
- @property
- def description(self):
- return (
- "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
- + "This is a sequential pipeline blocks:\n"
- + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n"
- + " - `ZImageAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n"
- + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n"
- + " - `ZImageSetTimestepsStep` is used to set the timesteps\n"
- + " - `ZImageSetTimestepsWithStrengthStep` is used to set the timesteps with strength\n"
- + " - `ZImagePrepareLatentswithImageStep` is used to prepare the latents with image\n"
- + " - `ZImageDenoiseStep` is used to denoise the latents\n"
- )
-
-
-## auto blocks
-class ZImageAutoDenoiseStep(AutoPipelineBlocks):
- block_classes = [
- ZImageImage2ImageCoreDenoiseStep,
- ZImageCoreDenoiseStep,
- ]
- block_names = ["image2image", "text2image"]
- block_trigger_inputs = ["image_latents", None]
-
- @property
- def description(self) -> str:
- return (
- "Denoise step that iteratively denoise the latents. "
- "This is a auto pipeline block that works for text2image and image2image tasks."
- " - `ZImageCoreDenoiseStep` (text2image) for text2image tasks."
- " - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks."
- + " - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.\n"
- + " - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.\n"
- )
-
-
-class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
- block_classes = [ZImageVaeImageEncoderStep]
- block_names = ["vae_encoder"]
- block_trigger_inputs = ["image"]
-
- @property
- def description(self) -> str:
- return "Vae Image Encoder step that encode the image to generate the image latents"
- +"This is an auto pipeline block that works for image2image tasks."
- +" - `ZImageVaeImageEncoderStep` is used when `image` is provided."
- +" - if `image` is not provided, step will be skipped."
-
-
-class ZImageAutoBlocks(SequentialPipelineBlocks):
- block_classes = [
- ZImageTextEncoderStep,
- ZImageAutoVaeImageEncoderStep,
- ZImageAutoDenoiseStep,
- ZImageVaeDecoderStep,
- ]
- block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
-
- @property
- def description(self) -> str:
- return "Auto Modular pipeline for text-to-image and image-to-image using ZImage.\n"
- +" - for text-to-image generation, all you need to provide is `prompt`\n"
- +" - for image-to-image generation, you need to provide `image`\n"
- +" - if `image` is not provided, step will be skipped."
-
-
-# presets
-TEXT2IMAGE_BLOCKS = InsertableDict(
- [
- ("text_encoder", ZImageTextEncoderStep),
- ("input", ZImageTextInputStep),
- ("prepare_latents", ZImagePrepareLatentsStep),
- ("set_timesteps", ZImageSetTimestepsStep),
- ("denoise", ZImageDenoiseStep),
- ("decode", ZImageVaeDecoderStep),
- ]
-)
-
-IMAGE2IMAGE_BLOCKS = InsertableDict(
- [
- ("text_encoder", ZImageTextEncoderStep),
- ("vae_encoder", ZImageVaeImageEncoderStep),
- ("input", ZImageTextInputStep),
- ("additional_inputs", ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"])),
- ("prepare_latents", ZImagePrepareLatentsStep),
- ("set_timesteps", ZImageSetTimestepsStep),
- ("set_timesteps_with_strength", ZImageSetTimestepsWithStrengthStep),
- ("prepare_latents_with_image", ZImagePrepareLatentswithImageStep),
- ("denoise", ZImageDenoiseStep),
- ("decode", ZImageVaeDecoderStep),
- ]
-)
-
-
-AUTO_BLOCKS = InsertableDict(
- [
- ("text_encoder", ZImageTextEncoderStep),
- ("vae_encoder", ZImageAutoVaeImageEncoderStep),
- ("denoise", ZImageAutoDenoiseStep),
- ("decode", ZImageVaeDecoderStep),
- ]
-)
-
-ALL_BLOCKS = {
- "text2image": TEXT2IMAGE_BLOCKS,
- "image2image": IMAGE2IMAGE_BLOCKS,
- "auto": AUTO_BLOCKS,
-}
diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py
new file mode 100644
index 000000000000..23e20d55fb1e
--- /dev/null
+++ b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py
@@ -0,0 +1,334 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils import logging
+from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
+from ..modular_pipeline_utils import OutputParam
+from .before_denoise import (
+ ZImageAdditionalInputsStep,
+ ZImagePrepareLatentsStep,
+ ZImagePrepareLatentswithImageStep,
+ ZImageSetTimestepsStep,
+ ZImageSetTimestepsWithStrengthStep,
+ ZImageTextInputStep,
+)
+from .decoders import ZImageVaeDecoderStep
+from .denoise import (
+ ZImageDenoiseStep,
+)
+from .encoders import (
+ ZImageTextEncoderStep,
+ ZImageVaeImageEncoderStep,
+)
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+
+# ====================
+# 1. DENOISE
+# ====================
+
+
+# text2image: inputs(text) -> set_timesteps -> prepare_latents -> denoise
+# auto_docstring
+class ZImageCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ denoise block that takes encoded conditions and runs the denoising process.
+
+ Components:
+ transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+ (`ClassifierFreeGuidance`)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`list`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`list`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ height (`int`, *optional*):
+ TODO: Add description.
+ width (`int`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 9):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ **denoiser_input_fields (`None`, *optional*):
+ The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
+ block_classes = [
+ ZImageTextInputStep,
+ ZImagePrepareLatentsStep,
+ ZImageSetTimestepsStep,
+ ZImageDenoiseStep,
+ ]
+ block_names = ["input", "prepare_latents", "set_timesteps", "denoise"]
+
+ @property
+ def description(self):
+ return "denoise block that takes encoded conditions and runs the denoising process."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("latents")]
+
+
+# image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise
+# auto_docstring
+class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ denoise block that takes encoded text and image latent conditions and runs the denoising process.
+
+ Components:
+ transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+ (`ClassifierFreeGuidance`)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`list`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`list`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`, *optional*, defaults to 9):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ strength (`None`, *optional*, defaults to 0.6):
+ TODO: Add description.
+ **denoiser_input_fields (`None`, *optional*):
+ The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
+ block_classes = [
+ ZImageTextInputStep,
+ ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+ ZImagePrepareLatentsStep,
+ ZImageSetTimestepsStep,
+ ZImageSetTimestepsWithStrengthStep,
+ ZImagePrepareLatentswithImageStep,
+ ZImageDenoiseStep,
+ ]
+ block_names = [
+ "input",
+ "additional_inputs",
+ "prepare_latents",
+ "set_timesteps",
+ "set_timesteps_with_strength",
+ "prepare_latents_with_image",
+ "denoise",
+ ]
+
+ @property
+ def description(self):
+ return "denoise block that takes encoded text and image latent conditions and runs the denoising process."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("latents")]
+
+
+# auto_docstring
+class ZImageAutoDenoiseStep(AutoPipelineBlocks):
+ """
+    Denoise step that iteratively denoises the latents. This is an auto pipeline block that works for text2image and
+    image2image tasks.
+     - `ZImageCoreDenoiseStep` (text2image) is used for text2image tasks.
+     - `ZImageImage2ImageCoreDenoiseStep` (image2image) is used for image2image tasks.
+     - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.
+     - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.
+
+ Components:
+ transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+ (`ClassifierFreeGuidance`)
+
+ Inputs:
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ prompt_embeds (`list`):
+ Pre-generated text embeddings. Can be generated from text_encoder step.
+ negative_prompt_embeds (`list`, *optional*):
+ Pre-generated negative text embeddings. Can be generated from text_encoder step.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_inference_steps (`None`):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ strength (`None`, *optional*, defaults to 0.6):
+ TODO: Add description.
+ **denoiser_input_fields (`None`, *optional*):
+ The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+ Outputs:
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
+ block_classes = [
+ ZImageImage2ImageCoreDenoiseStep,
+ ZImageCoreDenoiseStep,
+ ]
+ block_names = ["image2image", "text2image"]
+ block_trigger_inputs = ["image_latents", None]
+
+ @property
+ def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoises the latents. "
+            "This is an auto pipeline block that works for text2image and image2image tasks.\n"
+            " - `ZImageCoreDenoiseStep` (text2image) is used for text2image tasks.\n"
+            " - `ZImageImage2ImageCoreDenoiseStep` (image2image) is used for image2image tasks.\n"
+            " - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.\n"
+            " - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.\n"
+        )
+
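+# Routing note (a hedged sketch of how the auto denoise step dispatches): `AutoPipelineBlocks`
+# picks the first sub-block whose trigger input is present in the pipeline state, so with
+# `block_trigger_inputs = ["image_latents", None]` the image2image denoise path runs whenever
+# `image_latents` is available and the text2image path (the `None` fallback) runs otherwise.
+# The call pattern and the `output="images"` shorthand below are assumptions borrowed from
+# other modular pipelines, not guarantees of this file:
+#
+#     pipe(prompt="a cat", output="images")                    # -> ZImageCoreDenoiseStep
+#     pipe(prompt="a cat", image=init_image, output="images")  # vae_encoder adds `image_latents`
+#                                                              #    -> ZImageImage2ImageCoreDenoiseStep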
+
+# auto_docstring
+class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
+ """
+    Vae Image Encoder step that encodes the image to generate the image latents. This is an auto pipeline block
+    that works for image2image tasks.
+     - `ZImageVaeImageEncoderStep` is used when `image` is provided.
+     - if `image` is not provided, this step will be skipped.
+
+ Components:
+ vae (`AutoencoderKL`) image_processor (`VaeImageProcessor`)
+
+ Inputs:
+ image (`Image`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+
+ Outputs:
+ image_latents (`Tensor`):
+            Latent representation of the input image.
+ """
+
+ block_classes = [ZImageVaeImageEncoderStep]
+ block_names = ["vae_encoder"]
+ block_trigger_inputs = ["image"]
+
+ @property
+ def description(self) -> str:
+        return (
+            "Vae Image Encoder step that encodes the image to generate the image latents. "
+            "This is an auto pipeline block that works for image2image tasks.\n"
+            " - `ZImageVaeImageEncoderStep` is used when `image` is provided.\n"
+            " - if `image` is not provided, this step will be skipped.\n"
+        )
+
+
+# auto_docstring
+class ZImageAutoBlocks(SequentialPipelineBlocks):
+ """
+ Auto Modular pipeline for text-to-image and image-to-image using ZImage.
+
+ Supported workflows:
+ - `text2image`: requires `prompt`
+ - `image2image`: requires `image`, `prompt`
+
+ Components:
+ text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) guider (`ClassifierFreeGuidance`) vae
+ (`AutoencoderKL`) image_processor (`VaeImageProcessor`) transformer (`ZImageTransformer2DModel`) scheduler
+ (`FlowMatchEulerDiscreteScheduler`)
+
+ Inputs:
+ prompt (`None`, *optional*):
+ TODO: Add description.
+ negative_prompt (`None`, *optional*):
+ TODO: Add description.
+ max_sequence_length (`None`, *optional*, defaults to 512):
+ TODO: Add description.
+ image (`Image`, *optional*):
+ TODO: Add description.
+ height (`None`, *optional*):
+ TODO: Add description.
+ width (`None`, *optional*):
+ TODO: Add description.
+ generator (`None`, *optional*):
+ TODO: Add description.
+ num_images_per_prompt (`None`, *optional*, defaults to 1):
+ TODO: Add description.
+ image_latents (`None`, *optional*):
+ TODO: Add description.
+ latents (`Tensor | NoneType`):
+ TODO: Add description.
+ num_inference_steps (`None`):
+ TODO: Add description.
+ sigmas (`None`, *optional*):
+ TODO: Add description.
+ strength (`None`, *optional*, defaults to 0.6):
+ TODO: Add description.
+ **denoiser_input_fields (`None`, *optional*):
+ The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+ output_type (`str`, *optional*, defaults to pil):
+ The type of the output images, can be 'pil', 'np', 'pt'
+
+ Outputs:
+ images (`list`):
+ Generated images.
+ """
+
+ block_classes = [
+ ZImageTextEncoderStep,
+ ZImageAutoVaeImageEncoderStep,
+ ZImageAutoDenoiseStep,
+ ZImageVaeDecoderStep,
+ ]
+ block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
+ _workflow_map = {
+ "text2image": {"prompt": True},
+ "image2image": {"image": True, "prompt": True},
+ }
+
+ @property
+ def description(self) -> str:
+ return "Auto Modular pipeline for text-to-image and image-to-image using ZImage."
+
+ @property
+ def outputs(self):
+ return [OutputParam.template("images")]
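+
+
+# A minimal end-to-end usage sketch (hedged): `init_pipeline`/`load_components` follow the
+# generic ModularPipelineBlocks API, and the repo id below is a placeholder for whichever
+# checkpoint actually hosts the Z-Image modular components.
+#
+#     import torch
+#     from diffusers.modular_pipelines import ZImageAutoBlocks
+#
+#     blocks = ZImageAutoBlocks()
+#     pipe = blocks.init_pipeline("<z-image-modular-repo>")
+#     pipe.load_components(torch_dtype=torch.bfloat16)
+#
+#     # text2image: only `prompt` is required
+#     image = pipe(prompt="a cat wearing sunglasses", output="images")[0]
+#
+#     # image2image: providing `image` triggers the VAE encoder and the image2image denoise path
+#     edited = pipe(prompt="a cat wearing sunglasses", image=image, strength=0.6, output="images")[0]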
diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
index 854b5218c617..9a6b4b9b6fb4 100644
--- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
+++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
@@ -33,6 +33,19 @@
from ..test_modular_pipelines_common import ModularPipelineTesterMixin
+FLUX_TEXT2IMAGE_WORKFLOWS = {
+ "text2image": [
+ ("text_encoder", "FluxTextEncoderStep"),
+ ("denoise.input", "FluxTextInputStep"),
+ ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"),
+ ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"),
+ ("denoise.denoise", "FluxDenoiseStep"),
+ ("decode", "FluxDecodeStep"),
+ ]
+}
+
+
class TestFluxModularPipelineFast(ModularPipelineTesterMixin):
pipeline_class = FluxModularPipeline
pipeline_blocks_class = FluxAutoBlocks
@@ -40,6 +53,7 @@ class TestFluxModularPipelineFast(ModularPipelineTesterMixin):
params = frozenset(["prompt", "height", "width", "guidance_scale"])
batch_params = frozenset(["prompt"])
+ expected_workflow_blocks = FLUX_TEXT2IMAGE_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
@@ -59,6 +73,23 @@ def test_float16_inference(self):
super().test_float16_inference(9e-2)
+FLUX_IMAGE2IMAGE_WORKFLOWS = {
+ "image2image": [
+ ("text_encoder", "FluxTextEncoderStep"),
+ ("vae_encoder.preprocess", "FluxProcessImagesInputStep"),
+ ("vae_encoder.encode", "FluxVaeEncoderStep"),
+ ("denoise.input.text_inputs", "FluxTextInputStep"),
+ ("denoise.input.additional_inputs", "FluxAdditionalInputsStep"),
+ ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"),
+ ("denoise.before_denoise.set_timesteps", "FluxImg2ImgSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_img2img_latents", "FluxImg2ImgPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"),
+ ("denoise.denoise", "FluxDenoiseStep"),
+ ("decode", "FluxDecodeStep"),
+ ]
+}
+
+
class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin):
pipeline_class = FluxModularPipeline
pipeline_blocks_class = FluxAutoBlocks
@@ -66,6 +97,7 @@ class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin):
params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
batch_params = frozenset(["prompt", "image"])
+ expected_workflow_blocks = FLUX_IMAGE2IMAGE_WORKFLOWS
def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
pipeline = super().get_pipeline(components_manager, torch_dtype)
@@ -125,6 +157,32 @@ def test_float16_inference(self):
super().test_float16_inference(8e-2)
+FLUX_KONTEXT_WORKFLOWS = {
+ "text2image": [
+ ("text_encoder", "FluxTextEncoderStep"),
+ ("denoise.input", "FluxTextInputStep"),
+ ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"),
+ ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"),
+ ("denoise.denoise", "FluxKontextDenoiseStep"),
+ ("decode", "FluxDecodeStep"),
+ ],
+ "image_conditioned": [
+ ("text_encoder", "FluxTextEncoderStep"),
+ ("vae_encoder.preprocess", "FluxKontextProcessImagesInputStep"),
+ ("vae_encoder.encode", "FluxVaeEncoderStep"),
+ ("denoise.input.set_resolution", "FluxKontextSetResolutionStep"),
+ ("denoise.input.text_inputs", "FluxTextInputStep"),
+ ("denoise.input.additional_inputs", "FluxKontextAdditionalInputsStep"),
+ ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"),
+ ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_rope_inputs", "FluxKontextRoPEInputsStep"),
+ ("denoise.denoise", "FluxKontextDenoiseStep"),
+ ("decode", "FluxDecodeStep"),
+ ],
+}
+
+
class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin):
pipeline_class = FluxKontextModularPipeline
pipeline_blocks_class = FluxKontextAutoBlocks
@@ -132,6 +190,7 @@ class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin):
params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
batch_params = frozenset(["prompt", "image"])
+ expected_workflow_blocks = FLUX_KONTEXT_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
index 8fd529e97e71..3045af636841 100644
--- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
+++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
@@ -28,6 +28,21 @@
from ..test_modular_pipelines_common import ModularPipelineTesterMixin
+FLUX2_TEXT2IMAGE_WORKFLOWS = {
+ "text2image": [
+ ("text_encoder", "Flux2TextEncoderStep"),
+ ("denoise.input", "Flux2TextInputStep"),
+ ("denoise.prepare_latents", "Flux2PrepareLatentsStep"),
+ ("denoise.set_timesteps", "Flux2SetTimestepsStep"),
+ ("denoise.prepare_guidance", "Flux2PrepareGuidanceStep"),
+ ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"),
+ ("denoise.denoise", "Flux2DenoiseStep"),
+ ("denoise.after_denoise", "Flux2UnpackLatentsStep"),
+ ("decode", "Flux2DecodeStep"),
+ ],
+}
+
+
class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin):
pipeline_class = Flux2ModularPipeline
pipeline_blocks_class = Flux2AutoBlocks
@@ -35,6 +50,7 @@ class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin):
params = frozenset(["prompt", "height", "width", "guidance_scale"])
batch_params = frozenset(["prompt"])
+ expected_workflow_blocks = FLUX2_TEXT2IMAGE_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
@@ -56,6 +72,24 @@ def test_float16_inference(self):
super().test_float16_inference(9e-2)
+FLUX2_IMAGE_CONDITIONED_WORKFLOWS = {
+ "image_conditioned": [
+ ("text_encoder", "Flux2TextEncoderStep"),
+ ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"),
+ ("vae_encoder.encode", "Flux2VaeEncoderStep"),
+ ("denoise.input", "Flux2TextInputStep"),
+ ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"),
+ ("denoise.prepare_latents", "Flux2PrepareLatentsStep"),
+ ("denoise.set_timesteps", "Flux2SetTimestepsStep"),
+ ("denoise.prepare_guidance", "Flux2PrepareGuidanceStep"),
+ ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"),
+ ("denoise.denoise", "Flux2DenoiseStep"),
+ ("denoise.after_denoise", "Flux2UnpackLatentsStep"),
+ ("decode", "Flux2DecodeStep"),
+ ],
+}
+
+
class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin):
pipeline_class = Flux2ModularPipeline
pipeline_blocks_class = Flux2AutoBlocks
@@ -63,6 +97,7 @@ class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin):
params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
batch_params = frozenset(["prompt", "image"])
+ expected_workflow_blocks = FLUX2_IMAGE_CONDITIONED_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py
index 26653b20f8c4..ad295a961357 100644
--- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py
+++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py
@@ -28,13 +28,28 @@
from ..test_modular_pipelines_common import ModularPipelineTesterMixin
-class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin):
+FLUX2_KLEIN_WORKFLOWS = {
+ "text2image": [
+ ("text_encoder", "Flux2KleinTextEncoderStep"),
+ ("denoise.input", "Flux2TextInputStep"),
+ ("denoise.prepare_latents", "Flux2PrepareLatentsStep"),
+ ("denoise.set_timesteps", "Flux2SetTimestepsStep"),
+ ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"),
+ ("denoise.denoise", "Flux2KleinDenoiseStep"),
+ ("denoise.after_denoise", "Flux2UnpackLatentsStep"),
+ ("decode", "Flux2DecodeStep"),
+ ],
+}
+
+
+class TestFlux2KleinModularPipelineFast(ModularPipelineTesterMixin):
pipeline_class = Flux2KleinModularPipeline
pipeline_blocks_class = Flux2KleinAutoBlocks
pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular"
params = frozenset(["prompt", "height", "width"])
batch_params = frozenset(["prompt"])
+ expected_workflow_blocks = FLUX2_KLEIN_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
@@ -55,13 +70,31 @@ def test_float16_inference(self):
super().test_float16_inference(9e-2)
-class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin):
+FLUX2_KLEIN_IMAGE_CONDITIONED_WORKFLOWS = {
+ "image_conditioned": [
+ ("text_encoder", "Flux2KleinTextEncoderStep"),
+ ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"),
+ ("vae_encoder.encode", "Flux2VaeEncoderStep"),
+ ("denoise.input", "Flux2TextInputStep"),
+ ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"),
+ ("denoise.prepare_latents", "Flux2PrepareLatentsStep"),
+ ("denoise.set_timesteps", "Flux2SetTimestepsStep"),
+ ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"),
+ ("denoise.denoise", "Flux2KleinDenoiseStep"),
+ ("denoise.after_denoise", "Flux2UnpackLatentsStep"),
+ ("decode", "Flux2DecodeStep"),
+ ],
+}
+
+
+class TestFlux2KleinImageConditionedModularPipelineFast(ModularPipelineTesterMixin):
pipeline_class = Flux2KleinModularPipeline
pipeline_blocks_class = Flux2KleinAutoBlocks
pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular"
params = frozenset(["prompt", "height", "width", "image"])
batch_params = frozenset(["prompt", "image"])
+ expected_workflow_blocks = FLUX2_KLEIN_IMAGE_CONDITIONED_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py
index 701dd0fed896..b3aa79040317 100644
--- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py
+++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py
@@ -21,20 +21,35 @@
from diffusers.modular_pipelines import (
Flux2KleinBaseAutoBlocks,
- Flux2KleinModularPipeline,
+ Flux2KleinBaseModularPipeline,
)
from ...testing_utils import floats_tensor, torch_device
from ..test_modular_pipelines_common import ModularPipelineTesterMixin
-class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin):
- pipeline_class = Flux2KleinModularPipeline
+FLUX2_KLEIN_BASE_WORKFLOWS = {
+ "text2image": [
+ ("text_encoder", "Flux2KleinBaseTextEncoderStep"),
+ ("denoise.input", "Flux2KleinBaseTextInputStep"),
+ ("denoise.prepare_latents", "Flux2PrepareLatentsStep"),
+ ("denoise.set_timesteps", "Flux2SetTimestepsStep"),
+ ("denoise.prepare_rope_inputs", "Flux2KleinBaseRoPEInputsStep"),
+ ("denoise.denoise", "Flux2KleinBaseDenoiseStep"),
+ ("denoise.after_denoise", "Flux2UnpackLatentsStep"),
+ ("decode", "Flux2DecodeStep"),
+ ],
+}
+
+
+class TestFlux2KleinBaseModularPipelineFast(ModularPipelineTesterMixin):
+ pipeline_class = Flux2KleinBaseModularPipeline
pipeline_blocks_class = Flux2KleinBaseAutoBlocks
pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular"
params = frozenset(["prompt", "height", "width"])
batch_params = frozenset(["prompt"])
+ expected_workflow_blocks = FLUX2_KLEIN_BASE_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
@@ -55,13 +70,31 @@ def test_float16_inference(self):
super().test_float16_inference(9e-2)
-class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin):
- pipeline_class = Flux2KleinModularPipeline
+FLUX2_KLEIN_BASE_IMAGE_CONDITIONED_WORKFLOWS = {
+ "image_conditioned": [
+ ("text_encoder", "Flux2KleinBaseTextEncoderStep"),
+ ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"),
+ ("vae_encoder.encode", "Flux2VaeEncoderStep"),
+ ("denoise.input", "Flux2KleinBaseTextInputStep"),
+ ("denoise.prepare_latents", "Flux2PrepareLatentsStep"),
+ ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"),
+ ("denoise.set_timesteps", "Flux2SetTimestepsStep"),
+ ("denoise.prepare_rope_inputs", "Flux2KleinBaseRoPEInputsStep"),
+ ("denoise.denoise", "Flux2KleinBaseDenoiseStep"),
+ ("denoise.after_denoise", "Flux2UnpackLatentsStep"),
+ ("decode", "Flux2DecodeStep"),
+ ],
+}
+
+
+class TestFlux2KleinBaseImageConditionedModularPipelineFast(ModularPipelineTesterMixin):
+ pipeline_class = Flux2KleinBaseModularPipeline
pipeline_blocks_class = Flux2KleinBaseAutoBlocks
pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular"
params = frozenset(["prompt", "height", "width", "image"])
batch_params = frozenset(["prompt", "image"])
+ expected_workflow_blocks = FLUX2_KLEIN_BASE_IMAGE_CONDITIONED_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
diff --git a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py
index f4bd27b7ea47..92573c202e49 100644
--- a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py
+++ b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py
@@ -30,6 +30,103 @@
from ..test_modular_pipelines_common import ModularGuiderTesterMixin, ModularPipelineTesterMixin
+QWEN_IMAGE_TEXT2IMAGE_WORKFLOWS = {
+ "text2image": [
+ ("text_encoder", "QwenImageTextEncoderStep"),
+ ("denoise.input", "QwenImageTextInputsStep"),
+ ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "QwenImageSetTimestepsStep"),
+ ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"),
+ ("denoise.denoise", "QwenImageDenoiseStep"),
+ ("denoise.after_denoise", "QwenImageAfterDenoiseStep"),
+ ("decode.decode", "QwenImageDecoderStep"),
+ ("decode.postprocess", "QwenImageProcessImagesOutputStep"),
+ ],
+ "image2image": [
+ ("text_encoder", "QwenImageTextEncoderStep"),
+ ("vae_encoder.preprocess", "QwenImageProcessImagesInputStep"),
+ ("vae_encoder.encode", "QwenImageVaeEncoderStep"),
+ ("denoise.input.text_inputs", "QwenImageTextInputsStep"),
+ ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"),
+ ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"),
+ ("denoise.prepare_img2img_latents", "QwenImagePrepareLatentsWithStrengthStep"),
+ ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"),
+ ("denoise.denoise", "QwenImageDenoiseStep"),
+ ("denoise.after_denoise", "QwenImageAfterDenoiseStep"),
+ ("decode.decode", "QwenImageDecoderStep"),
+ ("decode.postprocess", "QwenImageProcessImagesOutputStep"),
+ ],
+ "inpainting": [
+ ("text_encoder", "QwenImageTextEncoderStep"),
+ ("vae_encoder.preprocess", "QwenImageInpaintProcessImagesInputStep"),
+ ("vae_encoder.encode", "QwenImageVaeEncoderStep"),
+ ("denoise.input.text_inputs", "QwenImageTextInputsStep"),
+ ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"),
+ ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"),
+ ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"),
+ ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"),
+ ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"),
+ ("denoise.denoise", "QwenImageInpaintDenoiseStep"),
+ ("denoise.after_denoise", "QwenImageAfterDenoiseStep"),
+ ("decode.decode", "QwenImageDecoderStep"),
+ ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"),
+ ],
+ "controlnet_text2image": [
+ ("text_encoder", "QwenImageTextEncoderStep"),
+ ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"),
+ ("denoise.input", "QwenImageTextInputsStep"),
+ ("denoise.controlnet_input", "QwenImageControlNetInputsStep"),
+ ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "QwenImageSetTimestepsStep"),
+ ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"),
+ ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"),
+ ("denoise.controlnet_denoise", "QwenImageControlNetDenoiseStep"),
+ ("denoise.after_denoise", "QwenImageAfterDenoiseStep"),
+ ("decode.decode", "QwenImageDecoderStep"),
+ ("decode.postprocess", "QwenImageProcessImagesOutputStep"),
+ ],
+ "controlnet_image2image": [
+ ("text_encoder", "QwenImageTextEncoderStep"),
+ ("vae_encoder.preprocess", "QwenImageProcessImagesInputStep"),
+ ("vae_encoder.encode", "QwenImageVaeEncoderStep"),
+ ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"),
+ ("denoise.input.text_inputs", "QwenImageTextInputsStep"),
+ ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"),
+ ("denoise.controlnet_input", "QwenImageControlNetInputsStep"),
+ ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"),
+ ("denoise.prepare_img2img_latents", "QwenImagePrepareLatentsWithStrengthStep"),
+ ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"),
+ ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"),
+ ("denoise.controlnet_denoise", "QwenImageControlNetDenoiseStep"),
+ ("denoise.after_denoise", "QwenImageAfterDenoiseStep"),
+ ("decode.decode", "QwenImageDecoderStep"),
+ ("decode.postprocess", "QwenImageProcessImagesOutputStep"),
+ ],
+ "controlnet_inpainting": [
+ ("text_encoder", "QwenImageTextEncoderStep"),
+ ("vae_encoder.preprocess", "QwenImageInpaintProcessImagesInputStep"),
+ ("vae_encoder.encode", "QwenImageVaeEncoderStep"),
+ ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"),
+ ("denoise.input.text_inputs", "QwenImageTextInputsStep"),
+ ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"),
+ ("denoise.controlnet_input", "QwenImageControlNetInputsStep"),
+ ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"),
+ ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"),
+ ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"),
+ ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"),
+ ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"),
+ ("denoise.controlnet_denoise", "QwenImageInpaintControlNetDenoiseStep"),
+ ("denoise.after_denoise", "QwenImageAfterDenoiseStep"),
+ ("decode.decode", "QwenImageDecoderStep"),
+ ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"),
+ ],
+}
+
+
class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
pipeline_class = QwenImageModularPipeline
pipeline_blocks_class = QwenImageAutoBlocks
@@ -37,6 +134,7 @@ class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuider
params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"])
batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
+ expected_workflow_blocks = QWEN_IMAGE_TEXT2IMAGE_WORKFLOWS
def get_dummy_inputs(self):
generator = self.get_generator()
@@ -56,6 +154,44 @@ def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=5e-4)
+QWEN_IMAGE_EDIT_WORKFLOWS = {
+ "image_conditioned": [
+ ("text_encoder.resize", "QwenImageEditResizeStep"),
+ ("text_encoder.encode", "QwenImageEditTextEncoderStep"),
+ ("vae_encoder.resize", "QwenImageEditResizeStep"),
+ ("vae_encoder.preprocess", "QwenImageEditProcessImagesInputStep"),
+ ("vae_encoder.encode", "QwenImageVaeEncoderStep"),
+ ("denoise.input.text_inputs", "QwenImageTextInputsStep"),
+ ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"),
+ ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "QwenImageSetTimestepsStep"),
+ ("denoise.prepare_rope_inputs", "QwenImageEditRoPEInputsStep"),
+ ("denoise.denoise", "QwenImageEditDenoiseStep"),
+ ("denoise.after_denoise", "QwenImageAfterDenoiseStep"),
+ ("decode.decode", "QwenImageDecoderStep"),
+ ("decode.postprocess", "QwenImageProcessImagesOutputStep"),
+ ],
+ "image_conditioned_inpainting": [
+ ("text_encoder.resize", "QwenImageEditResizeStep"),
+ ("text_encoder.encode", "QwenImageEditTextEncoderStep"),
+ ("vae_encoder.resize", "QwenImageEditResizeStep"),
+ ("vae_encoder.preprocess", "QwenImageEditInpaintProcessImagesInputStep"),
+ ("vae_encoder.encode", "QwenImageVaeEncoderStep"),
+ ("denoise.input.text_inputs", "QwenImageTextInputsStep"),
+ ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"),
+ ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"),
+ ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"),
+ ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"),
+ ("denoise.prepare_rope_inputs", "QwenImageEditRoPEInputsStep"),
+ ("denoise.denoise", "QwenImageEditInpaintDenoiseStep"),
+ ("denoise.after_denoise", "QwenImageAfterDenoiseStep"),
+ ("decode.decode", "QwenImageDecoderStep"),
+ ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"),
+ ],
+}
+
+
class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
pipeline_class = QwenImageEditModularPipeline
pipeline_blocks_class = QwenImageEditAutoBlocks
@@ -63,6 +199,7 @@ class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGu
params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"])
batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
+ expected_workflow_blocks = QWEN_IMAGE_EDIT_WORKFLOWS
def get_dummy_inputs(self):
generator = self.get_generator()
diff --git a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py
index 7b55933e4caf..f640f0ec83f2 100644
--- a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py
+++ b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py
@@ -267,6 +267,60 @@ def test_controlnet_cfg(self):
assert max_diff > 1e-2, "Output with CFG must be different from normal inference"
+TEXT2IMAGE_WORKFLOWS = {
+ "text2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"),
+ ("denoise.denoise", "StableDiffusionXLDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+ "controlnet_text2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"),
+ ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"),
+ ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+ "controlnet_union_text2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"),
+ ("denoise.controlnet_input", "StableDiffusionXLControlNetUnionInputStep"),
+ ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+ "ip_adapter_text2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("ip_adapter", "StableDiffusionXLIPAdapterStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"),
+ ("denoise.denoise", "StableDiffusionXLDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+ "ip_adapter_controlnet_text2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("ip_adapter", "StableDiffusionXLIPAdapterStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"),
+ ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"),
+ ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+}
+
+
class TestSDXLModularPipelineFast(
SDXLModularTesterMixin,
SDXLModularIPAdapterTesterMixin,
@@ -291,6 +345,8 @@ class TestSDXLModularPipelineFast(
batch_params = frozenset(["prompt", "negative_prompt"])
expected_image_output_shape = (1, 3, 64, 64)
+ expected_workflow_blocks = TEXT2IMAGE_WORKFLOWS
+
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
inputs = {
@@ -314,6 +370,65 @@ def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+IMAGE2IMAGE_WORKFLOWS = {
+ "image2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("vae_encoder", "StableDiffusionXLVaeEncoderStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("denoise.denoise", "StableDiffusionXLDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+ "controlnet_image2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("vae_encoder", "StableDiffusionXLVaeEncoderStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"),
+ ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+ "controlnet_union_image2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("vae_encoder", "StableDiffusionXLVaeEncoderStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("denoise.controlnet_input", "StableDiffusionXLControlNetUnionInputStep"),
+ ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+ "ip_adapter_image2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("ip_adapter", "StableDiffusionXLIPAdapterStep"),
+ ("vae_encoder", "StableDiffusionXLVaeEncoderStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("denoise.denoise", "StableDiffusionXLDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+ "ip_adapter_controlnet_image2image": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("ip_adapter", "StableDiffusionXLIPAdapterStep"),
+ ("vae_encoder", "StableDiffusionXLVaeEncoderStep"),
+ ("denoise.input", "StableDiffusionXLInputStep"),
+ ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"),
+ ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"),
+ ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"),
+ ("decode", "StableDiffusionXLDecodeStep"),
+ ],
+}
+
+
class TestSDXLImg2ImgModularPipelineFast(
SDXLModularTesterMixin,
SDXLModularIPAdapterTesterMixin,
@@ -338,6 +453,7 @@ class TestSDXLImg2ImgModularPipelineFast(
)
batch_params = frozenset(["prompt", "negative_prompt", "image"])
expected_image_output_shape = (1, 3, 64, 64)
+ expected_workflow_blocks = IMAGE2IMAGE_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)
@@ -367,6 +483,65 @@ def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
+INPAINTING_WORKFLOWS = {
+ "inpainting": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"),
+ ("input", "StableDiffusionXLInputStep"),
+ ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"),
+ ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("denoise", "StableDiffusionXLInpaintDenoiseStep"),
+ ("decode", "StableDiffusionXLInpaintDecodeStep"),
+ ],
+ "controlnet_inpainting": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"),
+ ("input", "StableDiffusionXLInputStep"),
+ ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"),
+ ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("controlnet_input", "StableDiffusionXLControlNetInputStep"),
+ ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"),
+ ("decode", "StableDiffusionXLInpaintDecodeStep"),
+ ],
+ "controlnet_union_inpainting": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"),
+ ("input", "StableDiffusionXLInputStep"),
+ ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"),
+ ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("controlnet_input", "StableDiffusionXLControlNetUnionInputStep"),
+ ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"),
+ ("decode", "StableDiffusionXLInpaintDecodeStep"),
+ ],
+ "ip_adapter_inpainting": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("ip_adapter", "StableDiffusionXLIPAdapterStep"),
+ ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"),
+ ("input", "StableDiffusionXLInputStep"),
+ ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"),
+ ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("denoise", "StableDiffusionXLInpaintDenoiseStep"),
+ ("decode", "StableDiffusionXLInpaintDecodeStep"),
+ ],
+ "ip_adapter_controlnet_inpainting": [
+ ("text_encoder", "StableDiffusionXLTextEncoderStep"),
+ ("ip_adapter", "StableDiffusionXLIPAdapterStep"),
+ ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"),
+ ("input", "StableDiffusionXLInputStep"),
+ ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"),
+ ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"),
+ ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"),
+ ("controlnet_input", "StableDiffusionXLControlNetInputStep"),
+ ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"),
+ ("decode", "StableDiffusionXLInpaintDecodeStep"),
+ ],
+}
+
+
class SDXLInpaintingModularPipelineFastTests(
SDXLModularTesterMixin,
SDXLModularIPAdapterTesterMixin,
@@ -392,6 +567,7 @@ class SDXLInpaintingModularPipelineFastTests(
)
batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
expected_image_output_shape = (1, 3, 64, 64)
+ expected_workflow_blocks = INPAINTING_WORKFLOWS
def get_dummy_inputs(self, device, seed=0):
generator = self.get_generator(seed)
diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py
index 937a9ccec880..e97b543ff85d 100644
--- a/tests/modular_pipelines/test_modular_pipelines_common.py
+++ b/tests/modular_pipelines/test_modular_pipelines_common.py
@@ -100,6 +100,14 @@ def batch_params(self) -> frozenset:
"See existing pipeline tests for reference."
)
+ @property
+ def expected_workflow_blocks(self) -> dict:
+        raise NotImplementedError(
+            "You need to set the attribute `expected_workflow_blocks` in the child test class. "
+            "`expected_workflow_blocks` is a dictionary that maps workflow names to lists of "
+            "(block_name, block_class_name) tuples. "
+            "See existing pipeline tests for reference."
+        )
+
def setup_method(self):
# clean up the VRAM before each test
torch.compiler.reset()
@@ -341,6 +349,34 @@ def test_save_from_pretrained(self):
assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
+ def test_workflow_map(self):
+ blocks = self.pipeline_blocks_class()
+ if blocks._workflow_map is None:
+ pytest.skip("Skipping test as _workflow_map is not set")
+
+ assert hasattr(self, "expected_workflow_blocks") and self.expected_workflow_blocks, (
+ "expected_workflow_blocks must be defined in the test class"
+ )
+
+ for workflow_name, expected_blocks in self.expected_workflow_blocks.items():
+ workflow_blocks = blocks.get_workflow(workflow_name)
+ actual_blocks = list(workflow_blocks.sub_blocks.items())
+
+ # Check that the number of blocks matches
+ assert len(actual_blocks) == len(expected_blocks), (
+ f"Workflow '{workflow_name}' has {len(actual_blocks)} blocks, expected {len(expected_blocks)}"
+ )
+
+ # Check that each block name and type matches
+ for i, ((actual_name, actual_block), (expected_name, expected_class_name)) in enumerate(
+ zip(actual_blocks, expected_blocks)
+ ):
+                assert actual_name == expected_name, (
+                    f"Workflow '{workflow_name}': block {i} is named '{actual_name}', expected '{expected_name}'"
+                )
+ assert actual_block.__class__.__name__ == expected_class_name, (
+ f"Workflow '{workflow_name}': block '{actual_name}' has type "
+ f"{actual_block.__class__.__name__}, expected {expected_class_name}"
+ )
+
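+    # Shape of `expected_workflow_blocks` consumed by `test_workflow_map`, as a hypothetical
+    # illustration (the names below are placeholders; real values live in the per-pipeline
+    # test modules):
+    #
+    #     expected_workflow_blocks = {
+    #         "text2image": [
+    #             ("text_encoder", "MyTextEncoderStep"),
+    #             ("denoise.denoise", "MyDenoiseStep"),
+    #             ("decode", "MyDecodeStep"),
+    #         ],
+    #     }
+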
class ModularGuiderTesterMixin:
def test_guider_cfg(self, expected_max_diff=1e-2):
diff --git a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py
index 29da18fce61b..ab45def3ef30 100644
--- a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py
+++ b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py
@@ -19,6 +19,30 @@
from ..test_modular_pipelines_common import ModularPipelineTesterMixin
+ZIMAGE_WORKFLOWS = {
+ "text2image": [
+ ("text_encoder", "ZImageTextEncoderStep"),
+ ("denoise.input", "ZImageTextInputStep"),
+ ("denoise.prepare_latents", "ZImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "ZImageSetTimestepsStep"),
+ ("denoise.denoise", "ZImageDenoiseStep"),
+ ("decode", "ZImageVaeDecoderStep"),
+ ],
+ "image2image": [
+ ("text_encoder", "ZImageTextEncoderStep"),
+ ("vae_encoder", "ZImageVaeImageEncoderStep"),
+ ("denoise.input", "ZImageTextInputStep"),
+ ("denoise.additional_inputs", "ZImageAdditionalInputsStep"),
+ ("denoise.prepare_latents", "ZImagePrepareLatentsStep"),
+ ("denoise.set_timesteps", "ZImageSetTimestepsStep"),
+ ("denoise.set_timesteps_with_strength", "ZImageSetTimestepsWithStrengthStep"),
+ ("denoise.prepare_latents_with_image", "ZImagePrepareLatentswithImageStep"),
+ ("denoise.denoise", "ZImageDenoiseStep"),
+ ("decode", "ZImageVaeDecoderStep"),
+ ],
+}
+
+
class TestZImageModularPipelineFast(ModularPipelineTesterMixin):
pipeline_class = ZImageModularPipeline
pipeline_blocks_class = ZImageAutoBlocks
@@ -26,6 +50,7 @@ class TestZImageModularPipelineFast(ModularPipelineTesterMixin):
params = frozenset(["prompt", "height", "width"])
batch_params = frozenset(["prompt"])
+ expected_workflow_blocks = ZIMAGE_WORKFLOWS
def get_dummy_inputs(self, seed=0):
generator = self.get_generator(seed)