diff --git a/docs/source/en/modular_diffusers/guiders.md b/docs/source/en/modular_diffusers/guiders.md index 6abe4fad2736..ffe039f41556 100644 --- a/docs/source/en/modular_diffusers/guiders.md +++ b/docs/source/en/modular_diffusers/guiders.md @@ -89,10 +89,8 @@ t2i_pipeline.guider ## Changing guider parameters -The guider parameters can be adjusted with either the [`~ComponentSpec.create`] method or with [`~ModularPipeline.update_components`]. The example below changes the `guidance_scale` value. +The guider parameters can be adjusted with the [`~ComponentSpec.create`] method and [`~ModularPipeline.update_components`]. The example below changes the `guidance_scale` value. - - ```py guider_spec = t2i_pipeline.get_component_spec("guider") @@ -100,18 +98,6 @@ guider = guider_spec.create(guidance_scale=10) t2i_pipeline.update_components(guider=guider) ``` - - - -```py -guider_spec = t2i_pipeline.get_component_spec("guider") -guider_spec.config["guidance_scale"] = 10 -t2i_pipeline.update_components(guider=guider_spec) -``` - - - - ## Uploading custom guiders Call the [`~utils.PushToHubMixin.push_to_hub`] method on a custom guider to share it to the Hub. diff --git a/docs/source/zh/modular_diffusers/guiders.md b/docs/source/zh/modular_diffusers/guiders.md index 50436f90c4a5..2315625a197a 100644 --- a/docs/source/zh/modular_diffusers/guiders.md +++ b/docs/source/zh/modular_diffusers/guiders.md @@ -86,10 +86,7 @@ t2i_pipeline.guider ## 更改引导器参数 -引导器参数可以通过 [`~ComponentSpec.create`] 方法或 [`~ModularPipeline.update_components`] 方法进行调整。下面的示例更改了 `guidance_scale` 值。 - - - +引导器参数可以通过 [`~ComponentSpec.create`] 方法以及 [`~ModularPipeline.update_components`] 方法进行调整。下面的示例更改了 `guidance_scale` 值。 ```py guider_spec = t2i_pipeline.get_component_spec("guider") @@ -97,18 +94,6 @@ guider = guider_spec.create(guidance_scale=10) t2i_pipeline.update_components(guider=guider) ``` - - - -```py -guider_spec = t2i_pipeline.get_component_spec("guider") -guider_spec.config["guidance_scale"] = 10 -t2i_pipeline.update_components(guider=guider_spec) -``` - - - - ## 上传自定义引导器 在自定义引导器上调用 [`~utils.PushToHubMixin.push_to_hub`] 方法,将其分享到 Hub。 diff --git a/src/diffusers/modular_pipelines/flux/__init__.py b/src/diffusers/modular_pipelines/flux/__init__.py index ec00986611c8..4754ed01ce6a 100644 --- a/src/diffusers/modular_pipelines/flux/__init__.py +++ b/src/diffusers/modular_pipelines/flux/__init__.py @@ -21,21 +21,8 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = ["FluxTextEncoderStep"] - _import_structure["modular_blocks"] = [ - "ALL_BLOCKS", - "AUTO_BLOCKS", - "AUTO_BLOCKS_KONTEXT", - "FLUX_KONTEXT_BLOCKS", - "TEXT2IMAGE_BLOCKS", - "FluxAutoBeforeDenoiseStep", - "FluxAutoBlocks", - "FluxAutoDecodeStep", - "FluxAutoDenoiseStep", - "FluxKontextAutoBlocks", - "FluxKontextAutoDenoiseStep", - "FluxKontextBeforeDenoiseStep", - ] + _import_structure["modular_blocks_flux"] = ["FluxAutoBlocks"] + _import_structure["modular_blocks_flux_kontext"] = ["FluxKontextAutoBlocks"] _import_structure["modular_pipeline"] = ["FluxKontextModularPipeline", "FluxModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -45,21 +32,8 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .encoders import FluxTextEncoderStep - from .modular_blocks import ( - ALL_BLOCKS, - AUTO_BLOCKS, - AUTO_BLOCKS_KONTEXT, - FLUX_KONTEXT_BLOCKS, - TEXT2IMAGE_BLOCKS, - FluxAutoBeforeDenoiseStep, - FluxAutoBlocks, - 
FluxAutoDecodeStep, - FluxAutoDenoiseStep, - FluxKontextAutoBlocks, - FluxKontextAutoDenoiseStep, - FluxKontextBeforeDenoiseStep, - ) + from .modular_blocks_flux import FluxAutoBlocks + from .modular_blocks_flux_kontext import FluxKontextAutoBlocks from .modular_pipeline import FluxKontextModularPipeline, FluxModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py index 4f94a17d88eb..583c139ff22e 100644 --- a/src/diffusers/modular_pipelines/flux/encoders.py +++ b/src/diffusers/modular_pipelines/flux/encoders.py @@ -205,7 +205,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState): return components, state -class FluxVaeEncoderDynamicStep(ModularPipelineBlocks): +class FluxVaeEncoderStep(ModularPipelineBlocks): model_name = "flux" def __init__( diff --git a/src/diffusers/modular_pipelines/flux/inputs.py b/src/diffusers/modular_pipelines/flux/inputs.py index dbf42e0c6df4..9d2f69dbe26f 100644 --- a/src/diffusers/modular_pipelines/flux/inputs.py +++ b/src/diffusers/modular_pipelines/flux/inputs.py @@ -121,7 +121,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip # Adapted from `QwenImageAdditionalInputsStep` -class FluxInputsDynamicStep(ModularPipelineBlocks): +class FluxAdditionalInputsStep(ModularPipelineBlocks): model_name = "flux" def __init__( @@ -243,7 +243,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip return components, state -class FluxKontextInputsDynamicStep(FluxInputsDynamicStep): +class FluxKontextAdditionalInputsStep(FluxAdditionalInputsStep): model_name = "flux-kontext" def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: @@ -256,7 +256,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip continue # 1. Calculate height/width from latents - # Unlike the `FluxInputsDynamicStep`, we don't overwrite the `block.height` and `block.width` + # Unlike the `FluxAdditionalInputsStep`, we don't overwrite the `block.height` and `block.width` height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor) if not hasattr(block_state, "image_height"): block_state.image_height = height @@ -303,6 +303,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip class FluxKontextSetResolutionStep(ModularPipelineBlocks): model_name = "flux-kontext" + @property def description(self): return ( "Determines the height and width to be used during the subsequent computations.\n" diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py deleted file mode 100644 index bd9b2d1b40c9..000000000000 --- a/src/diffusers/modular_pipelines/flux/modular_blocks.py +++ /dev/null @@ -1,446 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict -from .before_denoise import ( - FluxImg2ImgPrepareLatentsStep, - FluxImg2ImgSetTimestepsStep, - FluxKontextRoPEInputsStep, - FluxPrepareLatentsStep, - FluxRoPEInputsStep, - FluxSetTimestepsStep, -) -from .decoders import FluxDecodeStep -from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep -from .encoders import ( - FluxKontextProcessImagesInputStep, - FluxProcessImagesInputStep, - FluxTextEncoderStep, - FluxVaeEncoderDynamicStep, -) -from .inputs import ( - FluxInputsDynamicStep, - FluxKontextInputsDynamicStep, - FluxKontextSetResolutionStep, - FluxTextInputStep, -) - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# vae encoder (run before before_denoise) -FluxImg2ImgVaeEncoderBlocks = InsertableDict( - [("preprocess", FluxProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep())] -) - - -class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): - model_name = "flux" - - block_classes = FluxImg2ImgVaeEncoderBlocks.values() - block_names = FluxImg2ImgVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that preprocess andencode the image inputs into their latent representations." - - -class FluxAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [FluxImg2ImgVaeEncoderStep] - block_names = ["img2img"] - block_trigger_inputs = ["image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block that works for img2img tasks.\n" - + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided." - + " - if `image` is not provided, step will be skipped." - ) - - -# Flux Kontext vae encoder (run before before_denoise) - -FluxKontextVaeEncoderBlocks = InsertableDict( - [("preprocess", FluxKontextProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep(sample_mode="argmax"))] -) - - -class FluxKontextVaeEncoderStep(SequentialPipelineBlocks): - model_name = "flux-kontext" - - block_classes = FluxKontextVaeEncoderBlocks.values() - block_names = FluxKontextVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that preprocess andencode the image inputs into their latent representations." - - -class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [FluxKontextVaeEncoderStep] - block_names = ["img2img"] - block_trigger_inputs = ["image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations.\n" - + "This is an auto pipeline block that works for img2img tasks.\n" - + " - `FluxKontextVaeEncoderStep` (img2img) is used when only `image` is provided." - + " - if `image` is not provided, step will be skipped." - ) - - -# before_denoise: text2img -FluxBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ] -) - - -class FluxBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxBeforeDenoiseBlocks.values() - block_names = FluxBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation." 
- - -# before_denoise: img2img -FluxImg2ImgBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxImg2ImgSetTimestepsStep()), - ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ] -) - - -class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxImg2ImgBeforeDenoiseBlocks.values() - block_names = FluxImg2ImgBeforeDenoiseBlocks.keys() - - @property - def description(self): - return "Before denoise step that prepare the inputs for the denoise step for img2img task." - - -# before_denoise: all task (text2img, img2img) -class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks): - model_name = "flux-kontext" - block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step.\n" - + "This is an auto pipeline block that works for text2image.\n" - + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" - + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" - ) - - -# before_denoise: FluxKontext - -FluxKontextBeforeDenoiseBlocks = InsertableDict( - [ - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxKontextRoPEInputsStep()), - ] -) - - -class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): - block_classes = FluxKontextBeforeDenoiseBlocks.values() - block_names = FluxKontextBeforeDenoiseBlocks.keys() - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step\n" - "for img2img/text2img task for Flux Kontext." - ) - - -class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxKontextBeforeDenoiseStep, FluxBeforeDenoiseStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs for the denoise step.\n" - + "This is an auto pipeline block that works for text2image.\n" - + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" - + " - `FluxKontextBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" - ) - - -# denoise: text2image -class FluxAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxDenoiseStep] - block_names = ["denoise"] - block_trigger_inputs = [None] - - @property - def description(self) -> str: - return ( - "Denoise step that iteratively denoise the latents. " - "This is a auto pipeline block that works for text2image and img2img tasks." - " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks." - ) - - -# denoise: Flux Kontext - - -class FluxKontextAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxKontextDenoiseStep] - block_names = ["denoise"] - block_trigger_inputs = [None] - - @property - def description(self) -> str: - return ( - "Denoise step that iteratively denoise the latents for Flux Kontext. " - "This is a auto pipeline block that works for text2image and img2img tasks." - " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks." 
- ) - - -# decode: all task (text2img, img2img) -class FluxAutoDecodeStep(AutoPipelineBlocks): - block_classes = [FluxDecodeStep] - block_names = ["non-inpaint"] - block_trigger_inputs = [None] - - @property - def description(self): - return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`" - - -# inputs: text2image/img2img -FluxImg2ImgBlocks = InsertableDict( - [("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())] -) - - -class FluxImg2ImgInputStep(SequentialPipelineBlocks): - model_name = "flux" - block_classes = FluxImg2ImgBlocks.values() - block_names = FluxImg2ImgBlocks.keys() - - @property - def description(self): - return "Input step that prepares the inputs for the img2img denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - - -class FluxAutoInputStep(AutoPipelineBlocks): - block_classes = [FluxImg2ImgInputStep, FluxTextInputStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self): - return ( - "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" - " This is an auto pipeline block that works for text2image/img2img tasks.\n" - + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" - + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n" - ) - - -# inputs: Flux Kontext - -FluxKontextBlocks = InsertableDict( - [ - ("set_resolution", FluxKontextSetResolutionStep()), - ("text_inputs", FluxTextInputStep()), - ("additional_inputs", FluxKontextInputsDynamicStep()), - ] -) - - -class FluxKontextInputStep(SequentialPipelineBlocks): - model_name = "flux-kontext" - block_classes = FluxKontextBlocks.values() - block_names = FluxKontextBlocks.keys() - - @property - def description(self): - return ( - "Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n" - " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" - " - update height/width based `image_latents`, patchify `image_latents`." - ) - - -class FluxKontextAutoInputStep(AutoPipelineBlocks): - block_classes = [FluxKontextInputStep, FluxTextInputStep] - # block_classes = [FluxKontextInputStep] - block_names = ["img2img", "text2img"] - # block_names = ["img2img"] - block_trigger_inputs = ["image_latents", None] - # block_trigger_inputs = ["image_latents"] - - @property - def description(self): - return ( - "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" - " This is an auto pipeline block that works for text2image/img2img tasks.\n" - + " - `FluxKontextInputStep` (img2img) is used when `image_latents` is provided.\n" - + " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present." - ) - - -class FluxCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "flux" - block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep] - block_names = ["input", "before_denoise", "denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. 
\n" - + " - `FluxAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step supports text-to-image and image-to-image tasks for Flux:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings." - ) - - -class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "flux-kontext" - block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextAutoDenoiseStep] - block_names = ["input", "before_denoise", "denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. \n" - + " - `FluxKontextAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `FluxKontextAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `FluxKontextAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step supports text-to-image and image-to-image tasks for Flux:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings." - ) - - -# Auto blocks (text2image and img2img) -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxAutoVaeEncoderStep()), - ("denoise", FluxCoreDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -AUTO_BLOCKS_KONTEXT = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxKontextAutoVaeEncoderStep()), - ("denoise", FluxKontextCoreDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - - -class FluxAutoBlocks(SequentialPipelineBlocks): - model_name = "flux" - - block_classes = AUTO_BLOCKS.values() - block_names = AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for text-to-image and image-to-image using Flux.\n" - + "- for text-to-image generation, all you need to provide is `prompt`\n" - + "- for image-to-image generation, you need to provide either `image` or `image_latents`" - ) - - -class FluxKontextAutoBlocks(FluxAutoBlocks): - model_name = "flux-kontext" - - block_classes = AUTO_BLOCKS_KONTEXT.values() - block_names = AUTO_BLOCKS_KONTEXT.keys() - - -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("input", FluxTextInputStep()), - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ("denoise", FluxDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxVaeEncoderDynamicStep()), - ("input", FluxImg2ImgInputStep()), - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", FluxImg2ImgSetTimestepsStep()), - ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()), - ("prepare_rope_inputs", FluxRoPEInputsStep()), - ("denoise", FluxDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -FLUX_KONTEXT_BLOCKS = InsertableDict( - [ - ("text_encoder", FluxTextEncoderStep()), - ("vae_encoder", FluxVaeEncoderDynamicStep(sample_mode="argmax")), - ("input", FluxKontextInputStep()), - ("prepare_latents", FluxPrepareLatentsStep()), - ("set_timesteps", 
FluxSetTimestepsStep()), - ("prepare_rope_inputs", FluxKontextRoPEInputsStep()), - ("denoise", FluxKontextDenoiseStep()), - ("decode", FluxDecodeStep()), - ] -) - -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "img2img": IMAGE2IMAGE_BLOCKS, - "auto": AUTO_BLOCKS, - "auto_kontext": AUTO_BLOCKS_KONTEXT, - "kontext": FLUX_KONTEXT_BLOCKS, -} diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py new file mode 100644 index 000000000000..f2e78e933448 --- /dev/null +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux.py @@ -0,0 +1,586 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict, OutputParam +from .before_denoise import ( + FluxImg2ImgPrepareLatentsStep, + FluxImg2ImgSetTimestepsStep, + FluxPrepareLatentsStep, + FluxRoPEInputsStep, + FluxSetTimestepsStep, +) +from .decoders import FluxDecodeStep +from .denoise import FluxDenoiseStep +from .encoders import ( + FluxProcessImagesInputStep, + FluxTextEncoderStep, + FluxVaeEncoderStep, +) +from .inputs import ( + FluxAdditionalInputsStep, + FluxTextInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# vae encoder (run before before_denoise) + + +# auto_docstring +class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + """ + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + resized_image (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + + model_name = "flux" + + block_classes = [FluxProcessImagesInputStep(), FluxVaeEncoderStep()] + block_names = ["preprocess", "encode"] + + @property + def description(self) -> str: + return "Vae encoder step that preprocess andencode the image inputs into their latent representations." + + +# auto_docstring +class FluxAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block that works for img2img tasks. + - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided. - if `image` is not provided, + step will be skipped. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + resized_image (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. 
+ height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + + model_name = "flux" + block_classes = [FluxImg2ImgVaeEncoderStep] + block_names = ["img2img"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "Vae encoder step that encode the image inputs into their latent representations.\n" + + "This is an auto pipeline block that works for img2img tasks.\n" + + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided." + + " - if `image` is not provided, step will be skipped." + ) + + +# before_denoise: text2img +# auto_docstring +class FluxBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepares the inputs for the denoise step in text-to-image generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux" + block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] + + @property + def description(self): + return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation." + + +# before_denoise: img2img +# auto_docstring +class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step for img2img task. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. 
+ Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_latents (`Tensor`): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux" + block_classes = [ + FluxPrepareLatentsStep(), + FluxImg2ImgSetTimestepsStep(), + FluxImg2ImgPrepareLatentsStep(), + FluxRoPEInputsStep(), + ] + block_names = ["prepare_latents", "set_timesteps", "prepare_img2img_latents", "prepare_rope_inputs"] + + @property + def description(self): + return "Before denoise step that prepare the inputs for the denoise step for img2img task." + + +# before_denoise: all task (text2img, img2img) +# auto_docstring +class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step. + This is an auto pipeline block that works for text2image. + - `FluxBeforeDenoiseStep` (text2image) is used. + - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`): + TODO: Add description. + width (`int`): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_latents (`Tensor`, *optional*): + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. 
+ initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux" + block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep] + block_names = ["img2img", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs for the denoise step.\n" + + "This is an auto pipeline block that works for text2image.\n" + + " - `FluxBeforeDenoiseStep` (text2image) is used.\n" + + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" + ) + + +# inputs: text2image/img2img + + +# auto_docstring +class FluxImg2ImgInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the img2img denoising step. It: + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + + Outputs: + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + + model_name = "flux" + block_classes = [FluxTextInputStep(), FluxAdditionalInputsStep()] + block_names = ["text_inputs", "additional_inputs"] + + @property + def description(self): + return "Input step that prepares the inputs for the img2img denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" + " - update height/width based `image_latents`, patchify `image_latents`." + + +# auto_docstring +class FluxAutoInputStep(AutoPipelineBlocks): + """ + Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, + and patchified. + This is an auto pipeline block that works for text2image/img2img tasks. + - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided. + - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided. + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. 
+ + Outputs: + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + + model_name = "flux" + + block_classes = [FluxImg2ImgInputStep, FluxTextInputStep] + block_names = ["img2img", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" + " This is an auto pipeline block that works for text2image/img2img tasks.\n" + + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" + + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n" + ) + + +# auto_docstring +class FluxCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core step that performs the denoising process for Flux. + This step supports text-to-image and image-to-image tasks for Flux: + - for image-to-image generation, you need to provide `image_latents` + - for text-to-image generation, all you need to provide is prompt embeddings. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux" + block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxDenoiseStep] + block_names = ["input", "before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process for Flux.\n" + + "This step supports text-to-image and image-to-image tasks for Flux:\n" + + " - for image-to-image generation, you need to provide `image_latents`\n" + + " - for text-to-image generation, all you need to provide is prompt embeddings." 
+ ) + + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + + +# Auto blocks (text2image and img2img) +AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", FluxTextEncoderStep()), + ("vae_encoder", FluxAutoVaeEncoderStep()), + ("denoise", FluxCoreDenoiseStep()), + ("decode", FluxDecodeStep()), + ] +) + + +# auto_docstring +class FluxAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image and image-to-image using Flux. + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `image`, `prompt` + + Components: + text_encoder (`CLIPTextModel`) tokenizer (`CLIPTokenizer`) text_encoder_2 (`T5EncoderModel`) tokenizer_2 + (`T5TokenizerFast`) image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + prompt_2 (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + resized_image (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.6): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + + model_name = "flux" + + block_classes = AUTO_BLOCKS.values() + block_names = AUTO_BLOCKS.keys() + + _workflow_map = { + "text2image": {"prompt": True}, + "image2image": {"image": True, "prompt": True}, + } + + @property + def description(self): + return "Auto Modular pipeline for text-to-image and image-to-image using Flux." + + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py new file mode 100644 index 000000000000..b5a5dbf78c0e --- /dev/null +++ b/src/diffusers/modular_pipelines/flux/modular_blocks_flux_kontext.py @@ -0,0 +1,585 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict, OutputParam +from .before_denoise import ( + FluxKontextRoPEInputsStep, + FluxPrepareLatentsStep, + FluxRoPEInputsStep, + FluxSetTimestepsStep, +) +from .decoders import FluxDecodeStep +from .denoise import FluxKontextDenoiseStep +from .encoders import ( + FluxKontextProcessImagesInputStep, + FluxTextEncoderStep, + FluxVaeEncoderStep, +) +from .inputs import ( + FluxKontextAdditionalInputsStep, + FluxKontextSetResolutionStep, + FluxTextInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Flux Kontext vae encoder (run before before_denoise) +# auto_docstring +class FluxKontextVaeEncoderStep(SequentialPipelineBlocks): + """ + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + _auto_resize (`bool`, *optional*, defaults to True): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + + model_name = "flux-kontext" + + block_classes = [FluxKontextProcessImagesInputStep(), FluxVaeEncoderStep(sample_mode="argmax")] + block_names = ["preprocess", "encode"] + + @property + def description(self) -> str: + return "Vae encoder step that preprocess andencode the image inputs into their latent representations." + + +# auto_docstring +class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block that works for image-conditioned tasks. + - `FluxKontextVaeEncoderStep` (image_conditioned) is used when only `image` is provided. - if `image` is not + provided, step will be skipped. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + _auto_resize (`bool`, *optional*, defaults to True): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + processed_image (`None`): + TODO: Add description. + image_latents (`Tensor`): + The latents representing the reference image + """ + + model_name = "flux-kontext" + + block_classes = [FluxKontextVaeEncoderStep] + block_names = ["image_conditioned"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "Vae encoder step that encode the image inputs into their latent representations.\n" + + "This is an auto pipeline block that works for image-conditioned tasks.\n" + + " - `FluxKontextVaeEncoderStep` (image_conditioned) is used when only `image` is provided." + + " - if `image` is not provided, step will be skipped." + ) + + +# before_denoise: text2img +# auto_docstring +class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepares the inputs for the denoise step for Flux Kontext + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. 
+ num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux-kontext" + + block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] + + @property + def description(self): + return "Before denoise step that prepares the inputs for the denoise step for Flux Kontext\n" + "for text-to-image tasks." + + +# before_denoise: image-conditioned +# auto_docstring +class FluxKontextImageConditionedBeforeDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step for Flux Kontext + for image-conditioned tasks. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_height (`None`, *optional*): + TODO: Add description. + image_width (`None`, *optional*): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. 
+ """ + + model_name = "flux-kontext" + + block_classes = [FluxPrepareLatentsStep(), FluxSetTimestepsStep(), FluxKontextRoPEInputsStep()] + block_names = ["prepare_latents", "set_timesteps", "prepare_rope_inputs"] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs for the denoise step for Flux Kontext\n" + "for image-conditioned tasks." + ) + + +# auto_docstring +class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks): + """ + Before denoise step that prepare the inputs for the denoise step. + This is an auto pipeline block that works for text2image. + - `FluxKontextBeforeDenoiseStep` (text2image) is used. + - `FluxKontextImageConditionedBeforeDenoiseStep` (image_conditioned) is used when only `image_latents` is + provided. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_images_per_prompt (`int`, *optional*, defaults to 1): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. + Can be generated in input step. + dtype (`dtype`, *optional*): + The dtype of the model inputs + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + image_height (`None`, *optional*): + TODO: Add description. + image_width (`None`, *optional*): + TODO: Add description. + prompt_embeds (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + The initial latents to use for the denoising process + timesteps (`Tensor`): + The timesteps to use for inference + num_inference_steps (`int`): + The number of denoising steps to perform at inference time + guidance (`Tensor`): + Optional guidance to be used. + txt_ids (`list`): + The sequence lengths of the prompt embeds, used for RoPE calculation. + img_ids (`list`): + The sequence lengths of the image latents, used for RoPE calculation. + """ + + model_name = "flux-kontext" + + block_classes = [FluxKontextImageConditionedBeforeDenoiseStep, FluxKontextBeforeDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs for the denoise step.\n" + + "This is an auto pipeline block that works for text2image.\n" + + " - `FluxKontextBeforeDenoiseStep` (text2image) is used.\n" + + " - `FluxKontextImageConditionedBeforeDenoiseStep` (image_conditioned) is used when only `image_latents` is provided.\n" + ) + + +# inputs: Flux Kontext +# auto_docstring +class FluxKontextInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the both text2img and img2img denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`). + - update height/width based `image_latents`, patchify `image_latents`. + + Inputs: + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. 
+ max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + image_latents (`None`, *optional*): + TODO: Add description. + + Outputs: + height (`int`): + The height of the initial noisy latents + width (`int`): + The width of the initial noisy latents + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + + model_name = "flux-kontext" + block_classes = [FluxKontextSetResolutionStep(), FluxTextInputStep(), FluxKontextAdditionalInputsStep()] + block_names = ["set_resolution", "text_inputs", "additional_inputs"] + + @property + def description(self): + return ( + "Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" + " - update height/width based `image_latents`, patchify `image_latents`." + ) + + +# auto_docstring +class FluxKontextAutoInputStep(AutoPipelineBlocks): + """ + Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, + and patchified. + This is an auto pipeline block that works for text2image/img2img tasks. + - `FluxKontextInputStep` (image_conditioned) is used when `image_latents` is provided. + - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present. + + Inputs: + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + image_latents (`None`, *optional*): + TODO: Add description. 
+ + Outputs: + height (`int`): + The height of the initial noisy latents + width (`int`): + The width of the initial noisy latents + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `prompt_embeds`) + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation + pooled_prompt_embeds (`Tensor`): + pooled text embeddings used to guide the image generation + image_height (`int`): + The height of the image latents + image_width (`int`): + The width of the image latents + """ + + model_name = "flux-kontext" + block_classes = [FluxKontextInputStep, FluxTextInputStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" + " This is an auto pipeline block that works for text2image/img2img tasks.\n" + + " - `FluxKontextInputStep` (image_conditioned) is used when `image_latents` is provided.\n" + + " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present." + ) + + +# auto_docstring +class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core step that performs the denoising process for Flux Kontext. + This step supports text-to-image and image-conditioned tasks for Flux Kontext: + - for image-conditioned generation, you need to provide `image_latents` + - for text-to-image generation, all you need to provide is prompt embeddings. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + pooled_prompt_embeds (`Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be generated from text_encoder step. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux-kontext" + block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextDenoiseStep] + block_names = ["input", "before_denoise", "denoise"] + + @property + def description(self): + return ( + "Core step that performs the denoising process for Flux Kontext.\n" + + "This step supports text-to-image and image-conditioned tasks for Flux Kontext:\n" + + " - for image-conditioned generation, you need to provide `image_latents`\n" + + " - for text-to-image generation, all you need to provide is prompt embeddings." 
+ ) + + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + + +AUTO_BLOCKS_KONTEXT = InsertableDict( + [ + ("text_encoder", FluxTextEncoderStep()), + ("vae_encoder", FluxKontextAutoVaeEncoderStep()), + ("denoise", FluxKontextCoreDenoiseStep()), + ("decode", FluxDecodeStep()), + ] +) + + +# auto_docstring +class FluxKontextAutoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline for image-to-image using Flux Kontext. + + Supported workflows: + - `image_conditioned`: requires `image`, `prompt` + - `text2image`: requires `prompt` + + Components: + text_encoder (`CLIPTextModel`) tokenizer (`CLIPTokenizer`) text_encoder_2 (`T5EncoderModel`) tokenizer_2 + (`T5TokenizerFast`) image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + prompt_2 (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + _auto_resize (`bool`, *optional*, defaults to True): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + max_area (`int`, *optional*, defaults to 1048576): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 3.5): + TODO: Add description. + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + + model_name = "flux-kontext" + + block_classes = AUTO_BLOCKS_KONTEXT.values() + block_names = AUTO_BLOCKS_KONTEXT.keys() + _workflow_map = { + "image_conditioned": {"image": True, "prompt": True}, + "text2image": {"prompt": True}, + } + + @property + def description(self): + return "Modular pipeline for image-to-image using Flux Kontext." 
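
For reference, the `_workflow_map` declared on `FluxKontextAutoBlocks` above can be exercised through the `available_workflows` and `get_workflow` helpers this PR adds to `SequentialPipelineBlocks`. A minimal sketch, not part of the patch; the import path is an assumption and trigger values only matter by presence:

```py
# Hedged usage sketch for the workflow map on FluxKontextAutoBlocks.
from diffusers.modular_pipelines.flux import FluxKontextAutoBlocks  # assumed import path

blocks = FluxKontextAutoBlocks()

# `available_workflows` reads the keys of `_workflow_map`.
print(blocks.available_workflows)  # ["image_conditioned", "text2image"]

# `get_workflow` resolves the conditional sub-blocks by calling
# `get_execution_blocks` with that workflow's trigger inputs.
text2image_blocks = blocks.get_workflow("text2image")
print(text2image_blocks)
```
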
+ + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/modular_pipelines/flux2/__init__.py b/src/diffusers/modular_pipelines/flux2/__init__.py index 74907a9af806..d7cc8badcaf7 100644 --- a/src/diffusers/modular_pipelines/flux2/__init__.py +++ b/src/diffusers/modular_pipelines/flux2/__init__.py @@ -21,44 +21,14 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = [ - "Flux2TextEncoderStep", - "Flux2RemoteTextEncoderStep", - "Flux2VaeEncoderStep", - ] - _import_structure["before_denoise"] = [ - "Flux2SetTimestepsStep", - "Flux2PrepareLatentsStep", - "Flux2RoPEInputsStep", - "Flux2PrepareImageLatentsStep", - ] - _import_structure["denoise"] = [ - "Flux2LoopDenoiser", - "Flux2LoopAfterDenoiser", - "Flux2DenoiseLoopWrapper", - "Flux2DenoiseStep", - ] - _import_structure["decoders"] = ["Flux2DecodeStep"] - _import_structure["inputs"] = [ - "Flux2ProcessImagesInputStep", - "Flux2TextInputStep", - ] - _import_structure["modular_blocks_flux2"] = [ - "ALL_BLOCKS", - "AUTO_BLOCKS", - "REMOTE_AUTO_BLOCKS", - "TEXT2IMAGE_BLOCKS", - "IMAGE_CONDITIONED_BLOCKS", - "Flux2AutoBlocks", - "Flux2AutoVaeEncoderStep", - "Flux2CoreDenoiseStep", - "Flux2VaeEncoderSequentialStep", - ] - _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks", "Flux2KleinBaseAutoBlocks"] + _import_structure["encoders"] = ["Flux2RemoteTextEncoderStep"] + _import_structure["modular_blocks_flux2"] = ["Flux2AutoBlocks"] + _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks"] + _import_structure["modular_blocks_flux2_klein_base"] = ["Flux2KleinBaseAutoBlocks"] _import_structure["modular_pipeline"] = [ - "Flux2ModularPipeline", - "Flux2KleinModularPipeline", "Flux2KleinBaseModularPipeline", + "Flux2KleinModularPipeline", + "Flux2ModularPipeline", ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -68,43 +38,10 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .before_denoise import ( - Flux2PrepareImageLatentsStep, - Flux2PrepareLatentsStep, - Flux2RoPEInputsStep, - Flux2SetTimestepsStep, - ) - from .decoders import Flux2DecodeStep - from .denoise import ( - Flux2DenoiseLoopWrapper, - Flux2DenoiseStep, - Flux2LoopAfterDenoiser, - Flux2LoopDenoiser, - ) - from .encoders import ( - Flux2RemoteTextEncoderStep, - Flux2TextEncoderStep, - Flux2VaeEncoderStep, - ) - from .inputs import ( - Flux2ProcessImagesInputStep, - Flux2TextInputStep, - ) - from .modular_blocks_flux2 import ( - ALL_BLOCKS, - AUTO_BLOCKS, - IMAGE_CONDITIONED_BLOCKS, - REMOTE_AUTO_BLOCKS, - TEXT2IMAGE_BLOCKS, - Flux2AutoBlocks, - Flux2AutoVaeEncoderStep, - Flux2CoreDenoiseStep, - Flux2VaeEncoderSequentialStep, - ) - from .modular_blocks_flux2_klein import ( - Flux2KleinAutoBlocks, - Flux2KleinBaseAutoBlocks, - ) + from .encoders import Flux2RemoteTextEncoderStep + from .modular_blocks_flux2 import Flux2AutoBlocks + from .modular_blocks_flux2_klein import Flux2KleinAutoBlocks + from .modular_blocks_flux2_klein_base import Flux2KleinBaseAutoBlocks from .modular_pipeline import Flux2KleinBaseModularPipeline, Flux2KleinModularPipeline, Flux2ModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py index 41a0ff7dee28..b1033a7dff9e 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py +++ 
b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks @@ -30,7 +26,6 @@ from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep from .denoise import Flux2DenoiseStep from .encoders import ( - Flux2RemoteTextEncoderStep, Flux2TextEncoderStep, Flux2VaeEncoderStep, ) @@ -43,26 +38,69 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -Flux2VaeEncoderBlocks = InsertableDict( - [ - ("preprocess", Flux2ProcessImagesInputStep()), - ("encode", Flux2VaeEncoderStep()), - ] -) - - +# auto_docstring class Flux2VaeEncoderSequentialStep(SequentialPipelineBlocks): + """ + VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + model_name = "flux2" - block_classes = Flux2VaeEncoderBlocks.values() - block_names = Flux2VaeEncoderBlocks.keys() + block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] + block_names = ["preprocess", "encode"] @property def description(self) -> str: return "VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning." +# auto_docstring class Flux2AutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image inputs into their latent representations. + This is an auto pipeline block that works for image conditioning tasks. + - `Flux2VaeEncoderSequentialStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + block_classes = [Flux2VaeEncoderSequentialStep] block_names = ["img_conditioning"] block_trigger_inputs = ["image"] @@ -80,7 +118,6 @@ def description(self): Flux2CoreDenoiseBlocks = InsertableDict( [ ("input", Flux2TextInputStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), ("prepare_guidance", Flux2PrepareGuidanceStep()), @@ -91,7 +128,47 @@ def description(self): ) +# auto_docstring class Flux2CoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-dev. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. 
+ prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 4.0): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latents (`Tensor`, *optional*): + Packed image latents for conditioning. Shape: (B, img_seq_len, C) + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2" block_classes = Flux2CoreDenoiseBlocks.values() @@ -99,108 +176,181 @@ class Flux2CoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-dev.\n" - " - `Flux2TextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n" - " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" - " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" - " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" - " - `Flux2PrepareGuidanceStep` (prepare_guidance) prepares the guidance tensor for the denoising step.\n" - " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n" - " - `Flux2DenoiseStep` (denoise) iteratively denoises the latents.\n" - " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" - ) + return "Core denoise step that performs the denoising process for Flux2-dev." @property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] -AUTO_BLOCKS = InsertableDict( +Flux2ImageConditionedCoreDenoiseBlocks = InsertableDict( [ - ("text_encoder", Flux2TextEncoderStep()), - ("vae_encoder", Flux2AutoVaeEncoderStep()), - ("denoise", Flux2CoreDenoiseStep()), - ("decode", Flux2DecodeStep()), + ("input", Flux2TextInputStep()), + ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_guidance", Flux2PrepareGuidanceStep()), + ("prepare_rope_inputs", Flux2RoPEInputsStep()), + ("denoise", Flux2DenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), ] ) -REMOTE_AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", Flux2RemoteTextEncoderStep()), - ("vae_encoder", Flux2AutoVaeEncoderStep()), - ("denoise", Flux2CoreDenoiseStep()), - ("decode", Flux2DecodeStep()), - ] -) - +# auto_docstring +class Flux2ImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-dev with image conditioning. 
+ + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + image_latents (`list`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 4.0): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ -class Flux2AutoBlocks(SequentialPipelineBlocks): model_name = "flux2" - block_classes = AUTO_BLOCKS.values() - block_names = AUTO_BLOCKS.keys() + block_classes = Flux2ImageConditionedCoreDenoiseBlocks.values() + block_names = Flux2ImageConditionedCoreDenoiseBlocks.keys() @property def description(self): - return ( - "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2.\n" - "- For text-to-image generation, all you need to provide is `prompt`.\n" - "- For image-conditioned generation, you need to provide `image` (list of PIL images)." - ) + return "Core denoise step that performs the denoising process for Flux2-dev with image conditioning." @property def outputs(self): return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) + OutputParam.template("latents"), ] -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", Flux2TextEncoderStep()), - ("text_input", Flux2TextInputStep()), - ("prepare_latents", Flux2PrepareLatentsStep()), - ("set_timesteps", Flux2SetTimestepsStep()), - ("prepare_guidance", Flux2PrepareGuidanceStep()), - ("prepare_rope_inputs", Flux2RoPEInputsStep()), - ("denoise", Flux2DenoiseStep()), - ("after_denoise", Flux2UnpackLatentsStep()), - ("decode", Flux2DecodeStep()), - ] -) +class Flux2AutoCoreDenoiseStep(AutoPipelineBlocks): + model_name = "flux2" + + block_classes = [Flux2ImageConditionedCoreDenoiseStep, Flux2CoreDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Auto core denoise step that performs the denoising process for Flux2-dev." + "This is an auto pipeline block that works for text-to-image and image-conditioned generation." 
+ " - `Flux2CoreDenoiseStep` is used for text-to-image generation.\n" + " - `Flux2ImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" + ) + -IMAGE_CONDITIONED_BLOCKS = InsertableDict( +AUTO_BLOCKS = InsertableDict( [ ("text_encoder", Flux2TextEncoderStep()), - ("text_input", Flux2TextInputStep()), - ("preprocess_images", Flux2ProcessImagesInputStep()), - ("vae_encoder", Flux2VaeEncoderStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), - ("prepare_latents", Flux2PrepareLatentsStep()), - ("set_timesteps", Flux2SetTimestepsStep()), - ("prepare_guidance", Flux2PrepareGuidanceStep()), - ("prepare_rope_inputs", Flux2RoPEInputsStep()), - ("denoise", Flux2DenoiseStep()), - ("after_denoise", Flux2UnpackLatentsStep()), + ("vae_encoder", Flux2AutoVaeEncoderStep()), + ("denoise", Flux2AutoCoreDenoiseStep()), ("decode", Flux2DecodeStep()), ] ) -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "image_conditioned": IMAGE_CONDITIONED_BLOCKS, - "auto": AUTO_BLOCKS, - "remote": REMOTE_AUTO_BLOCKS, -} + +# auto_docstring +class Flux2AutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2. + + Supported workflows: + - `text2image`: requires `prompt` + - `image_conditioned`: requires `image`, `prompt` + + Components: + text_encoder (`Mistral3ForConditionalGeneration`) tokenizer (`AutoProcessor`) image_processor + (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`Flux2Transformer2DModel`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + text_encoder_out_layers (`tuple`, *optional*, defaults to (10, 20, 30)): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + guidance_scale (`None`, *optional*, defaults to 4.0): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + + model_name = "flux2" + + block_classes = AUTO_BLOCKS.values() + block_names = AUTO_BLOCKS.keys() + _workflow_map = { + "text2image": {"prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, + } + + @property + def description(self): + return "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2." 
+ + @property + def outputs(self): + return [ + OutputParam.template("images"), + ] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py index 984832d77be5..5dbae43a5a7f 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py @@ -12,30 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( - Flux2KleinBaseRoPEInputsStep, Flux2PrepareImageLatentsStep, Flux2PrepareLatentsStep, Flux2RoPEInputsStep, Flux2SetTimestepsStep, ) from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep -from .denoise import Flux2KleinBaseDenoiseStep, Flux2KleinDenoiseStep +from .denoise import Flux2KleinDenoiseStep from .encoders import ( - Flux2KleinBaseTextEncoderStep, Flux2KleinTextEncoderStep, Flux2VaeEncoderStep, ) from .inputs import ( - Flux2KleinBaseTextInputStep, Flux2ProcessImagesInputStep, Flux2TextInputStep, ) @@ -47,26 +40,72 @@ # VAE encoder ################ -Flux2KleinVaeEncoderBlocks = InsertableDict( - [ - ("preprocess", Flux2ProcessImagesInputStep()), - ("encode", Flux2VaeEncoderStep()), - ] -) - +# auto_docstring class Flux2KleinVaeEncoderSequentialStep(SequentialPipelineBlocks): - model_name = "flux2" + """ + VAE encoder step that preprocesses and encodes the image inputs into their latent representations. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + + model_name = "flux2-klein" - block_classes = Flux2KleinVaeEncoderBlocks.values() - block_names = Flux2KleinVaeEncoderBlocks.keys() + block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] + block_names = ["preprocess", "encode"] @property def description(self) -> str: return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations." +# auto_docstring class Flux2KleinAutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image inputs into their latent representations. + This is an auto pipeline block that works for image conditioning tasks. + - `Flux2KleinVaeEncoderSequentialStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. 
+ image_latents (`list`): + List of latent representations for each reference image + """ + + model_name = "flux2-klein" + block_classes = [Flux2KleinVaeEncoderSequentialStep] block_names = ["img_conditioning"] block_trigger_inputs = ["image"] @@ -88,7 +127,6 @@ def description(self): Flux2KleinCoreDenoiseBlocks = InsertableDict( [ ("input", Flux2TextInputStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), ("prepare_rope_inputs", Flux2RoPEInputsStep()), @@ -98,7 +136,46 @@ def description(self): ) +# auto_docstring class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image + generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latents (`Tensor`, *optional*): + Packed image latents for conditioning. Shape: (B, img_seq_len, C) + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2-klein" block_classes = Flux2KleinCoreDenoiseBlocks.values() @@ -106,127 +183,218 @@ class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "Core denoise step that performs the denoising process for Flux2-Klein (distilled model).\n" - " - `Flux2KleinTextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n" - " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" - " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" - " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" - " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n" - " - `Flux2KleinDenoiseStep` (denoise) iteratively denoises the latents.\n" - " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" - ) + return "Core denoise step that performs the denoising process for Flux2-Klein (distilled model), for text-to-image generation." 
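
Picking up the conditional VAE-encoder behavior described a bit earlier (trigger input `image`, no default block), a small sketch of the skip semantics follows. The module path is an assumption, since this step is not re-exported from the package `__init__`:

```py
# Sketch: an AutoPipelineBlocks with a single trigger and no default block is
# skipped entirely when the trigger input is absent.
from diffusers.modular_pipelines.flux2.modular_blocks_flux2_klein import (  # assumed path
    Flux2KleinAutoVaeEncoderStep,
)

vae_encoder = Flux2KleinAutoVaeEncoderStep()

# No `image`: no trigger matches and there is no default, so the step resolves to None.
assert vae_encoder.get_execution_blocks() is None

# With `image` present (only presence matters), the preprocess + encode blocks are selected.
print(vae_encoder.get_execution_blocks(image=True))
```
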
@property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] -Flux2KleinBaseCoreDenoiseBlocks = InsertableDict( +Flux2KleinImageConditionedCoreDenoiseBlocks = InsertableDict( [ - ("input", Flux2KleinBaseTextInputStep()), - ("prepare_latents", Flux2PrepareLatentsStep()), + ("input", Flux2TextInputStep()), ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), - ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), - ("denoise", Flux2KleinBaseDenoiseStep()), + ("prepare_rope_inputs", Flux2RoPEInputsStep()), + ("denoise", Flux2KleinDenoiseStep()), ("after_denoise", Flux2UnpackLatentsStep()), ] ) -class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): +# auto_docstring +class Flux2KleinImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + image_latents (`list`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2-klein" - block_classes = Flux2KleinBaseCoreDenoiseBlocks.values() - block_names = Flux2KleinBaseCoreDenoiseBlocks.keys() + + block_classes = Flux2KleinImageConditionedCoreDenoiseBlocks.values() + block_names = Flux2KleinImageConditionedCoreDenoiseBlocks.keys() @property def description(self): - return "Core denoise step that performs the denoising process for Flux2-Klein (base model)." - return ( - "Core denoise step that performs the denoising process for Flux2-Klein (base model).\n" - " - `Flux2KleinBaseTextInputStep` (input) standardizes the text inputs (prompt_embeds + negative_prompt_embeds) for the denoising step.\n" - " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" - " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" - " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" - " - `Flux2KleinBaseRoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids + negative_txt_ids) for the denoising step.\n" - " - `Flux2KleinBaseDenoiseStep` (denoise) iteratively denoises the latents using Classifier-Free Guidance.\n" - " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" - ) + return "Core denoise step that performs the denoising process for Flux2-Klein (distilled model) with image conditioning." 
@property def outputs(self): return [ - OutputParam( - name="latents", - type_hint=torch.Tensor, - description="The latents from the denoising step.", - ) + OutputParam.template("latents"), ] -### -### Auto blocks -### -class Flux2KleinAutoBlocks(SequentialPipelineBlocks): +# auto_docstring +class Flux2KleinAutoCoreDenoiseStep(AutoPipelineBlocks): + """ + Auto core denoise step that performs the denoising process for Flux2-Klein. + This is an auto pipeline block that works for text-to-image and image-conditioned generation. + - `Flux2KleinCoreDenoiseStep` is used for text-to-image generation. + - `Flux2KleinImageConditionedCoreDenoiseStep` is used for image-conditioned generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + image_latents (`list`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "flux2-klein" - block_classes = [ - Flux2KleinTextEncoderStep(), - Flux2KleinAutoVaeEncoderStep(), - Flux2KleinCoreDenoiseStep(), - Flux2DecodeStep(), - ] - block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + block_classes = [Flux2KleinImageConditionedCoreDenoiseStep, Flux2KleinCoreDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] @property def description(self): return ( - "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein.\n" - + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n" - + " - for text-to-image generation, all you need to provide is `prompt`.\n" + "Auto core denoise step that performs the denoising process for Flux2-Klein.\n" + "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n" + " - `Flux2KleinCoreDenoiseStep` is used for text-to-image generation.\n" + " - `Flux2KleinImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" ) - @property - def outputs(self): - return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) - ] + +### +### Auto blocks +### -class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): +# auto_docstring +class Flux2KleinAutoBlocks(SequentialPipelineBlocks): + """ + Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein. 
+ + Supported workflows: + - `text2image`: requires `prompt` + - `image_conditioned`: requires `image`, `prompt` + + Components: + text_encoder (`Qwen3ForCausalLM`) tokenizer (`Qwen2TokenizerFast`) image_processor (`Flux2ImageProcessor`) + vae (`AutoencoderKLFlux2`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`Flux2Transformer2DModel`) + + Configs: + is_distilled (default: True) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + text_encoder_out_layers (`tuple`, *optional*, defaults to (9, 18, 27)): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + model_name = "flux2-klein" block_classes = [ - Flux2KleinBaseTextEncoderStep(), + Flux2KleinTextEncoderStep(), Flux2KleinAutoVaeEncoderStep(), - Flux2KleinBaseCoreDenoiseStep(), + Flux2KleinAutoCoreDenoiseStep(), Flux2DecodeStep(), ] block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + _workflow_map = { + "text2image": {"prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, + } @property def description(self): - return ( - "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model).\n" - + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n" - + " - for text-to-image generation, all you need to provide is `prompt`.\n" - ) + return "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein." @property def outputs(self): return [ - OutputParam( - name="images", - type_hint=List[PIL.Image.Image], - description="The images from the decoding step.", - ) + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py new file mode 100644 index 000000000000..42e025c622b4 --- /dev/null +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein_base.py @@ -0,0 +1,413 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict, OutputParam +from .before_denoise import ( + Flux2KleinBaseRoPEInputsStep, + Flux2PrepareImageLatentsStep, + Flux2PrepareLatentsStep, + Flux2SetTimestepsStep, +) +from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep +from .denoise import Flux2KleinBaseDenoiseStep +from .encoders import ( + Flux2KleinBaseTextEncoderStep, + Flux2VaeEncoderStep, +) +from .inputs import ( + Flux2KleinBaseTextInputStep, + Flux2ProcessImagesInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +################ +# VAE encoder +################ + + +# auto_docstring +class Flux2KleinBaseVaeEncoderSequentialStep(SequentialPipelineBlocks): + """ + VAE encoder step that preprocesses and encodes the image inputs into their latent representations. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + + model_name = "flux2" + + block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()] + block_names = ["preprocess", "encode"] + + @property + def description(self) -> str: + return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations." + + +# auto_docstring +class Flux2KleinBaseAutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image inputs into their latent representations. + This is an auto pipeline block that works for image conditioning tasks. + - `Flux2KleinBaseVaeEncoderSequentialStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) + + Inputs: + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + condition_images (`list`): + TODO: Add description. + image_latents (`list`): + List of latent representations for each reference image + """ + + block_classes = [Flux2KleinBaseVaeEncoderSequentialStep] + block_names = ["img_conditioning"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "VAE encoder step that encodes the image inputs into their latent representations.\n" + "This is an auto pipeline block that works for image conditioning tasks.\n" + " - `Flux2KleinBaseVaeEncoderSequentialStep` is used when `image` is provided.\n" + " - If `image` is not provided, step will be skipped." 
+ ) + + +### +### Core denoise +### + +Flux2KleinBaseCoreDenoiseBlocks = InsertableDict( + [ + ("input", Flux2KleinBaseTextInputStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), + ("denoise", Flux2KleinBaseDenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), + ] +) + + +# auto_docstring +class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider + (`ClassifierFreeGuidance`) + + Configs: + is_distilled (default: False) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latents (`Tensor`, *optional*): + Packed image latents for conditioning. Shape: (B, img_seq_len, C) + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux2-klein" + block_classes = Flux2KleinBaseCoreDenoiseBlocks.values() + block_names = Flux2KleinBaseCoreDenoiseBlocks.keys() + + @property + def description(self): + return "Core denoise step that performs the denoising process for Flux2-Klein (base model), for text-to-image generation." + + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + + +Flux2KleinBaseImageConditionedCoreDenoiseBlocks = InsertableDict( + [ + ("input", Flux2KleinBaseTextInputStep()), + ("prepare_latents", Flux2PrepareLatentsStep()), + ("prepare_image_latents", Flux2PrepareImageLatentsStep()), + ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()), + ("denoise", Flux2KleinBaseDenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), + ] +) + + +# auto_docstring +class Flux2KleinBaseImageConditionedCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider + (`ClassifierFreeGuidance`) + + Configs: + is_distilled (default: False) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. 
+ width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux2-klein" + block_classes = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.values() + block_names = Flux2KleinBaseImageConditionedCoreDenoiseBlocks.keys() + + @property + def description(self): + return "Core denoise step that performs the denoising process for Flux2-Klein (base model) with image conditioning." + + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + + +# auto_docstring +class Flux2KleinBaseAutoCoreDenoiseStep(AutoPipelineBlocks): + """ + Auto core denoise step that performs the denoising process for Flux2-Klein (base model). + This is an auto pipeline block that works for text-to-image and image-conditioned generation. + - `Flux2KleinBaseCoreDenoiseStep` is used for text-to-image generation. + - `Flux2KleinBaseImageConditionedCoreDenoiseStep` is used for image-conditioned generation. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) guider + (`ClassifierFreeGuidance`) + + Configs: + is_distilled (default: False) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "flux2-klein" + block_classes = [Flux2KleinBaseImageConditionedCoreDenoiseStep, Flux2KleinBaseCoreDenoiseStep] + block_names = ["image_conditioned", "text2image"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Auto core denoise step that performs the denoising process for Flux2-Klein (base model).\n" + "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n" + " - `Flux2KleinBaseCoreDenoiseStep` is used for text-to-image generation.\n" + " - `Flux2KleinBaseImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n" + ) + + +### +### Auto blocks +### + + +# auto_docstring +class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): + """ + Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model). 
+ + Supported workflows: + - `text2image`: requires `prompt` + - `image_conditioned`: requires `image`, `prompt` + + Components: + text_encoder (`Qwen3ForCausalLM`) tokenizer (`Qwen2TokenizerFast`) guider (`ClassifierFreeGuidance`) + image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`) + + Configs: + is_distilled (default: False) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`int`, *optional*, defaults to 512): + TODO: Add description. + text_encoder_out_layers (`tuple`, *optional*, defaults to (9, 18, 27)): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + latents (`Tensor | NoneType`): + TODO: Add description. + image_latents (`list`, *optional*): + TODO: Add description. + num_inference_steps (`None`): + TODO: Add description. + timesteps (`None`): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + joint_attention_kwargs (`None`, *optional*): + TODO: Add description. + image_latent_ids (`Tensor`, *optional*): + Position IDs for image latents. Shape: (B, img_seq_len, 4) + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + + model_name = "flux2-klein" + block_classes = [ + Flux2KleinBaseTextEncoderStep(), + Flux2KleinBaseAutoVaeEncoderStep(), + Flux2KleinBaseAutoCoreDenoiseStep(), + Flux2DecodeStep(), + ] + block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + _workflow_map = { + "text2image": {"prompt": True}, + "image_conditioned": {"image": True, "prompt": True}, + } + + @property + def description(self): + return "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model)." + + @property + def outputs(self): + return [ + OutputParam.template("images"), + ] diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 0eff85926fc5..76a850b63c4e 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -40,8 +40,11 @@ InputParam, InsertableDict, OutputParam, + combine_inputs, + combine_outputs, format_components, format_configs, + format_workflow, generate_modular_model_card_content, make_doc_string, ) @@ -287,6 +290,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin): config_name = "modular_config.json" model_name = None + _workflow_map = None @classmethod def _get_signature_keys(cls, obj): @@ -342,6 +346,35 @@ def _get_outputs(self): def outputs(self) -> list[OutputParam]: return self._get_outputs() + # currentlyonly ConditionalPipelineBlocks and SequentialPipelineBlocks support `get_execution_blocks` + def get_execution_blocks(self, **kwargs): + """ + Get the block(s) that would execute given the inputs. Must be implemented by subclasses that support + conditional block selection. + + Args: + **kwargs: Input names and values. Only trigger inputs affect block selection. 
+ """ + raise NotImplementedError(f"`get_execution_blocks` is not implemented for {self.__class__.__name__}") + + # currently only SequentialPipelineBlocks support workflows + @property + def available_workflows(self): + """ + Returns a list of available workflow names. Must be implemented by subclasses that define `_workflow_map`. + """ + raise NotImplementedError(f"`available_workflows` is not implemented for {self.__class__.__name__}") + + def get_workflow(self, workflow_name: str): + """ + Get the execution blocks for a specific workflow. Must be implemented by subclasses that define + `_workflow_map`. + + Args: + workflow_name: Name of the workflow to retrieve. + """ + raise NotImplementedError(f"`get_workflow` is not implemented for {self.__class__.__name__}") + @classmethod def from_pretrained( cls, @@ -480,72 +513,6 @@ def set_block_state(self, state: PipelineState, block_state: BlockState): if current_value is not param: # Using identity comparison to check if object was modified state.set(param_name, param, input_param.kwargs_type) - @staticmethod - def combine_inputs(*named_input_lists: list[tuple[str, list[InputParam]]]) -> list[InputParam]: - """ - Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if - current default value is None and new default value is not None. Warns if multiple non-None default values - exist for the same input. - - Args: - named_input_lists: list of tuples containing (block_name, input_param_list) pairs - - Returns: - list[InputParam]: Combined list of unique InputParam objects - """ - combined_dict = {} # name -> InputParam - value_sources = {} # name -> block_name - - for block_name, inputs in named_input_lists: - for input_param in inputs: - if input_param.name is None and input_param.kwargs_type is not None: - input_name = "*_" + input_param.kwargs_type - else: - input_name = input_param.name - if input_name in combined_dict: - current_param = combined_dict[input_name] - if ( - current_param.default is not None - and input_param.default is not None - and current_param.default != input_param.default - ): - warnings.warn( - f"Multiple different default values found for input '{input_name}': " - f"{current_param.default} (from block '{value_sources[input_name]}') and " - f"{input_param.default} (from block '{block_name}'). Using {current_param.default}." - ) - if current_param.default is None and input_param.default is not None: - combined_dict[input_name] = input_param - value_sources[input_name] = block_name - else: - combined_dict[input_name] = input_param - value_sources[input_name] = block_name - - return list(combined_dict.values()) - - @staticmethod - def combine_outputs(*named_output_lists: list[tuple[str, list[OutputParam]]]) -> list[OutputParam]: - """ - Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first - occurrence of each output name. 
- - Args: - named_output_lists: list of tuples containing (block_name, output_param_list) pairs - - Returns: - list[OutputParam]: Combined list of unique OutputParam objects - """ - combined_dict = {} # name -> OutputParam - - for block_name, outputs in named_output_lists: - for output_param in outputs: - if (output_param.name not in combined_dict) or ( - combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None - ): - combined_dict[output_param.name] = output_param - - return list(combined_dict.values()) - @property def input_names(self) -> list[str]: return [input_param.name for input_param in self.inputs if input_param.name is not None] @@ -577,7 +544,8 @@ def doc(self): class ConditionalPipelineBlocks(ModularPipelineBlocks): """ A Pipeline Blocks that conditionally selects a block to run based on the inputs. Subclasses must implement the - `select_block` method to define the logic for selecting the block. + `select_block` method to define the logic for selecting the block. Currently, we only support selection logic based + on the presence or absence of inputs (i.e., whether they are `None` or not) This class inherits from [`ModularPipelineBlocks`]. Check the superclass documentation for the generic methods the library implements for all the pipeline blocks (such as loading or saving etc.) @@ -585,15 +553,20 @@ class ConditionalPipelineBlocks(ModularPipelineBlocks): > [!WARNING] > This is an experimental feature and is likely to change in the future. Attributes: - block_classes: List of block classes to be used - block_names: List of prefixes for each block - block_trigger_inputs: List of input names that select_block() uses to determine which block to run + block_classes: List of block classes to be used. Must have the same length as `block_names`. + block_names: List of names for each block. Must have the same length as `block_classes`. + block_trigger_inputs: List of input names that `select_block()` uses to determine which block to run. + For `ConditionalPipelineBlocks`, this does not need to correspond to `block_names` and `block_classes`. For + `AutoPipelineBlocks`, this must have the same length as `block_names` and `block_classes`, where each + element specifies the trigger input for the corresponding block. + default_block_name: Name of the default block to run when no trigger inputs match. + If None, this block can be skipped entirely when no trigger inputs are provided. 
""" block_classes = [] block_names = [] block_trigger_inputs = [] - default_block_name = None # name of the default block if no trigger inputs are provided, if None, this block can be skipped if no trigger inputs are provided + default_block_name = None def __init__(self): sub_blocks = InsertableDict() @@ -657,7 +630,7 @@ def required_inputs(self) -> list[str]: @property def inputs(self) -> list[tuple[str, Any]]: named_inputs = [(name, block.inputs) for name, block in self.sub_blocks.items()] - combined_inputs = self.combine_inputs(*named_inputs) + combined_inputs = combine_inputs(*named_inputs) # mark Required inputs only if that input is required by all the blocks for input_param in combined_inputs: if input_param.name in self.required_inputs: @@ -669,15 +642,16 @@ def inputs(self) -> list[tuple[str, Any]]: @property def intermediate_outputs(self) -> list[str]: named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()] - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) return combined_outputs @property def outputs(self) -> list[str]: named_outputs = [(name, block.outputs) for name, block in self.sub_blocks.items()] - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) return combined_outputs + # used for `__repr__` def _get_trigger_inputs(self) -> set: """ Returns a set of all unique trigger input values found in this block and nested blocks. @@ -706,16 +680,16 @@ def fn_recursive_get_trigger(blocks): return all_triggers - @property - def trigger_inputs(self): - """All trigger inputs including from nested blocks.""" - return self._get_trigger_inputs() - def select_block(self, **kwargs) -> str | None: """ Select the block to run based on the trigger inputs. Subclasses must implement this method to define the logic for selecting the block. + Note: When trigger inputs include intermediate outputs from earlier blocks, the selection logic should only + depend on the presence or absence of the input (i.e., whether it is None or not), not on its actual value. This + is because `get_execution_blocks()` resolves conditions statically by propagating intermediate output names + without their runtime values. + Args: **kwargs: Trigger input names and their values from the state. @@ -750,6 +724,39 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: logger.error(error_msg) raise + def get_execution_blocks(self, **kwargs) -> ModularPipelineBlocks | None: + """ + Get the block(s) that would execute given the inputs. + + Recursively resolves nested ConditionalPipelineBlocks until reaching either: + - A leaf block (no sub_blocks or LoopSequentialPipelineBlocks) → returns single `ModularPipelineBlocks` + - A `SequentialPipelineBlocks` → delegates to its `get_execution_blocks()` which returns + a `SequentialPipelineBlocks` containing the resolved execution blocks + + Args: + **kwargs: Input names and values. Only trigger inputs affect block selection. 
+ + Returns: + - `ModularPipelineBlocks`: A leaf block or resolved `SequentialPipelineBlocks` + - `None`: If this block would be skipped (no trigger matched and no default) + """ + trigger_kwargs = {name: kwargs.get(name) for name in self.block_trigger_inputs if name is not None} + block_name = self.select_block(**trigger_kwargs) + + if block_name is None: + block_name = self.default_block_name + + if block_name is None: + return None + + block = self.sub_blocks[block_name] + + # Recursively resolve until we hit a leaf block + if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks): + return block.get_execution_blocks(**kwargs) + + return block + def __repr__(self): class_name = self.__class__.__name__ base_class = self.__class__.__bases__[0].__name__ @@ -757,11 +764,11 @@ def __repr__(self): f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n" ) - if self.trigger_inputs: + if self._get_trigger_inputs(): header += "\n" header += " " + "=" * 100 + "\n" header += " This pipeline contains blocks that are selected at runtime based on inputs.\n" - header += f" Trigger Inputs: {sorted(self.trigger_inputs)}\n" + header += f" Trigger Inputs: {sorted(self._get_trigger_inputs())}\n" header += " " + "=" * 100 + "\n\n" # Format description with proper indentation @@ -828,24 +835,56 @@ def doc(self): class AutoPipelineBlocks(ConditionalPipelineBlocks): """ - A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs. + A Pipeline Blocks that automatically selects a block to run based on the presence of trigger inputs. + + This is a specialized version of `ConditionalPipelineBlocks` where: + - Each block has one corresponding trigger input (1:1 mapping) + - Block selection is automatic: the first block whose trigger input is present gets selected + - `block_trigger_inputs` must have the same length as `block_names` and `block_classes` + - Use `None` in `block_trigger_inputs` to specify the default block, i.e the block that will run if no trigger + inputs are present + + Attributes: + block_classes: + List of block classes to be used. Must have the same length as `block_names` and + `block_trigger_inputs`. + block_names: + List of names for each block. Must have the same length as `block_classes` and `block_trigger_inputs`. + block_trigger_inputs: + List of input names where each element specifies the trigger input for the corresponding block. Use + `None` to mark the default block. + + Example: + ```python + class MyAutoBlock(AutoPipelineBlocks): + block_classes = [InpaintEncoderBlock, ImageEncoderBlock, TextEncoderBlock] + block_names = ["inpaint", "img2img", "text2img"] + block_trigger_inputs = ["mask_image", "image", None] # text2img is the default + ``` + + With this definition: + - As long as `mask_image` is provided, "inpaint" block runs (regardless of `image` being provided or not) + - If `mask_image` is not provided but `image` is provided, "img2img" block runs + - Otherwise, "text2img" block runs (default, trigger is `None`) """ def __init__(self): super().__init__() + if self.default_block_name is not None: + raise ValueError( + f"In {self.__class__.__name__}, do not set `default_block_name` for AutoPipelineBlocks. " + f"Use `None` in `block_trigger_inputs` to specify the default block." 
+ ) + if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)): raise ValueError( f"In {self.__class__.__name__}, the number of block_classes, block_names, and block_trigger_inputs must be the same." ) - @property - def default_block_name(self) -> str | None: - """Derive default_block_name from block_trigger_inputs (None entry).""" if None in self.block_trigger_inputs: idx = self.block_trigger_inputs.index(None) - return self.block_names[idx] - return None + self.default_block_name = self.block_names[idx] def select_block(self, **kwargs) -> str | None: """Select block based on which trigger input is present (not None).""" @@ -899,6 +938,29 @@ def expected_configs(self): expected_configs.append(config) return expected_configs + @property + def available_workflows(self): + if self._workflow_map is None: + raise NotImplementedError( + f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}" + ) + + return list(self._workflow_map.keys()) + + def get_workflow(self, workflow_name: str): + if self._workflow_map is None: + raise NotImplementedError( + f"workflows is not supported because _workflow_map is not set for {self.__class__.__name__}" + ) + + if workflow_name not in self._workflow_map: + raise ValueError(f"Workflow {workflow_name} not found in {self.__class__.__name__}") + + trigger_inputs = self._workflow_map[workflow_name] + workflow_blocks = self.get_execution_blocks(**trigger_inputs) + + return workflow_blocks + @classmethod def from_blocks_dict( cls, blocks_dict: dict[str, Any], description: str | None = None @@ -994,7 +1056,7 @@ def intermediate_outputs(self) -> list[str]: # filter out them here so they do not end up as intermediate_outputs if name not in inp_names: named_outputs.append((name, block.intermediate_outputs)) - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) return combined_outputs # YiYi TODO: I think we can remove the outputs property @@ -1018,6 +1080,7 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState: raise return pipeline, state + # used for `__repr__` def _get_trigger_inputs(self): """ Returns a set of all unique trigger input values found in the blocks. @@ -1041,89 +1104,56 @@ def fn_recursive_get_trigger(blocks): return fn_recursive_get_trigger(self.sub_blocks) - @property - def trigger_inputs(self): - return self._get_trigger_inputs() - - def _traverse_trigger_blocks(self, active_inputs): + def get_execution_blocks(self, **kwargs) -> "SequentialPipelineBlocks": """ - Traverse blocks and select which ones would run given the active inputs. + Get the blocks that would execute given the specified inputs. + + As the traversal walks through sequential blocks, intermediate outputs from resolved blocks are added to the + active inputs. This means conditional blocks that depend on intermediates (e.g., "run img2img if image_latents + is present") will resolve correctly, as long as the condition is based on presence/absence (None or not None), + not on the actual value. + Args: - active_inputs: Dict of input names to values that are "present" + **kwargs: Input names and values. Only trigger inputs affect block selection. 
Returns: - OrderedDict of block_name -> block that would execute + SequentialPipelineBlocks containing only the blocks that would execute """ + # Copy kwargs so we can add outputs as we traverse + active_inputs = dict(kwargs) def fn_recursive_traverse(block, block_name, active_inputs): result_blocks = OrderedDict() # ConditionalPipelineBlocks (includes AutoPipelineBlocks) if isinstance(block, ConditionalPipelineBlocks): - trigger_kwargs = {name: active_inputs.get(name) for name in block.block_trigger_inputs} - selected_block_name = block.select_block(**trigger_kwargs) - - if selected_block_name is None: - selected_block_name = block.default_block_name - - if selected_block_name is None: + block = block.get_execution_blocks(**active_inputs) + if block is None: return result_blocks - selected_block = block.sub_blocks[selected_block_name] - - if selected_block.sub_blocks: - result_blocks.update(fn_recursive_traverse(selected_block, block_name, active_inputs)) - else: - result_blocks[block_name] = selected_block - if hasattr(selected_block, "outputs"): - for out in selected_block.outputs: - active_inputs[out.name] = True - - return result_blocks - - # SequentialPipelineBlocks or LoopSequentialPipelineBlocks - if block.sub_blocks: + # Has sub_blocks (SequentialPipelineBlocks/ConditionalPipelineBlocks) + if block.sub_blocks and not isinstance(block, LoopSequentialPipelineBlocks): for sub_block_name, sub_block in block.sub_blocks.items(): - blocks_to_update = fn_recursive_traverse(sub_block, sub_block_name, active_inputs) - blocks_to_update = {f"{block_name}.{k}": v for k, v in blocks_to_update.items()} - result_blocks.update(blocks_to_update) + nested_blocks = fn_recursive_traverse(sub_block, sub_block_name, active_inputs) + nested_blocks = {f"{block_name}.{k}": v for k, v in nested_blocks.items()} + result_blocks.update(nested_blocks) else: + # Leaf block: single ModularPipelineBlocks or LoopSequentialPipelineBlocks result_blocks[block_name] = block - if hasattr(block, "outputs"): - for out in block.outputs: + # Add outputs to active_inputs so subsequent blocks can use them as triggers + if hasattr(block, "intermediate_outputs"): + for out in block.intermediate_outputs: active_inputs[out.name] = True return result_blocks all_blocks = OrderedDict() for block_name, block in self.sub_blocks.items(): - blocks_to_update = fn_recursive_traverse(block, block_name, active_inputs) - all_blocks.update(blocks_to_update) - return all_blocks + nested_blocks = fn_recursive_traverse(block, block_name, active_inputs) + all_blocks.update(nested_blocks) - def get_execution_blocks(self, **kwargs): - """ - Get the blocks that would execute given the specified inputs. - - Args: - **kwargs: Input names and values. Only trigger inputs affect block selection. - Pass any inputs that would be non-None at runtime. 
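A rough usage sketch of the resolver described above. `QwenImageAutoBlocks` and its import path are assumptions used purely for illustration; since selection only checks the presence or absence of trigger inputs, placeholder values are enough.

```py
from diffusers.modular_pipelines import QwenImageAutoBlocks  # assumed import path

blocks = QwenImageAutoBlocks()

# Resolves to the text-to-image sub-blocks only.
text2img = blocks.get_execution_blocks(prompt="a cat")

# Resolves to the inpainting sub-blocks; placeholder values are fine because
# selection only checks that the trigger inputs are not None.
inpaint = blocks.get_execution_blocks(prompt="a cat", image=..., mask_image=...)

print(list(inpaint.sub_blocks.keys()))
```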
- - Returns: - SequentialPipelineBlocks containing only the blocks that would execute - - Example: - # Get blocks for inpainting workflow blocks = pipeline.get_execution_blocks(prompt="a cat", mask=mask, - image=image) - - # Get blocks for text2image workflow blocks = pipeline.get_execution_blocks(prompt="a cat") - """ - # Filter out None values - active_inputs = {k: v for k, v in kwargs.items() if v is not None} - - blocks_triggered = self._traverse_trigger_blocks(active_inputs) - return SequentialPipelineBlocks.from_blocks_dict(blocks_triggered) + return SequentialPipelineBlocks.from_blocks_dict(all_blocks) def __repr__(self): class_name = self.__class__.__name__ @@ -1132,18 +1162,23 @@ def __repr__(self): f"{class_name}(\n Class: {base_class}\n" if base_class and base_class != "object" else f"{class_name}(\n" ) - if self.trigger_inputs: + if self._workflow_map is None and self._get_trigger_inputs(): header += "\n" header += " " + "=" * 100 + "\n" header += " This pipeline contains blocks that are selected at runtime based on inputs.\n" - header += f" Trigger Inputs: {[inp for inp in self.trigger_inputs if inp is not None]}\n" + header += f" Trigger Inputs: {[inp for inp in self._get_trigger_inputs() if inp is not None]}\n" # Get first trigger input as example - example_input = next(t for t in self.trigger_inputs if t is not None) + example_input = next(t for t in self._get_trigger_inputs() if t is not None) header += f" Use `get_execution_blocks()` to see selected blocks (e.g. `get_execution_blocks({example_input}=...)`).\n" header += " " + "=" * 100 + "\n\n" + description = self.description + if self._workflow_map is not None: + workflow_str = format_workflow(self._workflow_map) + description = f"{self.description}\n\n{workflow_str}" + # Format description with proper indentation - desc_lines = self.description.split("\n") + desc_lines = description.split("\n") desc = [] # First line with "Description:" label desc.append(f" Description: {desc_lines[0]}") @@ -1191,10 +1226,15 @@ def __repr__(self): @property def doc(self): + description = self.description + if self._workflow_map is not None: + workflow_str = format_workflow(self._workflow_map) + description = f"{self.description}\n\n{workflow_str}" + return make_doc_string( self.inputs, self.outputs, - self.description, + description=description, class_name=self.__class__.__name__, expected_components=self.expected_components, expected_configs=self.expected_configs, @@ -1327,7 +1367,7 @@ def required_inputs(self) -> list[str]: @property def intermediate_outputs(self) -> list[str]: named_outputs = [(name, block.intermediate_outputs) for name, block in self.sub_blocks.items()] - combined_outputs = self.combine_outputs(*named_outputs) + combined_outputs = combine_outputs(*named_outputs) for output in self.loop_intermediate_outputs: if output.name not in {output.name for output in combined_outputs}: combined_outputs.append(output) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index aa378f715974..cab17c2aed5c 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -14,6 +14,7 @@ import inspect import re +import warnings from collections import OrderedDict from dataclasses import dataclass, field from types import UnionType @@ -503,6 +504,10 @@ class ConfigSpec: "type_hint": list[PIL.Image.Image], "description": "Generated images.", }, + "videos": { + "type_hint": 
list[PIL.Image.Image], + "description": "The generated videos.", + }, "latents": { "type_hint": torch.Tensor, "description": "Denoised latents.", @@ -887,6 +892,30 @@ def format_configs(configs, indent_level=4, max_line_length=115, add_empty_lines return "\n".join(formatted_configs) +def format_workflow(workflow_map): + """Format a workflow map into a readable string representation. + + Args: + workflow_map: Dictionary mapping workflow names to trigger inputs + + Returns: + A formatted string representing all workflows + """ + if workflow_map is None: + return "" + + lines = ["Supported workflows:"] + for workflow_name, trigger_inputs in workflow_map.items(): + required_inputs = [k for k, v in trigger_inputs.items() if v] + if required_inputs: + inputs_str = ", ".join(f"`{t}`" for t in required_inputs) + lines.append(f" - `{workflow_name}`: requires {inputs_str}") + else: + lines.append(f" - `{workflow_name}`: default (no additional inputs required)") + + return "\n".join(lines) + + def make_doc_string( inputs, outputs, @@ -943,6 +972,72 @@ def make_doc_string( return output +def combine_inputs(*named_input_lists: list[tuple[str, list[InputParam]]]) -> list[InputParam]: + """ + Combines multiple lists of InputParam objects from different blocks. For duplicate inputs, updates only if current + default value is None and new default value is not None. Warns if multiple non-None default values exist for the + same input. + + Args: + named_input_lists: List of tuples containing (block_name, input_param_list) pairs + + Returns: + List[InputParam]: Combined list of unique InputParam objects + """ + combined_dict = {} # name -> InputParam + value_sources = {} # name -> block_name + + for block_name, inputs in named_input_lists: + for input_param in inputs: + if input_param.name is None and input_param.kwargs_type is not None: + input_name = "*_" + input_param.kwargs_type + else: + input_name = input_param.name + if input_name in combined_dict: + current_param = combined_dict[input_name] + if ( + current_param.default is not None + and input_param.default is not None + and current_param.default != input_param.default + ): + warnings.warn( + f"Multiple different default values found for input '{input_name}': " + f"{current_param.default} (from block '{value_sources[input_name]}') and " + f"{input_param.default} (from block '{block_name}'). Using {current_param.default}." + ) + if current_param.default is None and input_param.default is not None: + combined_dict[input_name] = input_param + value_sources[input_name] = block_name + else: + combined_dict[input_name] = input_param + value_sources[input_name] = block_name + + return list(combined_dict.values()) + + +def combine_outputs(*named_output_lists: list[tuple[str, list[OutputParam]]]) -> list[OutputParam]: + """ + Combines multiple lists of OutputParam objects from different blocks. For duplicate outputs, keeps the first + occurrence of each output name. 
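A small, hedged sketch of how `combine_inputs` merges duplicate inputs across blocks, assuming `InputParam` accepts `name` and `default` as shown (the exact constructor signature may differ).

```py
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam, combine_inputs

block_a = [InputParam("num_inference_steps", default=50)]
block_b = [InputParam("num_inference_steps", default=30), InputParam("strength", default=0.9)]

# Conflicting non-None defaults trigger a UserWarning; the first default (50) is kept.
combined = combine_inputs(("block_a", block_a), ("block_b", block_b))
print([(p.name, p.default) for p in combined])
# [('num_inference_steps', 50), ('strength', 0.9)]
```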
+ + Args: + named_output_lists: List of tuples containing (block_name, output_param_list) pairs + + Returns: + List[OutputParam]: Combined list of unique OutputParam objects + """ + combined_dict = {} # name -> OutputParam + + for block_name, outputs in named_output_lists: + for output_param in outputs: + if (output_param.name not in combined_dict) or ( + combined_dict[output_param.name].kwargs_type is None and output_param.kwargs_type is not None + ): + combined_dict[output_param.name] = output_param + + return list(combined_dict.values()) + + def generate_modular_model_card_content(blocks) -> dict[str, Any]: """ Generate model card content for a modular pipeline. diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py index 2b01a5b5a4b5..2e6af4495b37 100644 --- a/src/diffusers/modular_pipelines/qwenimage/__init__.py +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -21,27 +21,15 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["modular_blocks_qwenimage"] = [ - "AUTO_BLOCKS", - "QwenImageAutoBlocks", - ] - _import_structure["modular_blocks_qwenimage_edit"] = [ - "EDIT_AUTO_BLOCKS", - "QwenImageEditAutoBlocks", - ] - _import_structure["modular_blocks_qwenimage_edit_plus"] = [ - "EDIT_PLUS_AUTO_BLOCKS", - "QwenImageEditPlusAutoBlocks", - ] - _import_structure["modular_blocks_qwenimage_layered"] = [ - "LAYERED_AUTO_BLOCKS", - "QwenImageLayeredAutoBlocks", - ] + _import_structure["modular_blocks_qwenimage"] = ["QwenImageAutoBlocks"] + _import_structure["modular_blocks_qwenimage_edit"] = ["QwenImageEditAutoBlocks"] + _import_structure["modular_blocks_qwenimage_edit_plus"] = ["QwenImageEditPlusAutoBlocks"] + _import_structure["modular_blocks_qwenimage_layered"] = ["QwenImageLayeredAutoBlocks"] _import_structure["modular_pipeline"] = [ "QwenImageEditModularPipeline", "QwenImageEditPlusModularPipeline", - "QwenImageModularPipeline", "QwenImageLayeredModularPipeline", + "QwenImageModularPipeline", ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -51,22 +39,10 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .modular_blocks_qwenimage import ( - AUTO_BLOCKS, - QwenImageAutoBlocks, - ) - from .modular_blocks_qwenimage_edit import ( - EDIT_AUTO_BLOCKS, - QwenImageEditAutoBlocks, - ) - from .modular_blocks_qwenimage_edit_plus import ( - EDIT_PLUS_AUTO_BLOCKS, - QwenImageEditPlusAutoBlocks, - ) - from .modular_blocks_qwenimage_layered import ( - LAYERED_AUTO_BLOCKS, - QwenImageLayeredAutoBlocks, - ) + from .modular_blocks_qwenimage import QwenImageAutoBlocks + from .modular_blocks_qwenimage_edit import QwenImageEditAutoBlocks + from .modular_blocks_qwenimage_edit_plus import QwenImageEditPlusAutoBlocks + from .modular_blocks_qwenimage_layered import QwenImageLayeredAutoBlocks from .modular_pipeline import ( QwenImageEditModularPipeline, QwenImageEditPlusModularPipeline, diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index c4e14566a795..51b5c6ac8c3d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -558,7 +558,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): Inputs: num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. 
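For reference, a minimal sketch of the string that the `format_workflow` helper added in `modular_pipeline_utils.py` above produces; the workflow map below is illustrative only.

```py
from diffusers.modular_pipelines.modular_pipeline_utils import format_workflow

workflow_map = {
    "text2image": {"prompt": True},
    "image2image": {"prompt": True, "image": True},
}
print(format_workflow(workflow_map))
# Supported workflows:
#   - `text2image`: requires `prompt`
#   - `image2image`: requires `prompt`, `image`
```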
- sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. latents (`Tensor`): The initial random noised latents for the denoising process. Can be generated in prepare latents step. @@ -644,7 +644,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): Inputs: num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. @@ -725,7 +725,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): Inputs: num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. latents (`Tensor`): The latents to use for the denoising process. Can be generated in prepare latents step. @@ -842,7 +842,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the images latents, used for RoPE calculation """ @@ -917,7 +917,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the images latents, used for RoPE calculation """ @@ -995,9 +995,9 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): batch_size (`int`, *optional*, defaults to 1): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step. - image_height (`List`): + image_height (`list`): The heights of the reference images. Can be generated in input step. - image_width (`List`): + image_width (`list`): The widths of the reference images. Can be generated in input step. height (`int`): The height in pixels of the generated image. @@ -1009,11 +1009,11 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the image latents, used for RoPE calculation - txt_seq_lens (`List`): + txt_seq_lens (`list`): The sequence lengths of the prompt embeds, used for RoPE calculation - negative_txt_seq_lens (`List`): + negative_txt_seq_lens (`list`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ @@ -1123,11 +1123,11 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): mask for the negative text embeddings. Can be generated from text_encoder step. Outputs: - img_shapes (`List`): + img_shapes (`list`): The shapes of the image latents, used for RoPE calculation - txt_seq_lens (`List`): + txt_seq_lens (`list`): The sequence lengths of the prompt embeds, used for RoPE calculation - negative_txt_seq_lens (`List`): + negative_txt_seq_lens (`list`): The sequence lengths of the negative prompt embeds, used for RoPE calculation additional_t_cond (`Tensor`): The additional t cond, used for RoPE calculation @@ -1238,7 +1238,7 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
Outputs: - controlnet_keep (`List`): + controlnet_keep (`list`): The controlnet keep values """ diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 49183eed9cda..e4ccb6b8e047 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -191,7 +191,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks): step. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -268,7 +268,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ @@ -366,7 +366,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ @@ -436,12 +436,12 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): the generated image tensor from decoders step output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index 6724612361aa..de8ea05c5047 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -518,11 +518,11 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): The number of denoising steps. latents (`Tensor`): The initial latents to use for the denoising process. Can be generated in prepare_latent step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. Outputs: @@ -576,11 +576,11 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): The number of denoising steps. latents (`Tensor`): The initial latents to use for the denoising process. Can be generated in prepare_latent step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. mask (`Tensor`): The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. @@ -645,13 +645,13 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) - controlnet_keep (`List`): + controlnet_keep (`list`): The controlnet keep values. 
Can be generated in prepare_controlnet_inputs step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. Outputs: @@ -711,13 +711,13 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) - controlnet_keep (`List`): + controlnet_keep (`list`): The controlnet keep values. Can be generated in prepare_controlnet_inputs step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. mask (`Tensor`): The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. @@ -787,11 +787,11 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): The initial latents to use for the denoising process. Can be generated in prepare_latent step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. Outputs: @@ -846,11 +846,11 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): The initial latents to use for the denoising process. Can be generated in prepare_latent step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. mask (`Tensor`): The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. @@ -910,11 +910,11 @@ class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): The initial latents to use for the denoising process. Can be generated in prepare_latent step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc. - img_shapes (`List`): + img_shapes (`list`): The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. Outputs: diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 6abcf7ce215a..527267dc0d6e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -285,11 +285,11 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): image_resize_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images """ @@ -359,13 +359,13 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): image_resize_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 Outputs: - resized_image (`List`): + resized_image (`list`): The resized images """ @@ -452,13 +452,13 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): image_resize_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. Outputs: - resized_image (`List`): + resized_image (`list`): Images resized to 1024x1024 target area for VAE encoding - resized_cond_image (`List`): + resized_cond_image (`list`): Images resized to 384x384 target area for VL text encoding """ @@ -1058,7 +1058,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): Inputs: mask_image (`Image`): Mask image for inpainting. - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -1072,7 +1072,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay """ @@ -1177,7 +1177,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay """ @@ -1256,7 +1256,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): image_processor (`VaeImageProcessor`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -1340,7 +1340,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): image_processor (`VaeImageProcessor`) Inputs: - resized_image (`List`): + resized_image (`list`): The resized image. should be generated using a resize step Outputs: @@ -1412,7 +1412,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): image_processor (`VaeImageProcessor`) Inputs: - resized_image (`List`): + resized_image (`list`): The resized image. 
should be generated using a resize step Outputs: diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index ebe53940a4e5..faec7db245df 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -496,9 +496,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): image latents used to guide the image generation. Can be generated from vae_encoder step. Outputs: - image_height (`List`): + image_height (`list`): The image heights calculated from the image latents dimension - image_width (`List`): + image_width (`list`): The image widths calculated from the image latents dimension height (`int`): if not provided, updated to image height diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 7503e0c7684b..bf87028b2f90 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -119,7 +119,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Inputs: mask_image (`Image`): Mask image for inpainting. - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -135,7 +135,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): The latent representation of the input image. @@ -164,7 +164,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -476,9 +476,9 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -553,11 +553,11 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
@@ -632,11 +632,11 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -712,7 +712,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. @@ -720,7 +720,7 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -802,7 +802,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. @@ -812,7 +812,7 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -894,7 +894,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. @@ -904,7 +904,7 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): When to stop applying ControlNet. controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -1032,7 +1032,7 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. 
Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -1061,12 +1061,12 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -1113,10 +1113,14 @@ def description(self): class QwenImageAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`. - - to run the controlnet workflow, you need to provide `control_image` - - for text-to-image generation, all you need to provide is `prompt` + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `prompt`, `image` + - `inpainting`: requires `prompt`, `mask_image`, `image` + - `controlnet_text2image`: requires `prompt`, `control_image` + - `controlnet_image2image`: requires `prompt`, `image`, `control_image` + - `controlnet_inpainting`: requires `prompt`, `mask_image`, `image`, `control_image` Components: text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): @@ -1134,7 +1138,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Maximum sequence length for prompt encoding. mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Union[Image, List]`, *optional*): + image (`Image | list`, *optional*): Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. @@ -1160,9 +1164,9 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Pre-generated noisy latents for image generation. num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -1183,12 +1187,12 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Scale for ControlNet conditioning. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. """ @@ -1197,15 +1201,23 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): block_classes = AUTO_BLOCKS.values() block_names = AUTO_BLOCKS.keys() + # Workflow map defines the trigger conditions for each workflow. 
+ # How to define: + # - Only include required inputs and trigger inputs (inputs that determine which blocks run) + # - currently, only supports `True` means the workflow triggers when the input is not None + + _workflow_map = { + "text2image": {"prompt": True}, + "image2image": {"prompt": True, "image": True}, + "inpainting": {"prompt": True, "mask_image": True, "image": True}, + "controlnet_text2image": {"prompt": True, "control_image": True}, + "controlnet_image2image": {"prompt": True, "image": True, "control_image": True}, + "controlnet_inpainting": {"prompt": True, "mask_image": True, "image": True, "control_image": True}, + } + @property def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" - + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n" - + "- to run the controlnet workflow, you need to provide `control_image`\n" - + "- for text-to-image generation, all you need to provide is `prompt`" - ) + return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage." @property def outputs(self): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 21a7044c9f6e..37b80b69ec7e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -67,7 +67,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -75,7 +75,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images prompt_embeds (`Tensor`): The prompt embeddings. @@ -115,13 +115,13 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images processed_image (`Tensor`): The processed image @@ -156,7 +156,7 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. mask_image (`Image`): Mask image for inpainting. @@ -166,13 +166,13 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images processed_image (`Tensor`): The processed image processed_mask_image (`Tensor`): The processed mask image - mask_overlay_kwargs (`Dict`): + mask_overlay_kwargs (`dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): The latent representation of the input image. 
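A hedged usage sketch of the `_workflow_map` helpers defined above (`available_workflows` and `get_workflow`); the import path is an assumption based on this refactor.

```py
from diffusers.modular_pipelines import QwenImageAutoBlocks  # assumed import path

blocks = QwenImageAutoBlocks()
print(blocks.available_workflows)
# ['text2image', 'image2image', 'inpainting', 'controlnet_text2image',
#  'controlnet_image2image', 'controlnet_inpainting']

# get_workflow() feeds the workflow's trigger inputs into get_execution_blocks()
# and returns the SequentialPipelineBlocks that the workflow would run.
inpainting_blocks = blocks.get_workflow("inpainting")
print(inpainting_blocks)
```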
@@ -450,9 +450,9 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -526,11 +526,11 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -627,7 +627,7 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -656,12 +656,12 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -718,6 +718,11 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + + Supported workflows: + - `image_conditioned`: requires `prompt`, `image` + - `image_conditioned_inpainting`: requires `prompt`, `mask_image`, `image` + Components: image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae @@ -725,7 +730,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -751,28 +756,32 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Pre-generated noisy latents for image generation. num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. - mask_overlay_kwargs (`Dict`, *optional*): + mask_overlay_kwargs (`dict`, *optional*): The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. Outputs: - images (`List`): + images (`list`): Generated images. """ model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() + _workflow_map = { + "image_conditioned": {"prompt": True, "image": True}, + "image_conditioned_inpainting": {"prompt": True, "mask_image": True, "image": True}, + } @property def description(self): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 56652c94c4b0..4a1f418d7b45 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -58,7 +58,7 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -66,9 +66,9 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): + resized_image (`list`): Images resized to 1024x1024 target area for VAE encoding - resized_cond_image (`List`): + resized_cond_image (`list`): Images resized to 384x384 target area for VL text encoding prompt_embeds (`Tensor`): The prompt embeddings. @@ -108,15 +108,15 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): Images resized to 1024x1024 target area for VAE encoding - resized_cond_image (`List`): + resized_cond_image (`list`): Images resized to 384x384 target area for VL text encoding processed_image (`Tensor`): The processed image @@ -189,9 +189,9 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): The negative prompt embeddings. (batch-expanded) negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. (batch-expanded) - image_height (`List`): + image_height (`list`): The image heights calculated from the image latents dimension - image_width (`List`): + image_width (`list`): The image widths calculated from the image latents dimension height (`int`): if not provided, updated to image height @@ -253,9 +253,9 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
@@ -315,7 +315,7 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. (tensor output of the vae decoder.) """ @@ -357,7 +357,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): transformer (`QwenImageTransformer2DModel`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. prompt (`str`): The prompt or prompts to guide image generation. @@ -375,9 +375,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Pre-generated noisy latents for image generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -385,7 +385,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 43cefa5eb658..a10454f1fb0c 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -60,7 +60,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 @@ -74,7 +74,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Maximum sequence length for prompt encoding. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images prompt (`str`): The prompt or prompts to guide image generation. If not provided, updated using image caption @@ -117,7 +117,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): (`AutoencoderKLQwenImage`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 @@ -125,7 +125,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): Torch generator for deterministic generation. Outputs: - resized_image (`List`): + resized_image (`list`): The resized images processed_image (`Tensor`): The processed image @@ -250,9 +250,9 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. 
**denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -317,7 +317,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: - image (`Union[Image, List]`): + image (`Image | list`): Reference image(s) for denoising. Can be a single image or list of images. resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 @@ -339,9 +339,9 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Number of layers to extract from the image num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): + sigmas (`list`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): + attention_kwargs (`dict`, *optional*): Additional kwargs for attention processors. **denoiser_input_fields (`None`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. @@ -349,7 +349,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Output format: 'pil', 'np', 'pt'. Outputs: - images (`List`): + images (`list`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py index 59ec46dc6d36..44f1c555cef3 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/__init__.py @@ -21,21 +21,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["encoders"] = ["StableDiffusionXLTextEncoderStep"] - _import_structure["modular_blocks"] = [ - "ALL_BLOCKS", - "AUTO_BLOCKS", - "CONTROLNET_BLOCKS", - "IMAGE2IMAGE_BLOCKS", - "INPAINT_BLOCKS", - "IP_ADAPTER_BLOCKS", - "TEXT2IMAGE_BLOCKS", - "StableDiffusionXLAutoBlocks", - "StableDiffusionXLAutoControlnetStep", - "StableDiffusionXLAutoDecodeStep", - "StableDiffusionXLAutoIPAdapterStep", - "StableDiffusionXLAutoVaeEncoderStep", - ] + _import_structure["modular_blocks_stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks"] _import_structure["modular_pipeline"] = ["StableDiffusionXLModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -45,23 +31,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .encoders import ( - StableDiffusionXLTextEncoderStep, - ) - from .modular_blocks import ( - ALL_BLOCKS, - AUTO_BLOCKS, - CONTROLNET_BLOCKS, - IMAGE2IMAGE_BLOCKS, - INPAINT_BLOCKS, - IP_ADAPTER_BLOCKS, - TEXT2IMAGE_BLOCKS, - StableDiffusionXLAutoBlocks, - StableDiffusionXLAutoControlnetStep, - StableDiffusionXLAutoDecodeStep, - StableDiffusionXLAutoIPAdapterStep, - StableDiffusionXLAutoVaeEncoderStep, - ) + from .modular_blocks_stable_diffusion_xl import StableDiffusionXLAutoBlocks from .modular_pipeline import StableDiffusionXLModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py similarity index 55% rename from src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py rename to src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py index 68b5e33755b5..a7a18e514777 100644 --- 
a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_blocks_stable_diffusion_xl.py @@ -14,7 +14,7 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( StableDiffusionXLControlNetInputStep, StableDiffusionXLControlNetUnionInputStep, @@ -277,7 +277,161 @@ def description(self): # ip-adapter, controlnet, text2img, img2img, inpainting +# auto_docstring class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion + XL. + + Supported workflows: + - `text2image`: requires `prompt` + - `image2image`: requires `image`, `prompt` + - `inpainting`: requires `mask_image`, `image`, `prompt` + - `controlnet_text2image`: requires `control_image`, `prompt` + - `controlnet_image2image`: requires `control_image`, `image`, `prompt` + - `controlnet_inpainting`: requires `control_image`, `mask_image`, `image`, `prompt` + - `controlnet_union_text2image`: requires `control_image`, `control_mode`, `prompt` + - `controlnet_union_image2image`: requires `control_image`, `control_mode`, `image`, `prompt` + - `controlnet_union_inpainting`: requires `control_image`, `control_mode`, `mask_image`, `image`, `prompt` + - `ip_adapter_text2image`: requires `ip_adapter_image`, `prompt` + - `ip_adapter_image2image`: requires `ip_adapter_image`, `image`, `prompt` + - `ip_adapter_inpainting`: requires `ip_adapter_image`, `mask_image`, `image`, `prompt` + - `ip_adapter_controlnet_text2image`: requires `ip_adapter_image`, `control_image`, `prompt` + - `ip_adapter_controlnet_image2image`: requires `ip_adapter_image`, `control_image`, `image`, `prompt` + - `ip_adapter_controlnet_inpainting`: requires `ip_adapter_image`, `control_image`, `mask_image`, `image`, + `prompt` + - `ip_adapter_controlnet_union_text2image`: requires `ip_adapter_image`, `control_image`, `control_mode`, + `prompt` + - `ip_adapter_controlnet_union_image2image`: requires `ip_adapter_image`, `control_image`, `control_mode`, + `image`, `prompt` + - `ip_adapter_controlnet_union_inpainting`: requires `ip_adapter_image`, `control_image`, `control_mode`, + `mask_image`, `image`, `prompt` + + Components: + text_encoder (`CLIPTextModel`) text_encoder_2 (`CLIPTextModelWithProjection`) tokenizer (`CLIPTokenizer`) + tokenizer_2 (`CLIPTokenizer`) guider (`ClassifierFreeGuidance`) image_encoder + (`CLIPVisionModelWithProjection`) feature_extractor (`CLIPImageProcessor`) unet (`UNet2DConditionModel`) vae + (`AutoencoderKL`) image_processor (`VaeImageProcessor`) mask_processor (`VaeImageProcessor`) scheduler + (`EulerDiscreteScheduler`) controlnet (`ControlNetUnionModel`) control_image_processor (`VaeImageProcessor`) + + Configs: + force_zeros_for_empty_prompt (default: True) requires_aesthetics_score (default: False) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + prompt_2 (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + negative_prompt_2 (`None`, *optional*): + TODO: Add description. + cross_attention_kwargs (`None`, *optional*): + TODO: Add description. + clip_skip (`None`, *optional*): + TODO: Add description. 
+ ip_adapter_image (`Image | ndarray | Tensor | list | list | list`, *optional*): + The image(s) to be used as ip adapter + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image (`None`, *optional*): + TODO: Add description. + mask_image (`None`, *optional*): + TODO: Add description. + padding_mask_crop (`None`, *optional*): + TODO: Add description. + dtype (`dtype`, *optional*): + The dtype of the model inputs + generator (`None`, *optional*): + TODO: Add description. + preprocess_kwargs (`dict | NoneType`, *optional*): + A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under + `self.image_processor` in [diffusers.image_processor.VaeImageProcessor] + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + ip_adapter_embeds (`list`, *optional*): + Pre-generated image embeddings for IP-Adapter. Can be generated from ip_adapter step. + negative_ip_adapter_embeds (`list`, *optional*): + Pre-generated negative image embeddings for IP-Adapter. Can be generated from ip_adapter step. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + denoising_end (`None`, *optional*): + TODO: Add description. + strength (`None`, *optional*, defaults to 0.3): + TODO: Add description. + denoising_start (`None`, *optional*): + TODO: Add description. + latents (`None`): + TODO: Add description. + image_latents (`Tensor`, *optional*): + The latents representing the reference image for image-to-image/inpainting generation. Can be generated + in vae_encode step. + mask (`Tensor`, *optional*): + The mask for the inpainting generation. Can be generated in vae_encode step. + masked_image_latents (`Tensor`, *optional*): + The masked image latents for the inpainting generation (only for inpainting-specific unet). Can be + generated in vae_encode step. + original_size (`None`, *optional*): + TODO: Add description. + target_size (`None`, *optional*): + TODO: Add description. + negative_original_size (`None`, *optional*): + TODO: Add description. + negative_target_size (`None`, *optional*): + TODO: Add description. + crops_coords_top_left (`None`, *optional*, defaults to (0, 0)): + TODO: Add description. + negative_crops_coords_top_left (`None`, *optional*, defaults to (0, 0)): + TODO: Add description. + aesthetic_score (`None`, *optional*, defaults to 6.0): + TODO: Add description. + negative_aesthetic_score (`None`, *optional*, defaults to 2.0): + TODO: Add description. + control_image (`None`, *optional*): + TODO: Add description. + control_mode (`None`, *optional*): + TODO: Add description. + control_guidance_start (`None`, *optional*, defaults to 0.0): + TODO: Add description. + control_guidance_end (`None`, *optional*, defaults to 1.0): + TODO: Add description. + controlnet_conditioning_scale (`None`, *optional*, defaults to 1.0): + TODO: Add description. + guess_mode (`None`, *optional*, defaults to False): + TODO: Add description. + crops_coords (`tuple | NoneType`, *optional*): + The crop coordinates to use for preprocess/postprocess the image and mask, for inpainting task only. Can + be generated in vae_encode step. + controlnet_cond (`Tensor`, *optional*): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. 
+ conditioning_scale (`float`, *optional*): + The controlnet conditioning scale value to use for the denoising process. Can be generated in + prepare_controlnet_inputs step. + controlnet_keep (`list`, *optional*): + The controlnet keep values to use for the denoising process. Can be generated in + prepare_controlnet_inputs step. + **denoiser_input_fields (`None`, *optional*): + All conditional model inputs that need to be prepared with guider. It should contain + prompt_embeds/negative_prompt_embeds, add_time_ids/negative_add_time_ids, + pooled_prompt_embeds/negative_pooled_prompt_embeds, and ip_adapter_embeds/negative_ip_adapter_embeds + (optional).please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when + they are created and added to the pipeline state + eta (`None`, *optional*, defaults to 0.0): + TODO: Add description. + output_type (`None`, *optional*, defaults to pil): + TODO: Add description. + + Outputs: + images (`list`): + Generated images. + """ + block_classes = [ StableDiffusionXLTextEncoderStep, StableDiffusionXLAutoIPAdapterStep, @@ -293,103 +447,66 @@ class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks): "decode", ] - @property - def description(self): - return ( - "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL.\n" - + "- for image-to-image generation, you need to provide either `image` or `image_latents`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" - + "- to run the controlnet workflow, you need to provide `control_image`\n" - + "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n" - + "- to run the ip_adapter workflow, you need to provide `ip_adapter_image`\n" - + "- for text-to-image generation, all you need to provide is `prompt`" - ) - - -# controlnet (input + denoise step) -class StableDiffusionXLAutoControlnetStep(SequentialPipelineBlocks): - block_classes = [ - StableDiffusionXLAutoControlNetInputStep, - StableDiffusionXLAutoControlNetDenoiseStep, - ] - block_names = ["controlnet_input", "controlnet_denoise"] + _workflow_map = { + "text2image": {"prompt": True}, + "image2image": {"image": True, "prompt": True}, + "inpainting": {"mask_image": True, "image": True, "prompt": True}, + "controlnet_text2image": {"control_image": True, "prompt": True}, + "controlnet_image2image": {"control_image": True, "image": True, "prompt": True}, + "controlnet_inpainting": {"control_image": True, "mask_image": True, "image": True, "prompt": True}, + "controlnet_union_text2image": {"control_image": True, "control_mode": True, "prompt": True}, + "controlnet_union_image2image": {"control_image": True, "control_mode": True, "image": True, "prompt": True}, + "controlnet_union_inpainting": { + "control_image": True, + "control_mode": True, + "mask_image": True, + "image": True, + "prompt": True, + }, + "ip_adapter_text2image": {"ip_adapter_image": True, "prompt": True}, + "ip_adapter_image2image": {"ip_adapter_image": True, "image": True, "prompt": True}, + "ip_adapter_inpainting": {"ip_adapter_image": True, "mask_image": True, "image": True, "prompt": True}, + "ip_adapter_controlnet_text2image": {"ip_adapter_image": True, "control_image": True, "prompt": True}, + "ip_adapter_controlnet_image2image": { + "ip_adapter_image": True, + "control_image": True, + "image": True, + "prompt": True, + }, + "ip_adapter_controlnet_inpainting": { + "ip_adapter_image": 
True, + "control_image": True, + "mask_image": True, + "image": True, + "prompt": True, + }, + "ip_adapter_controlnet_union_text2image": { + "ip_adapter_image": True, + "control_image": True, + "control_mode": True, + "prompt": True, + }, + "ip_adapter_controlnet_union_image2image": { + "ip_adapter_image": True, + "control_image": True, + "control_mode": True, + "image": True, + "prompt": True, + }, + "ip_adapter_controlnet_union_inpainting": { + "ip_adapter_image": True, + "control_image": True, + "control_mode": True, + "mask_image": True, + "image": True, + "prompt": True, + }, + } @property def description(self): - return ( - "Controlnet auto step that prepare the controlnet input and denoise the latents. " - + "It works for both controlnet and controlnet_union and supports text2img, img2img and inpainting tasks." - + " (it should be replace at 'denoise' step)" - ) - - -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("input", StableDiffusionXLInputStep), - ("set_timesteps", StableDiffusionXLSetTimestepsStep), - ("prepare_latents", StableDiffusionXLPrepareLatentsStep), - ("prepare_add_cond", StableDiffusionXLPrepareAdditionalConditioningStep), - ("denoise", StableDiffusionXLDenoiseStep), - ("decode", StableDiffusionXLDecodeStep), - ] -) + return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL." -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("vae_encoder", StableDiffusionXLVaeEncoderStep), - ("input", StableDiffusionXLInputStep), - ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), - ("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep), - ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep), - ("denoise", StableDiffusionXLDenoiseStep), - ("decode", StableDiffusionXLDecodeStep), - ] -) - -INPAINT_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("vae_encoder", StableDiffusionXLInpaintVaeEncoderStep), - ("input", StableDiffusionXLInputStep), - ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep), - ("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep), - ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep), - ("denoise", StableDiffusionXLInpaintDenoiseStep), - ("decode", StableDiffusionXLInpaintDecodeStep), - ] -) - -CONTROLNET_BLOCKS = InsertableDict( - [ - ("denoise", StableDiffusionXLAutoControlnetStep), - ] -) - - -IP_ADAPTER_BLOCKS = InsertableDict( - [ - ("ip_adapter", StableDiffusionXLAutoIPAdapterStep), - ] -) - -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", StableDiffusionXLTextEncoderStep), - ("ip_adapter", StableDiffusionXLAutoIPAdapterStep), - ("vae_encoder", StableDiffusionXLAutoVaeEncoderStep), - ("denoise", StableDiffusionXLCoreDenoiseStep), - ("decode", StableDiffusionXLAutoDecodeStep), - ] -) - - -ALL_BLOCKS = { - "text2img": TEXT2IMAGE_BLOCKS, - "img2img": IMAGE2IMAGE_BLOCKS, - "inpaint": INPAINT_BLOCKS, - "controlnet": CONTROLNET_BLOCKS, - "ip_adapter": IP_ADAPTER_BLOCKS, - "auto": AUTO_BLOCKS, -} + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py index d01a86ca09b5..b641c6cd7fcc 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan.py @@ 
-14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanPrepareLatentsStep, WanSetTimestepsStep, @@ -37,7 +38,45 @@ # inputs(text) -> set_timesteps -> prepare_latents -> denoise +# auto_docstring class WanCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "wan" block_classes = [ WanTextInputStep, @@ -49,14 +88,11 @@ class WanCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `WanDenoiseStep` is used to denoise the latents\n" - ) + return "denoise block that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] # ==================== @@ -64,7 +100,51 @@ def description(self): # ==================== +# auto_docstring class WanBlocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for Wan2.1. + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer + (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) vae (`AutoencoderKLWan`) video_processor + (`VideoProcessor`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. 
+ attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan" block_classes = [ WanTextEncoderStep, @@ -75,9 +155,8 @@ class WanBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline blocks for Wan2.1.\n" - + "- `WanTextEncoderStep` is used to encode the text\n" - + "- `WanCoreDenoiseStep` is used to denoise the latents\n" - + "- `WanVaeDecoderStep` is used to decode the latents to images" - ) + return "Modular pipeline blocks for Wan2.1." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py index 21164422f3d9..9f602c24713b 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanPrepareLatentsStep, WanSetTimestepsStep, @@ -38,7 +39,50 @@ # inputs(text) -> set_timesteps -> prepare_latents -> denoise +# auto_docstring class Wan22CoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "wan" block_classes = [ WanTextInputStep, @@ -50,14 +94,11 @@ class Wan22CoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `Wan22DenoiseStep` is used to denoise the latents in wan2.2\n" - ) + return "denoise block that takes encoded conditions and runs the denoising process." 
+ + @property + def outputs(self): + return [OutputParam.template("latents")] # ==================== @@ -65,7 +106,55 @@ def description(self): # ==================== +# auto_docstring class Wan22Blocks(SequentialPipelineBlocks): + """ + Modular pipeline for text-to-video using Wan2.2. + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) transformer + (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider_2 (`ClassifierFreeGuidance`) + transformer_2 (`WanTransformer3DModel`) vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + num_frames (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan" block_classes = [ WanTextEncoderStep, @@ -80,9 +169,8 @@ class Wan22Blocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline for text-to-video using Wan2.2.\n" - + " - `WanTextEncoderStep` encodes the text\n" - + " - `Wan22CoreDenoiseStep` denoes the latents\n" - + " - `WanVaeDecoderStep` decodes the latents to video frames\n" - ) + return "Modular pipeline for text-to-video using Wan2.2." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py index 3db1c8fa837b..8e55b7a50f08 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan22_i2v.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanAdditionalInputsStep, WanPrepareLatentsStep, @@ -40,7 +41,36 @@ # ==================== +# auto_docstring class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks): + """ + Image2Video Vae Image Encoder step that resize the image and encode the first frame image to its latent + representation + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. 
+ + Outputs: + resized_image (`Image`): + TODO: Add description. + first_frame_latents (`Tensor`): + video latent representation with the first frame image condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep] block_names = ["image_resize", "vae_encoder", "prepare_first_frame_latents"] @@ -56,7 +86,52 @@ def description(self): # inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents) +# auto_docstring class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded text and image latent conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + num_frames (`None`, *optional*): + TODO: Add description. + image_condition_latents (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "wan-i2v" block_classes = [ WanTextInputStep, @@ -75,15 +150,11 @@ class Wan22Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `Wan22Image2VideoDenoiseStep` is used to denoise the latents in wan2.2\n" - ) + return "denoise block that takes encoded text and image latent conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] # ==================== @@ -91,7 +162,57 @@ def description(self): # ==================== +# auto_docstring class Wan22Image2VideoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline for image-to-video using Wan2.2. 
+ + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) vae + (`AutoencoderKLWan`) video_processor (`VideoProcessor`) transformer (`WanTransformer3DModel`) scheduler + (`UniPCMultistepScheduler`) guider_2 (`ClassifierFreeGuidance`) transformer_2 (`WanTransformer3DModel`) + + Configs: + boundary_ratio (default: 0.875): The boundary ratio to divide the denoising loop into high noise and low + noise stages. + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan-i2v" block_classes = [ WanTextEncoderStep, @@ -108,10 +229,8 @@ class Wan22Image2VideoBlocks(SequentialPipelineBlocks): @property def description(self): - return ( - "Modular pipeline for image-to-video using Wan2.2.\n" - + " - `WanTextEncoderStep` encodes the text\n" - + " - `WanImage2VideoVaeEncoderStep` encodes the image\n" - + " - `Wan22Image2VideoCoreDenoiseStep` denoes the latents\n" - + " - `WanVaeDecoderStep` decodes the latents to video frames\n" - ) + return "Modular pipeline for image-to-video using Wan2.2." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py index d07ab8ecf473..c08db62c469a 100644 --- a/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py +++ b/src/diffusers/modular_pipelines/wan/modular_blocks_wan_i2v.py @@ -14,6 +14,7 @@ from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam from .before_denoise import ( WanAdditionalInputsStep, WanPrepareLatentsStep, @@ -45,7 +46,29 @@ # wan2.1 I2V (first frame only) +# auto_docstring class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks): + """ + Image2Video Image Encoder step that resize the image and encode the image to generate the image embeddings + + Components: + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. 
+ image_embeds (`Tensor`): + The image embeddings + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanImageEncoderStep] block_names = ["image_resize", "image_encoder"] @@ -56,7 +79,34 @@ def description(self): # wan2.1 FLF2V (first and last frame) +# auto_docstring class WanFLF2VImageEncoderStep(SequentialPipelineBlocks): + """ + FLF2V Image Encoder step that resizes and encodes the first and last frame images to generate the image + embeddings + + Components: + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`): + The last frame image + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + image_embeds (`Tensor`): + The image embeddings + """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameImageEncoderStep] block_names = ["image_resize", "last_image_resize", "image_encoder"] @@ -67,7 +117,36 @@ def description(self): # wan2.1 Auto Image Encoder +# auto_docstring class WanAutoImageEncoderStep(AutoPipelineBlocks): + """ + Image Encoder step that encodes the image to generate the image embeddings. This is an auto pipeline block that works + for image2video tasks. - `WanFLF2VImageEncoderStep` (flf2v) is used when `last_image` is provided. - + `WanImage2VideoImageEncoderStep` (image2video) is used when `image` is provided. - if `last_image` or `image` is + not provided, step will be skipped. + + Components: + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) + + Inputs: + image (`Image`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`, *optional*): + The last frame image + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + image_embeds (`Tensor`): + The image embeddings + """ + block_classes = [WanFLF2VImageEncoderStep, WanImage2VideoImageEncoderStep] block_names = ["flf2v_image_encoder", "image2video_image_encoder"] block_trigger_inputs = ["last_image", "image"] @@ -90,7 +169,36 @@ def description(self): # wan2.1 I2V (first frame only) +# auto_docstring class WanImage2VideoVaeEncoderStep(SequentialPipelineBlocks): + """ + Image2Video Vae Image Encoder step that resizes the image and encodes the first frame image to its latent + representation + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + first_frame_latents (`Tensor`): + video latent representation with the first frame image condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description.
+ """ + model_name = "wan-i2v" block_classes = [WanImageResizeStep, WanVaeEncoderStep, WanPrepareFirstFrameLatentsStep] block_names = ["image_resize", "vae_encoder", "prepare_first_frame_latents"] @@ -101,7 +209,40 @@ def description(self): # wan2.1 FLF2V (first and last frame) +# auto_docstring class WanFLF2VVaeEncoderStep(SequentialPipelineBlocks): + """ + FLF2V Vae Image Encoder step that resize and encode and encode the first and last frame images to generate the + latent conditions + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`): + The last frameimage + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + first_last_frame_latents (`Tensor`): + video latent representation with the first and last frame images condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + """ + model_name = "wan-i2v" block_classes = [ WanImageResizeStep, @@ -117,7 +258,44 @@ def description(self): # wan2.1 Auto Vae Encoder +# auto_docstring class WanAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Vae Image Encoder step that encode the image to generate the image latentsThis is an auto pipeline block that works + for image2video tasks. - `WanFLF2VVaeEncoderStep` (flf2v) is used when `last_image` is provided. - + `WanImage2VideoVaeEncoderStep` (image2video) is used when `image` is provided. - if `last_image` or `image` is not + provided, step will be skipped. + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + image (`Image`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`, *optional*): + The last frameimage + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + + Outputs: + resized_image (`Image`): + TODO: Add description. + resized_last_image (`Image`): + TODO: Add description. + first_last_frame_latents (`Tensor`): + video latent representation with the first and last frame images condition + image_condition_latents (`Tensor | NoneType`): + TODO: Add description. + first_frame_latents (`Tensor`): + video latent representation with the first frame image condition + """ + model_name = "wan-i2v" block_classes = [WanFLF2VVaeEncoderStep, WanImage2VideoVaeEncoderStep] block_names = ["flf2v_vae_encoder", "image2video_vae_encoder"] @@ -141,7 +319,53 @@ def description(self): # wan2.1 I2V core denoise (support both I2V and FLF2V) # inputs (text + image_condition_latents) -> set_timesteps -> prepare_latents -> denoise (latents) +# auto_docstring class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded text and image latent conditions and runs the denoising process. + + Components: + transformer (`WanTransformer3DModel`) scheduler (`UniPCMultistepScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. 
+ prompt_embeds (`Tensor`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + num_frames (`None`, *optional*): + TODO: Add description. + image_condition_latents (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + image_embeds (`Tensor`): + TODO: Add description. + + Outputs: + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt + dtype (`dtype`): + Data type of model tensor inputs (determined by `transformer.dtype`) + latents (`Tensor`): + The initial latents to use for the denoising process + """ + model_name = "wan-i2v" block_classes = [ WanTextInputStep, @@ -160,15 +384,7 @@ class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): - return ( - "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `WanTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `WanAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n" - + " - `WanSetTimestepsStep` is used to set the timesteps\n" - + " - `WanPrepareLatentsStep` is used to prepare the latents\n" - + " - `WanImage2VideoDenoiseStep` is used to denoise the latents\n" - ) + return "denoise block that takes encoded text and image latent conditions and runs the denoising process." # ==================== @@ -177,7 +393,64 @@ def description(self): # wan2.1 Image2Video Auto Blocks +# auto_docstring class WanImage2VideoAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for image-to-video using Wan. + + Supported workflows: + - `image2video`: requires `image`, `prompt` + - `flf2v`: requires `last_image`, `image`, `prompt` + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) + image_processor (`CLIPImageProcessor`) image_encoder (`CLIPVisionModel`) vae (`AutoencoderKLWan`) + video_processor (`VideoProcessor`) transformer (`WanTransformer3DModel`) scheduler + (`UniPCMultistepScheduler`) + + Inputs: + prompt (`None`, *optional*): + TODO: Add description. + negative_prompt (`None`, *optional*): + TODO: Add description. + max_sequence_length (`None`, *optional*, defaults to 512): + TODO: Add description. + image (`Image`, *optional*): + TODO: Add description. + height (`int`, *optional*, defaults to 480): + TODO: Add description. + width (`int`, *optional*, defaults to 832): + TODO: Add description. + last_image (`Image`, *optional*): + The last frameimage + num_frames (`int`, *optional*, defaults to 81): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_videos_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + image_condition_latents (`None`, *optional*): + TODO: Add description. 
+ num_inference_steps (`None`, *optional*, defaults to 50): + TODO: Add description. + timesteps (`None`, *optional*): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + attention_kwargs (`None`, *optional*): + TODO: Add description. + image_embeds (`Tensor`): + TODO: Add description. + output_type (`str`, *optional*, defaults to np): + The output type of the decoded videos + + Outputs: + videos (`list`): + The generated videos. + """ + model_name = "wan-i2v" block_classes = [ WanTextEncoderStep, @@ -194,10 +467,15 @@ class WanImage2VideoAutoBlocks(SequentialPipelineBlocks): "decode", ] + _workflow_map = { + "image2video": {"image": True, "prompt": True}, + "flf2v": {"last_image": True, "image": True, "prompt": True}, + } + @property def description(self): - return ( - "Auto Modular pipeline for image-to-video using Wan.\n" - + "- for I2V workflow, all you need to provide is `image`" - + "- for FLF2V workflow, all you need to provide is `last_image` and `image`" - ) + return "Auto Modular pipeline for image-to-video using Wan." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/z_image/__init__.py b/src/diffusers/modular_pipelines/z_image/__init__.py index c8a8c14396c0..5c04008d3305 100644 --- a/src/diffusers/modular_pipelines/z_image/__init__.py +++ b/src/diffusers/modular_pipelines/z_image/__init__.py @@ -21,12 +21,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["decoders"] = ["ZImageVaeDecoderStep"] - _import_structure["encoders"] = ["ZImageTextEncoderStep", "ZImageVaeImageEncoderStep"] - _import_structure["modular_blocks"] = [ - "ALL_BLOCKS", - "ZImageAutoBlocks", - ] + _import_structure["modular_blocks_z_image"] = ["ZImageAutoBlocks"] _import_structure["modular_pipeline"] = ["ZImageModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -36,12 +31,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .decoders import ZImageVaeDecoderStep - from .encoders import ZImageTextEncoderStep - from .modular_blocks import ( - ALL_BLOCKS, - ZImageAutoBlocks, - ) + from .modular_blocks_z_image import ZImageAutoBlocks from .modular_pipeline import ZImageModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks.py b/src/diffusers/modular_pipelines/z_image/modular_blocks.py deleted file mode 100644 index a54baeccaf0c..000000000000 --- a/src/diffusers/modular_pipelines/z_image/modular_blocks.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from ...utils import logging -from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict -from .before_denoise import ( - ZImageAdditionalInputsStep, - ZImagePrepareLatentsStep, - ZImagePrepareLatentswithImageStep, - ZImageSetTimestepsStep, - ZImageSetTimestepsWithStrengthStep, - ZImageTextInputStep, -) -from .decoders import ZImageVaeDecoderStep -from .denoise import ( - ZImageDenoiseStep, -) -from .encoders import ( - ZImageTextEncoderStep, - ZImageVaeImageEncoderStep, -) - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# z-image -# text2image -class ZImageCoreDenoiseStep(SequentialPipelineBlocks): - block_classes = [ - ZImageTextInputStep, - ZImagePrepareLatentsStep, - ZImageSetTimestepsStep, - ZImageDenoiseStep, - ] - block_names = ["input", "prepare_latents", "set_timesteps", "denoise"] - - @property - def description(self): - return ( - "denoise block that takes encoded conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n" - + " - `ZImageSetTimestepsStep` is used to set the timesteps\n" - + " - `ZImageDenoiseStep` is used to denoise the latents\n" - ) - - -# z-image: image2image -## denoise -class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks): - block_classes = [ - ZImageTextInputStep, - ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]), - ZImagePrepareLatentsStep, - ZImageSetTimestepsStep, - ZImageSetTimestepsWithStrengthStep, - ZImagePrepareLatentswithImageStep, - ZImageDenoiseStep, - ] - block_names = [ - "input", - "additional_inputs", - "prepare_latents", - "set_timesteps", - "set_timesteps_with_strength", - "prepare_latents_with_image", - "denoise", - ] - - @property - def description(self): - return ( - "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n" - + "This is a sequential pipeline blocks:\n" - + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n" - + " - `ZImageAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n" - + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n" - + " - `ZImageSetTimestepsStep` is used to set the timesteps\n" - + " - `ZImageSetTimestepsWithStrengthStep` is used to set the timesteps with strength\n" - + " - `ZImagePrepareLatentswithImageStep` is used to prepare the latents with image\n" - + " - `ZImageDenoiseStep` is used to denoise the latents\n" - ) - - -## auto blocks -class ZImageAutoDenoiseStep(AutoPipelineBlocks): - block_classes = [ - ZImageImage2ImageCoreDenoiseStep, - ZImageCoreDenoiseStep, - ] - block_names = ["image2image", "text2image"] - block_trigger_inputs = ["image_latents", None] - - @property - def description(self) -> str: - return ( - "Denoise step that iteratively denoise the latents. " - "This is a auto pipeline block that works for text2image and image2image tasks." - " - `ZImageCoreDenoiseStep` (text2image) for text2image tasks." - " - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks." 
- + " - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.\n" - + " - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.\n" - ) - - -class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks): - block_classes = [ZImageVaeImageEncoderStep] - block_names = ["vae_encoder"] - block_trigger_inputs = ["image"] - - @property - def description(self) -> str: - return "Vae Image Encoder step that encode the image to generate the image latents" - +"This is an auto pipeline block that works for image2image tasks." - +" - `ZImageVaeImageEncoderStep` is used when `image` is provided." - +" - if `image` is not provided, step will be skipped." - - -class ZImageAutoBlocks(SequentialPipelineBlocks): - block_classes = [ - ZImageTextEncoderStep, - ZImageAutoVaeImageEncoderStep, - ZImageAutoDenoiseStep, - ZImageVaeDecoderStep, - ] - block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] - - @property - def description(self) -> str: - return "Auto Modular pipeline for text-to-image and image-to-image using ZImage.\n" - +" - for text-to-image generation, all you need to provide is `prompt`\n" - +" - for image-to-image generation, you need to provide `image`\n" - +" - if `image` is not provided, step will be skipped." - - -# presets -TEXT2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", ZImageTextEncoderStep), - ("input", ZImageTextInputStep), - ("prepare_latents", ZImagePrepareLatentsStep), - ("set_timesteps", ZImageSetTimestepsStep), - ("denoise", ZImageDenoiseStep), - ("decode", ZImageVaeDecoderStep), - ] -) - -IMAGE2IMAGE_BLOCKS = InsertableDict( - [ - ("text_encoder", ZImageTextEncoderStep), - ("vae_encoder", ZImageVaeImageEncoderStep), - ("input", ZImageTextInputStep), - ("additional_inputs", ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"])), - ("prepare_latents", ZImagePrepareLatentsStep), - ("set_timesteps", ZImageSetTimestepsStep), - ("set_timesteps_with_strength", ZImageSetTimestepsWithStrengthStep), - ("prepare_latents_with_image", ZImagePrepareLatentswithImageStep), - ("denoise", ZImageDenoiseStep), - ("decode", ZImageVaeDecoderStep), - ] -) - - -AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", ZImageTextEncoderStep), - ("vae_encoder", ZImageAutoVaeImageEncoderStep), - ("denoise", ZImageAutoDenoiseStep), - ("decode", ZImageVaeDecoderStep), - ] -) - -ALL_BLOCKS = { - "text2image": TEXT2IMAGE_BLOCKS, - "image2image": IMAGE2IMAGE_BLOCKS, - "auto": AUTO_BLOCKS, -} diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py new file mode 100644 index 000000000000..23e20d55fb1e --- /dev/null +++ b/src/diffusers/modular_pipelines/z_image/modular_blocks_z_image.py @@ -0,0 +1,334 @@ +# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam +from .before_denoise import ( + ZImageAdditionalInputsStep, + ZImagePrepareLatentsStep, + ZImagePrepareLatentswithImageStep, + ZImageSetTimestepsStep, + ZImageSetTimestepsWithStrengthStep, + ZImageTextInputStep, +) +from .decoders import ZImageVaeDecoderStep +from .denoise import ( + ZImageDenoiseStep, +) +from .encoders import ( + ZImageTextEncoderStep, + ZImageVaeImageEncoderStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# ==================== +# 1. DENOISE +# ==================== + + +# text2image: inputs(text) -> set_timesteps -> prepare_latents -> denoise +# auto_docstring +class ZImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded conditions and runs the denoising process. + + Components: + transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`list`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`list`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + TODO: Add description. + width (`int`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. + num_inference_steps (`None`, *optional*, defaults to 9): + TODO: Add description. + sigmas (`None`, *optional*): + TODO: Add description. + **denoiser_input_fields (`None`, *optional*): + The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + block_classes = [ + ZImageTextInputStep, + ZImagePrepareLatentsStep, + ZImageSetTimestepsStep, + ZImageDenoiseStep, + ] + block_names = ["input", "prepare_latents", "set_timesteps", "denoise"] + + @property + def description(self): + return "denoise block that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + +# image2image: inputs(text + image_latents) -> prepare_latents -> set_timesteps -> set_timesteps_with_strength -> prepare_latents_with_image -> denoise +# auto_docstring +class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + denoise block that takes encoded text and image latent conditions and runs the denoising process. + + Components: + transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_images_per_prompt (`None`, *optional*, defaults to 1): + TODO: Add description. + prompt_embeds (`list`): + Pre-generated text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`list`, *optional*): + Pre-generated negative text embeddings. Can be generated from text_encoder step. + height (`None`, *optional*): + TODO: Add description. + width (`None`, *optional*): + TODO: Add description. + image_latents (`None`, *optional*): + TODO: Add description. + latents (`Tensor | NoneType`, *optional*): + TODO: Add description. + generator (`None`, *optional*): + TODO: Add description. 
+        num_inference_steps (`None`, *optional*, defaults to 9):
+            TODO: Add description.
+        sigmas (`None`, *optional*):
+            TODO: Add description.
+        strength (`None`, *optional*, defaults to 0.6):
+            TODO: Add description.
+        **denoiser_input_fields (`None`, *optional*):
+            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
+    block_classes = [
+        ZImageTextInputStep,
+        ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        ZImagePrepareLatentsStep,
+        ZImageSetTimestepsStep,
+        ZImageSetTimestepsWithStrengthStep,
+        ZImagePrepareLatentswithImageStep,
+        ZImageDenoiseStep,
+    ]
+    block_names = [
+        "input",
+        "additional_inputs",
+        "prepare_latents",
+        "set_timesteps",
+        "set_timesteps_with_strength",
+        "prepare_latents_with_image",
+        "denoise",
+    ]
+
+    @property
+    def description(self):
+        return "denoise block that takes encoded text and image latent conditions and runs the denoising process."
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("latents")]
+
+
+# auto_docstring
+class ZImageAutoDenoiseStep(AutoPipelineBlocks):
+    """
+    Denoise step that iteratively denoises the latents. This is an auto pipeline block that works for text2image
+    and image2image tasks.
+    - `ZImageCoreDenoiseStep` (text2image) for text2image tasks.
+    - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks.
+    - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.
+    - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.
+
+    Components:
+        transformer (`ZImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`)
+
+    Inputs:
+        num_images_per_prompt (`None`, *optional*, defaults to 1):
+            TODO: Add description.
+        prompt_embeds (`list`):
+            Pre-generated text embeddings. Can be generated from text_encoder step.
+        negative_prompt_embeds (`list`, *optional*):
+            Pre-generated negative text embeddings. Can be generated from text_encoder step.
+        height (`None`, *optional*):
+            TODO: Add description.
+        width (`None`, *optional*):
+            TODO: Add description.
+        image_latents (`None`, *optional*):
+            TODO: Add description.
+        latents (`Tensor | NoneType`):
+            TODO: Add description.
+        generator (`None`, *optional*):
+            TODO: Add description.
+        num_inference_steps (`None`):
+            TODO: Add description.
+        sigmas (`None`, *optional*):
+            TODO: Add description.
+        strength (`None`, *optional*, defaults to 0.6):
+            TODO: Add description.
+        **denoiser_input_fields (`None`, *optional*):
+            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents.
+    """
+
+    block_classes = [
+        ZImageImage2ImageCoreDenoiseStep,
+        ZImageCoreDenoiseStep,
+    ]
+    block_names = ["image2image", "text2image"]
+    block_trigger_inputs = ["image_latents", None]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoises the latents. "
+            "This is an auto pipeline block that works for text2image and image2image tasks.\n"
+            " - `ZImageCoreDenoiseStep` (text2image) for text2image tasks.\n"
+            " - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks.\n"
+            " - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.\n"
+            " - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.\n"
+        )
+
+
+# auto_docstring
+class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
+    """
+    Vae Image Encoder step that encodes the image to generate the image latents.
+    This is an auto pipeline block that works for image2image tasks.
+    - `ZImageVaeImageEncoderStep` is used when `image` is provided.
+    - if `image` is not provided, this step will be skipped.
+
+    Components:
+        vae (`AutoencoderKL`) image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        image (`Image`, *optional*):
+            TODO: Add description.
+        height (`None`, *optional*):
+            TODO: Add description.
+        width (`None`, *optional*):
+            TODO: Add description.
+        generator (`None`, *optional*):
+            TODO: Add description.
+
+    Outputs:
+        image_latents (`Tensor`):
+            latent representation of the encoded input image
+    """
+
+    block_classes = [ZImageVaeImageEncoderStep]
+    block_names = ["vae_encoder"]
+    block_trigger_inputs = ["image"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Vae Image Encoder step that encodes the image to generate the image latents.\n"
+            "This is an auto pipeline block that works for image2image tasks.\n"
+            " - `ZImageVaeImageEncoderStep` is used when `image` is provided.\n"
+            " - if `image` is not provided, this step will be skipped."
+        )
+
+
+# auto_docstring
+class ZImageAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for text-to-image and image-to-image using ZImage.
+
+    Supported workflows:
+    - `text2image`: requires `prompt`
+    - `image2image`: requires `image`, `prompt`
+
+    Components:
+        text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) guider (`ClassifierFreeGuidance`) vae
+        (`AutoencoderKL`) image_processor (`VaeImageProcessor`) transformer (`ZImageTransformer2DModel`) scheduler
+        (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        prompt (`None`, *optional*):
+            TODO: Add description.
+        negative_prompt (`None`, *optional*):
+            TODO: Add description.
+        max_sequence_length (`None`, *optional*, defaults to 512):
+            TODO: Add description.
+        image (`Image`, *optional*):
+            TODO: Add description.
+        height (`None`, *optional*):
+            TODO: Add description.
+        width (`None`, *optional*):
+            TODO: Add description.
+        generator (`None`, *optional*):
+            TODO: Add description.
+        num_images_per_prompt (`None`, *optional*, defaults to 1):
+            TODO: Add description.
+        image_latents (`None`, *optional*):
+            TODO: Add description.
+        latents (`Tensor | NoneType`):
+            TODO: Add description.
+        num_inference_steps (`None`):
+            TODO: Add description.
+        sigmas (`None`, *optional*):
+            TODO: Add description.
+        strength (`None`, *optional*, defaults to 0.6):
+            TODO: Add description.
+        **denoiser_input_fields (`None`, *optional*):
+            The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        output_type (`str`, *optional*, defaults to pil):
+            The type of the output images, can be 'pil', 'np', 'pt'
+
+    Outputs:
+        images (`list`):
+            Generated images.
+    """
+
+    block_classes = [
+        ZImageTextEncoderStep,
+        ZImageAutoVaeImageEncoderStep,
+        ZImageAutoDenoiseStep,
+        ZImageVaeDecoderStep,
+    ]
+    block_names = ["text_encoder", "vae_encoder", "denoise", "decode"]
+    _workflow_map = {
+        "text2image": {"prompt": True},
+        "image2image": {"image": True, "prompt": True},
+    }
+
+    @property
+    def description(self) -> str:
+        return "Auto Modular pipeline for text-to-image and image-to-image using ZImage."
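+
+    # Usage sketch: the presets declared in `_workflow_map` above can be retrieved with
+    # `get_workflow`, and at runtime the auto denoise step switches to the image2image path
+    # whenever `image` (and therefore `image_latents`) is passed. The repo id below is a
+    # placeholder, not a real checkpoint.
+    #
+    #   blocks = ZImageAutoBlocks()
+    #   text2image_blocks = blocks.get_workflow("text2image")    # prompt-only preset
+    #   image2image_blocks = blocks.get_workflow("image2image")  # preset that also expects `image`
+    #   pipe = blocks.init_pipeline("<org>/<z-image-modular-repo>")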
+ + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py index 854b5218c617..9a6b4b9b6fb4 100644 --- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py +++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py @@ -33,6 +33,19 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin +FLUX_TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "FluxTextEncoderStep"), + ("denoise.input", "FluxTextInputStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxDenoiseStep"), + ("decode", "FluxDecodeStep"), + ] +} + + class TestFluxModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxModularPipeline pipeline_blocks_class = FluxAutoBlocks @@ -40,6 +53,7 @@ class TestFluxModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX_TEXT2IMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -59,6 +73,23 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) +FLUX_IMAGE2IMAGE_WORKFLOWS = { + "image2image": [ + ("text_encoder", "FluxTextEncoderStep"), + ("vae_encoder.preprocess", "FluxProcessImagesInputStep"), + ("vae_encoder.encode", "FluxVaeEncoderStep"), + ("denoise.input.text_inputs", "FluxTextInputStep"), + ("denoise.input.additional_inputs", "FluxAdditionalInputsStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_img2img_latents", "FluxImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxDenoiseStep"), + ("decode", "FluxDecodeStep"), + ] +} + + class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxModularPipeline pipeline_blocks_class = FluxAutoBlocks @@ -66,6 +97,7 @@ class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX_IMAGE2IMAGE_WORKFLOWS def get_pipeline(self, components_manager=None, torch_dtype=torch.float32): pipeline = super().get_pipeline(components_manager, torch_dtype) @@ -125,6 +157,32 @@ def test_float16_inference(self): super().test_float16_inference(8e-2) +FLUX_KONTEXT_WORKFLOWS = { + "text2image": [ + ("text_encoder", "FluxTextEncoderStep"), + ("denoise.input", "FluxTextInputStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxRoPEInputsStep"), + ("denoise.denoise", "FluxKontextDenoiseStep"), + ("decode", "FluxDecodeStep"), + ], + "image_conditioned": [ + ("text_encoder", "FluxTextEncoderStep"), + ("vae_encoder.preprocess", "FluxKontextProcessImagesInputStep"), + ("vae_encoder.encode", "FluxVaeEncoderStep"), + ("denoise.input.set_resolution", "FluxKontextSetResolutionStep"), + ("denoise.input.text_inputs", "FluxTextInputStep"), + 
("denoise.input.additional_inputs", "FluxKontextAdditionalInputsStep"), + ("denoise.before_denoise.prepare_latents", "FluxPrepareLatentsStep"), + ("denoise.before_denoise.set_timesteps", "FluxSetTimestepsStep"), + ("denoise.before_denoise.prepare_rope_inputs", "FluxKontextRoPEInputsStep"), + ("denoise.denoise", "FluxKontextDenoiseStep"), + ("decode", "FluxDecodeStep"), + ], +} + + class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = FluxKontextModularPipeline pipeline_blocks_class = FluxKontextAutoBlocks @@ -132,6 +190,7 @@ class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX_KONTEXT_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py index 8fd529e97e71..3045af636841 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py @@ -28,6 +28,21 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin +FLUX2_TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "Flux2TextEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_guidance", "Flux2PrepareGuidanceStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2DenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2ModularPipeline pipeline_blocks_class = Flux2AutoBlocks @@ -35,6 +50,7 @@ class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX2_TEXT2IMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -56,6 +72,24 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) +FLUX2_IMAGE_CONDITIONED_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder", "Flux2TextEncoderStep"), + ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"), + ("vae_encoder.encode", "Flux2VaeEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_guidance", "Flux2PrepareGuidanceStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2DenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2ModularPipeline pipeline_blocks_class = Flux2AutoBlocks @@ -63,6 +97,7 @@ class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width", "guidance_scale", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX2_IMAGE_CONDITIONED_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = 
self.get_generator(seed) diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py index 26653b20f8c4..ad295a961357 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py @@ -28,13 +28,28 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin -class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): +FLUX2_KLEIN_WORKFLOWS = { + "text2image": [ + ("text_encoder", "Flux2KleinTextEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2KleinDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2KleinModularPipeline pipeline_blocks_class = Flux2KleinAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular" params = frozenset(["prompt", "height", "width"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX2_KLEIN_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -55,13 +70,31 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) -class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): +FLUX2_KLEIN_IMAGE_CONDITIONED_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder", "Flux2KleinTextEncoderStep"), + ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"), + ("vae_encoder.encode", "Flux2VaeEncoderStep"), + ("denoise.input", "Flux2TextInputStep"), + ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2RoPEInputsStep"), + ("denoise.denoise", "Flux2KleinDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinImageConditionedModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = Flux2KleinModularPipeline pipeline_blocks_class = Flux2KleinAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular" params = frozenset(["prompt", "height", "width", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX2_KLEIN_IMAGE_CONDITIONED_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py index 701dd0fed896..b3aa79040317 100644 --- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py @@ -21,20 +21,35 @@ from diffusers.modular_pipelines import ( Flux2KleinBaseAutoBlocks, - Flux2KleinModularPipeline, + Flux2KleinBaseModularPipeline, ) from ...testing_utils import floats_tensor, torch_device from ..test_modular_pipelines_common import ModularPipelineTesterMixin -class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): - pipeline_class = Flux2KleinModularPipeline +FLUX2_KLEIN_BASE_WORKFLOWS = { + "text2image": [ + 
("text_encoder", "Flux2KleinBaseTextEncoderStep"), + ("denoise.input", "Flux2KleinBaseTextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2KleinBaseRoPEInputsStep"), + ("denoise.denoise", "Flux2KleinBaseDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinBaseModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinBaseModularPipeline pipeline_blocks_class = Flux2KleinBaseAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular" params = frozenset(["prompt", "height", "width"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = FLUX2_KLEIN_BASE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -55,13 +70,31 @@ def test_float16_inference(self): super().test_float16_inference(9e-2) -class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): - pipeline_class = Flux2KleinModularPipeline +FLUX2_KLEIN_BASE_IMAGE_CONDITIONED_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder", "Flux2KleinBaseTextEncoderStep"), + ("vae_encoder.preprocess", "Flux2ProcessImagesInputStep"), + ("vae_encoder.encode", "Flux2VaeEncoderStep"), + ("denoise.input", "Flux2KleinBaseTextInputStep"), + ("denoise.prepare_latents", "Flux2PrepareLatentsStep"), + ("denoise.prepare_image_latents", "Flux2PrepareImageLatentsStep"), + ("denoise.set_timesteps", "Flux2SetTimestepsStep"), + ("denoise.prepare_rope_inputs", "Flux2KleinBaseRoPEInputsStep"), + ("denoise.denoise", "Flux2KleinBaseDenoiseStep"), + ("denoise.after_denoise", "Flux2UnpackLatentsStep"), + ("decode", "Flux2DecodeStep"), + ], +} + + +class TestFlux2KleinBaseImageConditionedModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinBaseModularPipeline pipeline_blocks_class = Flux2KleinBaseAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular" params = frozenset(["prompt", "height", "width", "image"]) batch_params = frozenset(["prompt", "image"]) + expected_workflow_blocks = FLUX2_KLEIN_BASE_IMAGE_CONDITIONED_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py index f4bd27b7ea47..92573c202e49 100644 --- a/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py +++ b/tests/modular_pipelines/qwen/test_modular_pipeline_qwenimage.py @@ -30,6 +30,103 @@ from ..test_modular_pipelines_common import ModularGuiderTesterMixin, ModularPipelineTesterMixin +QWEN_IMAGE_TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("denoise.input", "QwenImageTextInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.denoise", "QwenImageDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "image2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", 
"QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_img2img_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.denoise", "QwenImageDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "inpainting": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageInpaintProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.denoise", "QwenImageInpaintDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"), + ], + "controlnet_text2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"), + ("denoise.input", "QwenImageTextInputsStep"), + ("denoise.controlnet_input", "QwenImageControlNetInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"), + ("denoise.controlnet_denoise", "QwenImageControlNetDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "controlnet_image2image": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.controlnet_input", "QwenImageControlNetInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_img2img_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"), + ("denoise.controlnet_denoise", "QwenImageControlNetDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "controlnet_inpainting": [ + ("text_encoder", "QwenImageTextEncoderStep"), + ("vae_encoder.preprocess", "QwenImageInpaintProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), 
+ ("controlnet_vae_encoder", "QwenImageControlNetVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.controlnet_input", "QwenImageControlNetInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"), + ("denoise.prepare_rope_inputs", "QwenImageRoPEInputsStep"), + ("denoise.controlnet_before_denoise", "QwenImageControlNetBeforeDenoiserStep"), + ("denoise.controlnet_denoise", "QwenImageInpaintControlNetDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"), + ], +} + + class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin): pipeline_class = QwenImageModularPipeline pipeline_blocks_class = QwenImageAutoBlocks @@ -37,6 +134,7 @@ class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuider params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"]) batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"]) + expected_workflow_blocks = QWEN_IMAGE_TEXT2IMAGE_WORKFLOWS def get_dummy_inputs(self): generator = self.get_generator() @@ -56,6 +154,44 @@ def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=5e-4) +QWEN_IMAGE_EDIT_WORKFLOWS = { + "image_conditioned": [ + ("text_encoder.resize", "QwenImageEditResizeStep"), + ("text_encoder.encode", "QwenImageEditTextEncoderStep"), + ("vae_encoder.resize", "QwenImageEditResizeStep"), + ("vae_encoder.preprocess", "QwenImageEditProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsStep"), + ("denoise.prepare_rope_inputs", "QwenImageEditRoPEInputsStep"), + ("denoise.denoise", "QwenImageEditDenoiseStep"), + ("denoise.after_denoise", "QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageProcessImagesOutputStep"), + ], + "image_conditioned_inpainting": [ + ("text_encoder.resize", "QwenImageEditResizeStep"), + ("text_encoder.encode", "QwenImageEditTextEncoderStep"), + ("vae_encoder.resize", "QwenImageEditResizeStep"), + ("vae_encoder.preprocess", "QwenImageEditInpaintProcessImagesInputStep"), + ("vae_encoder.encode", "QwenImageVaeEncoderStep"), + ("denoise.input.text_inputs", "QwenImageTextInputsStep"), + ("denoise.input.additional_inputs", "QwenImageAdditionalInputsStep"), + ("denoise.prepare_latents", "QwenImagePrepareLatentsStep"), + ("denoise.set_timesteps", "QwenImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.add_noise_to_latents", "QwenImagePrepareLatentsWithStrengthStep"), + ("denoise.prepare_inpaint_latents.create_mask_latents", "QwenImageCreateMaskLatentsStep"), + ("denoise.prepare_rope_inputs", "QwenImageEditRoPEInputsStep"), + ("denoise.denoise", "QwenImageEditInpaintDenoiseStep"), + ("denoise.after_denoise", 
"QwenImageAfterDenoiseStep"), + ("decode.decode", "QwenImageDecoderStep"), + ("decode.postprocess", "QwenImageInpaintProcessImagesOutputStep"), + ], +} + + class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin): pipeline_class = QwenImageEditModularPipeline pipeline_blocks_class = QwenImageEditAutoBlocks @@ -63,6 +199,7 @@ class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGu params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"]) batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"]) + expected_workflow_blocks = QWEN_IMAGE_EDIT_WORKFLOWS def get_dummy_inputs(self): generator = self.get_generator() diff --git a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py index 7b55933e4caf..f640f0ec83f2 100644 --- a/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py +++ b/tests/modular_pipelines/stable_diffusion_xl/test_modular_pipeline_stable_diffusion_xl.py @@ -267,6 +267,60 @@ def test_controlnet_cfg(self): assert max_diff > 1e-2, "Output with CFG must be different from normal inference" +TEXT2IMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_union_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), + ("decode", 
"StableDiffusionXLDecodeStep"), + ], + "ip_adapter_controlnet_text2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], +} + + class TestSDXLModularPipelineFast( SDXLModularTesterMixin, SDXLModularIPAdapterTesterMixin, @@ -291,6 +345,8 @@ class TestSDXLModularPipelineFast( batch_params = frozenset(["prompt", "negative_prompt"]) expected_image_output_shape = (1, 3, 64, 64) + expected_workflow_blocks = TEXT2IMAGE_WORKFLOWS + def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) inputs = { @@ -314,6 +370,65 @@ def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) +IMAGE2IMAGE_WORKFLOWS = { + "image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "controlnet_union_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", 
"StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.denoise", "StableDiffusionXLDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], + "ip_adapter_controlnet_image2image": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLVaeEncoderStep"), + ("denoise.input", "StableDiffusionXLInputStep"), + ("denoise.before_denoise.set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("denoise.before_denoise.prepare_latents", "StableDiffusionXLImg2ImgPrepareLatentsStep"), + ("denoise.before_denoise.prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise.controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise.denoise", "StableDiffusionXLControlNetDenoiseStep"), + ("decode", "StableDiffusionXLDecodeStep"), + ], +} + + class TestSDXLImg2ImgModularPipelineFast( SDXLModularTesterMixin, SDXLModularIPAdapterTesterMixin, @@ -338,6 +453,7 @@ class TestSDXLImg2ImgModularPipelineFast( ) batch_params = frozenset(["prompt", "negative_prompt", "image"]) expected_image_output_shape = (1, 3, 64, 64) + expected_workflow_blocks = IMAGE2IMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed) @@ -367,6 +483,65 @@ def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) +INPAINTING_WORKFLOWS = { + "inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLInpaintDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "controlnet_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "controlnet_union_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetUnionInputStep"), + ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "ip_adapter_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", 
"StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("denoise", "StableDiffusionXLInpaintDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], + "ip_adapter_controlnet_inpainting": [ + ("text_encoder", "StableDiffusionXLTextEncoderStep"), + ("ip_adapter", "StableDiffusionXLIPAdapterStep"), + ("vae_encoder", "StableDiffusionXLInpaintVaeEncoderStep"), + ("input", "StableDiffusionXLInputStep"), + ("set_timesteps", "StableDiffusionXLImg2ImgSetTimestepsStep"), + ("prepare_latents", "StableDiffusionXLInpaintPrepareLatentsStep"), + ("prepare_add_cond", "StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep"), + ("controlnet_input", "StableDiffusionXLControlNetInputStep"), + ("denoise", "StableDiffusionXLInpaintControlNetDenoiseStep"), + ("decode", "StableDiffusionXLInpaintDecodeStep"), + ], +} + + class SDXLInpaintingModularPipelineFastTests( SDXLModularTesterMixin, SDXLModularIPAdapterTesterMixin, @@ -392,6 +567,7 @@ class SDXLInpaintingModularPipelineFastTests( ) batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"]) expected_image_output_shape = (1, 3, 64, 64) + expected_workflow_blocks = INPAINTING_WORKFLOWS def get_dummy_inputs(self, device, seed=0): generator = self.get_generator(seed) diff --git a/tests/modular_pipelines/test_modular_pipelines_common.py b/tests/modular_pipelines/test_modular_pipelines_common.py index 937a9ccec880..e97b543ff85d 100644 --- a/tests/modular_pipelines/test_modular_pipelines_common.py +++ b/tests/modular_pipelines/test_modular_pipelines_common.py @@ -100,6 +100,14 @@ def batch_params(self) -> frozenset: "See existing pipeline tests for reference." ) + @property + def expected_workflow_blocks(self) -> dict: + raise NotImplementedError( + "You need to set the attribute `expected_workflow_blocks` in the child test class. " + "`expected_workflow_blocks` is a dictionary that maps workflow names to list of block names. " + "See existing pipeline tests for reference." 
+ ) + def setup_method(self): # clean up the VRAM before each test torch.compiler.reset() @@ -341,6 +349,34 @@ def test_save_from_pretrained(self): assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3 + def test_workflow_map(self): + blocks = self.pipeline_blocks_class() + if blocks._workflow_map is None: + pytest.skip("Skipping test as _workflow_map is not set") + + assert hasattr(self, "expected_workflow_blocks") and self.expected_workflow_blocks, ( + "expected_workflow_blocks must be defined in the test class" + ) + + for workflow_name, expected_blocks in self.expected_workflow_blocks.items(): + workflow_blocks = blocks.get_workflow(workflow_name) + actual_blocks = list(workflow_blocks.sub_blocks.items()) + + # Check that the number of blocks matches + assert len(actual_blocks) == len(expected_blocks), ( + f"Workflow '{workflow_name}' has {len(actual_blocks)} blocks, expected {len(expected_blocks)}" + ) + + # Check that each block name and type matches + for i, ((actual_name, actual_block), (expected_name, expected_class_name)) in enumerate( + zip(actual_blocks, expected_blocks) + ): + assert actual_name == expected_name + assert actual_block.__class__.__name__ == expected_class_name, ( + f"Workflow '{workflow_name}': block '{actual_name}' has type " + f"{actual_block.__class__.__name__}, expected {expected_class_name}" + ) + class ModularGuiderTesterMixin: def test_guider_cfg(self, expected_max_diff=1e-2): diff --git a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py index 29da18fce61b..ab45def3ef30 100644 --- a/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py +++ b/tests/modular_pipelines/z_image/test_modular_pipeline_z_image.py @@ -19,6 +19,30 @@ from ..test_modular_pipelines_common import ModularPipelineTesterMixin +ZIMAGE_WORKFLOWS = { + "text2image": [ + ("text_encoder", "ZImageTextEncoderStep"), + ("denoise.input", "ZImageTextInputStep"), + ("denoise.prepare_latents", "ZImagePrepareLatentsStep"), + ("denoise.set_timesteps", "ZImageSetTimestepsStep"), + ("denoise.denoise", "ZImageDenoiseStep"), + ("decode", "ZImageVaeDecoderStep"), + ], + "image2image": [ + ("text_encoder", "ZImageTextEncoderStep"), + ("vae_encoder", "ZImageVaeImageEncoderStep"), + ("denoise.input", "ZImageTextInputStep"), + ("denoise.additional_inputs", "ZImageAdditionalInputsStep"), + ("denoise.prepare_latents", "ZImagePrepareLatentsStep"), + ("denoise.set_timesteps", "ZImageSetTimestepsStep"), + ("denoise.set_timesteps_with_strength", "ZImageSetTimestepsWithStrengthStep"), + ("denoise.prepare_latents_with_image", "ZImagePrepareLatentswithImageStep"), + ("denoise.denoise", "ZImageDenoiseStep"), + ("decode", "ZImageVaeDecoderStep"), + ], +} + + class TestZImageModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = ZImageModularPipeline pipeline_blocks_class = ZImageAutoBlocks @@ -26,6 +50,7 @@ class TestZImageModularPipelineFast(ModularPipelineTesterMixin): params = frozenset(["prompt", "height", "width"]) batch_params = frozenset(["prompt"]) + expected_workflow_blocks = ZIMAGE_WORKFLOWS def get_dummy_inputs(self, seed=0): generator = self.get_generator(seed)