diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 46e241d817b5..64a4222845b0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -114,6 +114,8 @@ title: Guiders - local: modular_diffusers/custom_blocks title: Building Custom Blocks + - local: modular_diffusers/mellon + title: Using Custom Blocks with Mellon title: Modular Diffusers - isExpanded: false sections: diff --git a/docs/source/en/modular_diffusers/custom_blocks.md b/docs/source/en/modular_diffusers/custom_blocks.md index 6ef8db613f7f..b412e0e58abc 100644 --- a/docs/source/en/modular_diffusers/custom_blocks.md +++ b/docs/source/en/modular_diffusers/custom_blocks.md @@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License. [ModularPipelineBlocks](./pipeline_block) are the fundamental building blocks of a [`ModularPipeline`]. You can create custom blocks by defining their inputs, outputs, and computation logic. This guide demonstrates how to create and use a custom block. > [!TIP] -> Explore the [Modular Diffusers Custom Blocks](https://huggingface.co/collections/diffusers/modular-diffusers-custom-blocks) collection for official custom modular blocks like Nano Banana. +> Explore the [Modular Diffusers Custom Blocks](https://huggingface.co/collections/diffusers/modular-diffusers-custom-blocks) collection for official custom blocks. ## Project Structure @@ -31,54 +31,58 @@ Your custom block project should use the following structure: - `block.py` contains the custom block implementation - `modular_config.json` contains the metadata needed to load the block -## Example: Florence 2 Inpainting Block +## Quick Start with Template -In this example we will create a custom block that uses the [Florence 2](https://huggingface.co/docs/transformers/model_doc/florence2) model to process an input image and generate a mask for inpainting. +The fastest way to create a custom block is to start from our template. 
The template provides a pre-configured project structure with `block.py` and `modular_config.json` files, plus commented examples showing how to define components, inputs, outputs, and the `__call__` method—so you can focus on your custom logic instead of boilerplate setup. -The first step is to define the components that the block will use. In this case, we will need to use the `Florence2ForConditionalGeneration` model and its corresponding processor `AutoProcessor`. When defining components, we must specify the name of the component within our pipeline, model class via `type_hint`, and provide a `pretrained_model_name_or_path` for the component if we intend to load the model weights from a specific repository on the Hub. +### Download the template -```py -# Inside block.py -from diffusers.modular_pipelines import ( - ModularPipelineBlocks, - ComponentSpec, +```python +from diffusers import ModularPipelineBlocks + +model_id = "diffusers/custom-block-template" +local_dir = model_id.split("/")[-1] + +blocks = ModularPipelineBlocks.from_pretrained( + model_id, + trust_remote_code=True, + local_dir=local_dir ) -from transformers import AutoProcessor, Florence2ForConditionalGeneration +``` +This saves the template files to `custom-block-template/` locally or you could use `local_dir` to save to a specific location. -class Florence2ImageAnnotatorBlock(ModularPipelineBlocks): +### Edit locally - @property - def expected_components(self): - return [ - ComponentSpec( - name="image_annotator", - type_hint=Florence2ForConditionalGeneration, - pretrained_model_name_or_path="florence-community/Florence-2-base-ft", - ), - ComponentSpec( - name="image_annotator_processor", - type_hint=AutoProcessor, - pretrained_model_name_or_path="florence-community/Florence-2-base-ft", - ), - ] +Open `block.py` and implement your custom block. The template includes commented examples showing how to define each property. 
See the [Florence-2 example](#example-florence-2-image-annotator) below for a complete implementation. + +### Test your block + +```python +from diffusers import ModularPipelineBlocks + +blocks = ModularPipelineBlocks.from_pretrained(local_dir, trust_remote_code=True) +pipeline = blocks.init_pipeline() +output = pipeline(...) # your inputs here ``` -Next, we define the inputs and outputs of the block. The inputs include the image to be annotated, the annotation task, and the annotation prompt. The outputs include the generated mask image and annotations. +### Upload to the Hub -```py -from typing import List, Union -from PIL import Image, ImageDraw -import torch -import numpy as np - -from diffusers.modular_pipelines import ( - PipelineState, - ModularPipelineBlocks, - InputParam, - ComponentSpec, - OutputParam, -) +```python +pipeline.save_pretrained(local_dir, repo_id="your-username/your-block-name", push_to_hub=True) +``` + +## Example: Florence-2 Image Annotator + +This example creates a custom block with [Florence-2](https://huggingface.co/docs/transformers/model_doc/florence2) to process an input image and generate a mask for inpainting. + +### Define components + +Define the components the block needs, `Florence2ForConditionalGeneration` and its processor. When defining components, specify the `name` (how you'll access it in code), `type_hint` (the model class), and `pretrained_model_name_or_path` (where to load weights from). 
+ +```python +# Inside block.py +from diffusers.modular_pipelines import ModularPipelineBlocks, ComponentSpec from transformers import AutoProcessor, Florence2ForConditionalGeneration @@ -98,122 +102,21 @@ class Florence2ImageAnnotatorBlock(ModularPipelineBlocks): pretrained_model_name_or_path="florence-community/Florence-2-base-ft", ), ] - - @property - def inputs(self) -> List[InputParam]: - return [ - InputParam( - "image", - type_hint=Union[Image.Image, List[Image.Image]], - required=True, - description="Image(s) to annotate", - ), - InputParam( - "annotation_task", - type_hint=Union[str, List[str]], - required=True, - default="", - description="""Annotation Task to perform on the image. - Supported Tasks: - - - - - - - - - - - """, - ), - InputParam( - "annotation_prompt", - type_hint=Union[str, List[str]], - required=True, - description="""Annotation Prompt to provide more context to the task. - Can be used to detect or segment out specific elements in the image - """, - ), - InputParam( - "annotation_output_type", - type_hint=str, - required=True, - default="mask_image", - description="""Output type from annotation predictions. 
Available options are - mask_image: - -black and white mask image for the given image based on the task type - mask_overlay: - - mask overlayed on the original image - bounding_box: - - bounding boxes drawn on the original image - """, - ), - InputParam( - "annotation_overlay", - type_hint=bool, - required=True, - default=False, - description="", - ), - ] - - @property - def intermediate_outputs(self) -> List[OutputParam]: - return [ - OutputParam( - "mask_image", - type_hint=Image, - description="Inpainting Mask for input Image(s)", - ), - OutputParam( - "annotations", - type_hint=dict, - description="Annotations Predictions for input Image(s)", - ), - OutputParam( - "image", - type_hint=Image, - description="Annotated input Image(s)", - ), - ] - ``` -Now we implement the `__call__` method, which contains the logic for processing the input image and generating the mask. +### Define inputs and outputs -```py +Inputs include the image, annotation task, and prompt. Outputs include the generated mask and annotations. + +```python from typing import List, Union -from PIL import Image, ImageDraw -import torch -import numpy as np - -from diffusers.modular_pipelines import ( - PipelineState, - ModularPipelineBlocks, - InputParam, - ComponentSpec, - OutputParam, -) -from transformers import AutoProcessor, Florence2ForConditionalGeneration +from PIL import Image +from diffusers.modular_pipelines import InputParam, OutputParam class Florence2ImageAnnotatorBlock(ModularPipelineBlocks): - @property - def expected_components(self): - return [ - ComponentSpec( - name="image_annotator", - type_hint=Florence2ForConditionalGeneration, - pretrained_model_name_or_path="florence-community/Florence-2-base-ft", - ), - ComponentSpec( - name="image_annotator_processor", - type_hint=AutoProcessor, - pretrained_model_name_or_path="florence-community/Florence-2-base-ft", - ), - ] + # ... expected_components from above ... 
@property def inputs(self) -> List[InputParam]: @@ -226,51 +129,21 @@ class Florence2ImageAnnotatorBlock(ModularPipelineBlocks): ), InputParam( "annotation_task", - type_hint=Union[str, List[str]], - required=True, + type_hint=str, default="", - description="""Annotation Task to perform on the image. - Supported Tasks: - - - - - - - - - - - """, + description="Annotation task to perform (e.g., , , )", ), InputParam( "annotation_prompt", - type_hint=Union[str, List[str]], + type_hint=str, required=True, - description="""Annotation Prompt to provide more context to the task. - Can be used to detect or segment out specific elements in the image - """, + description="Prompt to provide context for the annotation task", ), InputParam( "annotation_output_type", type_hint=str, - required=True, default="mask_image", - description="""Output type from annotation predictions. Available options are - mask_image: - -black and white mask image for the given image based on the task type - mask_overlay: - - mask overlayed on the original image - bounding_box: - - bounding boxes drawn on the original image - """, - ), - InputParam( - "annotation_overlay", - type_hint=bool, - required=True, - default=False, - description="", + description="Output type: 'mask_image', 'mask_overlay', or 'bounding_box'", ), ] @@ -279,109 +152,45 @@ class Florence2ImageAnnotatorBlock(ModularPipelineBlocks): return [ OutputParam( "mask_image", - type_hint=Image, - description="Inpainting Mask for input Image(s)", + type_hint=Image.Image, + description="Inpainting mask for the input image", ), OutputParam( "annotations", type_hint=dict, - description="Annotations Predictions for input Image(s)", + description="Raw annotation predictions", ), OutputParam( "image", - type_hint=Image, - description="Annotated input Image(s)", + type_hint=Image.Image, + description="Annotated image", ), ] +``` - def get_annotations(self, components, images, prompts, task): - task_prompts = [task + prompt for prompt in prompts] 
+### Implement the `__call__` method - inputs = components.image_annotator_processor( - text=task_prompts, images=images, return_tensors="pt" - ).to(components.image_annotator.device, components.image_annotator.dtype) +The `__call__` method contains the block's logic. Access inputs via `block_state`, run your computation, and set outputs back to `block_state`. - generated_ids = components.image_annotator.generate( - input_ids=inputs["input_ids"], - pixel_values=inputs["pixel_values"], - max_new_tokens=1024, - early_stopping=False, - do_sample=False, - num_beams=3, - ) - annotations = components.image_annotator_processor.batch_decode( - generated_ids, skip_special_tokens=False - ) - outputs = [] - for image, annotation in zip(images, annotations): - outputs.append( - components.image_annotator_processor.post_process_generation( - annotation, task=task, image_size=(image.width, image.height) - ) - ) - return outputs - - def prepare_mask(self, images, annotations, overlay=False, fill="white"): - masks = [] - for image, annotation in zip(images, annotations): - mask_image = image.copy() if overlay else Image.new("L", image.size, 0) - draw = ImageDraw.Draw(mask_image) - - for _, _annotation in annotation.items(): - if "polygons" in _annotation: - for polygon in _annotation["polygons"]: - polygon = np.array(polygon).reshape(-1, 2) - if len(polygon) < 3: - continue - polygon = polygon.reshape(-1).tolist() - draw.polygon(polygon, fill=fill) - - elif "bbox" in _annotation: - bbox = _annotation["bbox"] - draw.rectangle(bbox, fill="white") - - masks.append(mask_image) - - return masks - - def prepare_bounding_boxes(self, images, annotations): - outputs = [] - for image, annotation in zip(images, annotations): - image_copy = image.copy() - draw = ImageDraw.Draw(image_copy) - for _, _annotation in annotation.items(): - bbox = _annotation["bbox"] - label = _annotation["label"] - - draw.rectangle(bbox, outline="red", width=3) - draw.text((bbox[0], bbox[1] - 20), label, 
fill="red") - - outputs.append(image_copy) - - return outputs - - def prepare_inputs(self, images, prompts): - prompts = prompts or "" - - if isinstance(images, Image.Image): - images = [images] - if isinstance(prompts, str): - prompts = [prompts] - - if len(images) != len(prompts): - raise ValueError("Number of images and annotation prompts must match.") - - return images, prompts +```python +import torch +from diffusers.modular_pipelines import PipelineState + + +class Florence2ImageAnnotatorBlock(ModularPipelineBlocks): + + # ... expected_components, inputs, intermediate_outputs from above ... @torch.no_grad() def __call__(self, components, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) + images, annotation_task_prompt = self.prepare_inputs( block_state.image, block_state.annotation_prompt ) task = block_state.annotation_task fill = block_state.fill - + annotations = self.get_annotations( components, images, annotation_task_prompt, task ) @@ -400,67 +209,69 @@ class Florence2ImageAnnotatorBlock(ModularPipelineBlocks): self.set_block_state(state, block_state) return components, state - -``` - -Once we have defined our custom block, we can save it to the Hub, using either the CLI or the [`push_to_hub`] method. This will make it easy to share and reuse our custom block with other pipelines. - - - - -```shell -# In the folder with the `block.py` file, run: -diffusers-cli custom_block -``` - -Then upload the block to the Hub: - -```shell -hf upload . . -``` - - - -```py -from block import Florence2ImageAnnotatorBlock -block = Florence2ImageAnnotatorBlock() -block.push_to_hub("") + + # Helper methods for mask/bounding box generation... ``` - - +> [!TIP] +> See the complete implementation at [diffusers/Florence2-image-Annotator](https://huggingface.co/diffusers/Florence2-image-Annotator). ## Using Custom Blocks -Load the custom block with [`~ModularPipelineBlocks.from_pretrained`] and set `trust_remote_code=True`. 
+Load a custom block with [`~ModularPipeline.from_pretrained`] and set `trust_remote_code=True`. ```py import torch -from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks -from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS +from diffusers import ModularPipeline from diffusers.utils import load_image -# Fetch the Florence2 image annotator block that will create our mask -image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True) +# Load the Florence-2 annotator pipeline +image_annotator = ModularPipeline.from_pretrained( + "diffusers/Florence2-image-Annotator", + trust_remote_code=True +) -my_blocks = INPAINT_BLOCKS.copy() -# insert the annotation block before the image encoding step -my_blocks.insert("image_annotator", image_annotator_block, 1) +# Check the docstring to see inputs/outputs +print(image_annotator.blocks.doc) +``` -# Create our initial set of inpainting blocks -blocks = SequentialPipelineBlocks.from_blocks_dict(my_blocks) +Use the block to generate a mask: -repo_id = "diffusers/modular-stable-diffusion-xl-base-1.0" -pipe = blocks.init_pipeline(repo_id) -pipe.load_components(torch_dtype=torch.float16, device_map="cuda", trust_remote_code=True) +```python +image_annotator.load_components(torch_dtype=torch.bfloat16) +image_annotator.to("cuda") -image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true") +image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg") image = image.resize((1024, 1024)) - prompt = ["A red car"] annotation_task = "" annotation_prompt = ["the car"] +mask_image = image_annotator( + prompt=prompt, + image=image, + annotation_task=annotation_task, + annotation_prompt=annotation_prompt, + annotation_output_type="mask_image", +).images 
+mask_image[0].save("car-mask.png") +``` + +Compose it with other blocks to create a new pipeline: + +```python +# Get the annotator block +annotator_block = image_annotator.blocks + +# Get an inpainting workflow and insert the annotator at the beginning +inpaint_blocks = ModularPipeline.from_pretrained("Qwen/Qwen-Image").blocks.get_workflow("inpainting") +inpaint_blocks.sub_blocks.insert("image_annotator", annotator_block, 0) + +# Initialize the combined pipeline +pipe = inpaint_blocks.init_pipeline() +pipe.load_components(torch_dtype=torch.float16, device="cuda") + +# Now the pipeline automatically generates masks from prompts output = pipe( prompt=prompt, image=image, @@ -475,18 +286,50 @@ output = pipe( output[0].save("florence-inpainting.png") ``` -## Editing Custom Blocks +## Editing custom blocks -By default, custom blocks are saved in your cache directory. Use the `local_dir` argument to download and edit a custom block in a specific folder. +Edit a custom block by downloading it locally. This is the same workflow as the [Quick Start with Template](#quick-start-with-template), but starting from an existing block instead of the template. 
+ -```py -import torch -from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks -from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS -from diffusers.utils import load_image +Use the `local_dir` argument to download a custom block to a specific folder: + +```python +from diffusers import ModularPipelineBlocks -# Fetch the Florence2 image annotator block that will create our mask -image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True, local_dir="/my-local-folder") +# Download to a local folder for editing +annotator_block = ModularPipelineBlocks.from_pretrained( + "diffusers/Florence2-image-Annotator", + trust_remote_code=True, + local_dir="./my-florence-block" +) ``` -Any changes made to the block files in this folder will be reflected when you load the block again. +Any changes made to the block files in this folder will be reflected when you load the block again. When you're ready to share your changes, upload to a new repository: + +```python +pipeline = annotator_block.init_pipeline() +pipeline.save_pretrained("./my-florence-block", repo_id="your-username/my-custom-florence", push_to_hub=True) +``` + +## Next Steps + + + + +This guide covered creating a single custom block. Learn how to compose multiple blocks together: + +- [SequentialPipelineBlocks](./sequential_pipeline_blocks): Chain blocks to execute in sequence +- [ConditionalPipelineBlocks](./auto_pipeline_blocks): Create conditional blocks that select different execution paths +- [LoopSequentialPipelineBlocks](./loop_sequential_pipeline_blocks): Define iterative workflows like the denoising loop + + + + +Make your custom block work with Mellon's visual interface. See the [Mellon Custom Blocks](./mellon) guide. 
+ + + +Browse the [Modular Diffusers Custom Blocks](https://huggingface.co/collections/diffusers/modular-diffusers-custom-blocks) collection for inspiration and ready-to-use blocks. + + + \ No newline at end of file diff --git a/docs/source/en/modular_diffusers/mellon.md b/docs/source/en/modular_diffusers/mellon.md new file mode 100644 index 000000000000..808e62ad7966 --- /dev/null +++ b/docs/source/en/modular_diffusers/mellon.md @@ -0,0 +1,270 @@ + + + +## Using Custom Blocks with Mellon + +[Mellon](https://github.com/cubiq/Mellon) is a visual workflow interface that integrates with Modular Diffusers and is designed for node-based workflows. + +> [!WARNING] +> Mellon is in early development and not ready for production use yet. Consider this a sneak peek of how the integration works! + + +Custom blocks work in Mellon out of the box - you just need to add a `mellon_pipeline_config.json` to your repository. This config file tells Mellon how to render your block's parameters as UI components. + +Here's what it looks like in action with the [Gemini Prompt Expander](https://huggingface.co/diffusers/gemini-prompt-expander-mellon) block: + +![Mellon custom block demo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/modular_demo_dynamic.gif) + +To use a modular diffusers custom block in Mellon: +1. Drag a **Dynamic Block Node** from the ModularDiffusers section +2. Enter the `repo_id` (e.g., `diffusers/gemini-prompt-expander-mellon`) +3. Click **Load Custom Block** +4. The node transforms to show your block's inputs and outputs + +Now let's walk through how to create this config for your own custom block. + +## Steps to create a Mellon config + +1. **Specify Mellon types for your parameters** - Each `InputParam`/`OutputParam` needs a type that tells Mellon what UI component to render (e.g., `"textbox"`, `"dropdown"`, `"image"`). +2. 
**Generate `mellon_pipeline_config.json`** - Use our utility to generate a config template and push it to your Hub repository. +3. **(Optional) Manually adjust the config** - Fine-tune the generated config for your specific needs. + +## Specify Mellon types for parameters + +Mellon types determine how each parameter renders in the UI. If you don't specify a type for a parameter, it will default to `"custom"`, which renders as a simple connection dot. You can always adjust this later in the generated config. + + +| Type | Input/Output | Description | +|------|--------------|-------------| +| `image` | Both | Image (PIL Image) | +| `video` | Both | Video | +| `text` | Both | Text display | +| `textbox` | Input | Text input | +| `dropdown` | Input | Dropdown selection menu | +| `slider` | Input | Slider for numeric values | +| `number` | Input | Numeric input | +| `checkbox` | Input | Boolean toggle | + +For parameters that need more configuration (like dropdowns with options, or sliders with min/max values), pass a `MellonParam` instance directly instead of a string. You can use one of the class methods below, or create a fully custom one with `MellonParam(name, label, type, ...)`. 
+ +| Method | Description | +|--------|-------------| +| `MellonParam.Input.image(name)` | Image input | +| `MellonParam.Input.textbox(name, default)` | Text input as textarea | +| `MellonParam.Input.dropdown(name, options, default)` | Dropdown selection | +| `MellonParam.Input.slider(name, default, min, max, step)` | Slider for numeric values | +| `MellonParam.Input.number(name, default, min, max, step)` | Numeric input (no slider) | +| `MellonParam.Input.seed(name, default)` | Seed input with randomize button | +| `MellonParam.Input.checkbox(name, default)` | Boolean checkbox | +| `MellonParam.Input.model(name)` | Model input for diffusers components | +| `MellonParam.Output.image(name)` | Image output | +| `MellonParam.Output.video(name)` | Video output | +| `MellonParam.Output.text(name)` | Text output | +| `MellonParam.Output.model(name)` | Model output for diffusers components | + +Choose one of the methods below to specify a Mellon type. + +### Using `metadata` in block definitions + +If you're defining a custom block from scratch, add `metadata={"mellon": ""}` directly to your `InputParam` and `OutputParam` definitions. If you're editing an existing custom block from the Hub, see [Editing custom blocks](./custom_blocks#editing-custom-blocks) for how to download it locally. 
+ +```python +class GeminiPromptExpander(ModularPipelineBlocks): + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + "prompt", + type_hint=str, + required=True, + description="Prompt to use", + metadata={"mellon": "textbox"}, # Text input + ) + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + "prompt", + type_hint=str, + description="Expanded prompt by the LLM", + metadata={"mellon": "text"}, # Text output + ), + OutputParam( + "old_prompt", + type_hint=str, + description="Old prompt provided by the user", + # No metadata - we don't want to render this in UI + ) + ] +``` + +For full control over UI configuration, pass a `MellonParam` instance directly: +```python +from diffusers.modular_pipelines.mellon_node_utils import MellonParam + +InputParam( + "mode", + type_hint=str, + default="balanced", + metadata={"mellon": MellonParam.Input.dropdown("mode", options=["fast", "balanced", "quality"])}, +) +``` + +### Using `input_types` and `output_types` when Generating Config + +If you're working with an existing pipeline or prefer to keep your block definitions clean, specify types when generating the config using the `input_types`/`output_types` arguments: +```python +from diffusers.modular_pipelines.mellon_node_utils import MellonPipelineConfig + +mellon_config = MellonPipelineConfig.from_custom_block( + blocks, + input_types={"prompt": "textbox"}, + output_types={"prompt": "text"} +) +``` + +> [!NOTE] +> When both `metadata` and `input_types`/`output_types` are specified, the arguments override `metadata`. 
+ +## Generate and push the Mellon config + +After adding metadata to your block, generate the default Mellon configuration template and push it to the Hub: + +```python +from diffusers import ModularPipelineBlocks +from diffusers.modular_pipelines.mellon_node_utils import MellonPipelineConfig + +# load your custom blocks from your local dir +blocks = ModularPipelineBlocks.from_pretrained("/path/local/folder", trust_remote_code=True) + +# Generate the default config template +mellon_config = MellonPipelineConfig.from_custom_block(blocks) +# push the default template to `repo_id`, you will need to pass the same local folder path so that it will save the config locally first +mellon_config.save( + local_dir="/path/local/folder", + repo_id= repo_id, + push_to_hub=True +) +``` + +This creates a `mellon_pipeline_config.json` file in your repository. + +## Review and adjust the config + +The generated template is a starting point - you may want to adjust it for your needs. Let's walk through the generated config for the Gemini Prompt Expander: + +```json +{ + "label": "Gemini Prompt Expander", + "default_repo": "", + "default_dtype": "", + "node_params": { + "custom": { + "params": { + "prompt": { + "label": "Prompt", + "type": "string", + "display": "textarea", + "default": "" + }, + "out_prompt": { + "label": "Prompt", + "type": "string", + "display": "output" + }, + "old_prompt": { + "label": "Old Prompt", + "type": "custom", + "display": "output" + }, + "doc": { + "label": "Doc", + "type": "string", + "display": "output" + } + }, + "input_names": ["prompt"], + "model_input_names": [], + "output_names": ["out_prompt", "old_prompt", "doc"], + "block_name": "custom", + "node_type": "custom" + } + } +} +``` + +### Understanding the Structure + +The `params` dict defines how each UI element renders. 
The `input_names`, `model_input_names`, and `output_names` lists map these UI elements to the underlying [`ModularPipelineBlocks`]'s I/O interface: + +| Mellon Config | ModularPipelineBlocks | +|---------------|----------------------| +| `input_names` | `inputs` property | +| `model_input_names` | `expected_components` property | +| `output_names` | `intermediate_outputs` property | + +In this example: `prompt` is the only input. There are no model components, and outputs include `out_prompt`, `old_prompt`, and `doc`. + +Now let's look at the `params` dict: + +- **`prompt`**: An input parameter with `display: "textarea"` (renders as a text input box), `label: "Prompt"` (shown in the UI), and `default: ""` (starts empty). The `type: "string"` field is important in Mellon because it determines which nodes can connect together - only matching types can be linked with "noodles". + +- **`out_prompt`**: The expanded prompt output. The `out_` prefix was automatically added because the input and output share the same name (`prompt`), avoiding naming conflicts in the config. It has `display: "output"` which renders as an output socket. + +- **`old_prompt`**: Has `type: "custom"` because we didn't specify metadata. This renders as a simple dot in the UI. Since we don't actually want to expose this in the UI, we can remove it. + +- **`doc`**: The documentation output, automatically added to all custom blocks. + +### Making Adjustments + +Remove `old_prompt` from both `params` and `output_names` because you won't need to use it. 
+ +```json +{ + "label": "Gemini Prompt Expander", + "default_repo": "", + "default_dtype": "", + "node_params": { + "custom": { + "params": { + "prompt": { + "label": "Prompt", + "type": "string", + "display": "textarea", + "default": "" + }, + "out_prompt": { + "label": "Prompt", + "type": "string", + "display": "output" + }, + "doc": { + "label": "Doc", + "type": "string", + "display": "output" + } + }, + "input_names": ["prompt"], + "model_input_names": [], + "output_names": ["out_prompt", "doc"], + "block_name": "custom", + "node_type": "custom" + } + } +} +``` + +See the final config at [diffusers/gemini-prompt-expander-mellon](https://huggingface.co/diffusers/gemini-prompt-expander-mellon). \ No newline at end of file diff --git a/docs/source/en/modular_diffusers/overview.md b/docs/source/en/modular_diffusers/overview.md index 7d07c4b73434..b7f20cf884b5 100644 --- a/docs/source/en/modular_diffusers/overview.md +++ b/docs/source/en/modular_diffusers/overview.md @@ -33,9 +33,14 @@ The Modular Diffusers docs are organized as shown below. - [SequentialPipelineBlocks](./sequential_pipeline_blocks) is a type of block that chains multiple blocks so they run one after another, passing data along the chain. This guide shows you how to create [`~modular_pipelines.SequentialPipelineBlocks`] and how they connect and work together. - [LoopSequentialPipelineBlocks](./loop_sequential_pipeline_blocks) is a type of block that runs a series of blocks in a loop. This guide shows you how to create [`~modular_pipelines.LoopSequentialPipelineBlocks`]. - [AutoPipelineBlocks](./auto_pipeline_blocks) is a type of block that automatically chooses which blocks to run based on the input. This guide shows you how to create [`~modular_pipelines.AutoPipelineBlocks`]. +- [Building Custom Blocks](./custom_blocks) shows you how to create your own custom blocks and share them on the Hub. 
## ModularPipeline - [ModularPipeline](./modular_pipeline) shows you how to create and convert pipeline blocks into an executable [`ModularPipeline`]. - [ComponentsManager](./components_manager) shows you how to manage and reuse components across multiple pipelines. -- [Guiders](./guiders) shows you how to use different guidance methods in the pipeline. \ No newline at end of file +- [Guiders](./guiders) shows you how to use different guidance methods in the pipeline. + +## Mellon Integration + +- [Using Custom Blocks with Mellon](./mellon) shows you how to make your custom blocks work with [Mellon](https://github.com/cubiq/Mellon), a visual node-based interface for building workflows. \ No newline at end of file diff --git a/src/diffusers/modular_pipelines/components_manager.py b/src/diffusers/modular_pipelines/components_manager.py index e16abb382313..4a7ea8502c86 100644 --- a/src/diffusers/modular_pipelines/components_manager.py +++ b/src/diffusers/modular_pipelines/components_manager.py @@ -324,6 +324,7 @@ class ComponentsManager: "has_hook", "execution_device", "ip_adapter", + "quantization", ] def __init__(self): @@ -356,7 +357,9 @@ def _lookup_ids( ids_by_name.add(component_id) else: ids_by_name = set(components.keys()) - if collection: + if collection and collection not in self.collections: + return set() + elif collection and collection in self.collections: ids_by_collection = set() for component_id, component in components.items(): if component_id in self.collections[collection]: @@ -423,7 +426,8 @@ def add(self, name: str, component: Any, collection: Optional[str] = None): # add component to components manager self.components[component_id] = component - self.added_time[component_id] = time.time() + if is_new_component: + self.added_time[component_id] = time.time() if collection: if collection not in self.collections: @@ -760,7 +764,6 @@ def disable_auto_cpu_offload(self): self.model_hooks = None self._auto_offload_enabled = False - # YiYi TODO: (1) add 
quantization info def get_model_info( self, component_id: str, @@ -836,6 +839,17 @@ def get_model_info( if scales: info["ip_adapter"] = summarize_dict_by_value_and_parts(scales) + # Check for quantization + hf_quantizer = getattr(component, "hf_quantizer", None) + if hf_quantizer is not None: + quant_config = hf_quantizer.quantization_config + if hasattr(quant_config, "to_diff_dict"): + info["quantization"] = quant_config.to_diff_dict() + else: + info["quantization"] = quant_config.to_dict() + else: + info["quantization"] = None + # If fields specified, filter info if fields is not None: return {k: v for k, v in info.items() if k in fields} @@ -966,12 +980,16 @@ def format_device(component, info): output += "\nAdditional Component Info:\n" + "=" * 50 + "\n" for name in self.components: info = self.get_model_info(name) - if info is not None and (info.get("adapters") is not None or info.get("ip_adapter")): + if info is not None and ( + info.get("adapters") is not None or info.get("ip_adapter") or info.get("quantization") + ): output += f"\n{name}:\n" if info.get("adapters") is not None: output += f" Adapters: {info['adapters']}\n" if info.get("ip_adapter"): output += " IP-Adapter: Enabled\n" + if info.get("quantization"): + output += f" Quantization: {info['quantization']}\n" return output diff --git a/src/diffusers/modular_pipelines/mellon_node_utils.py b/src/diffusers/modular_pipelines/mellon_node_utils.py index f848afe9a3ae..35241023f3fc 100644 --- a/src/diffusers/modular_pipelines/mellon_node_utils.py +++ b/src/diffusers/modular_pipelines/mellon_node_utils.py @@ -1,3 +1,4 @@ +import copy import json import logging import os @@ -6,7 +7,7 @@ from dataclasses import asdict, dataclass from typing import Any, Dict, List, Optional, Union -from huggingface_hub import create_repo, hf_hub_download, upload_folder +from huggingface_hub import create_repo, hf_hub_download, upload_file from huggingface_hub.utils import ( EntryNotFoundError, HfHubHTTPError, @@ -15,25 +16,256 
@@ ) from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT +from .modular_pipeline_utils import InputParam, OutputParam logger = logging.getLogger(__name__) +def _name_to_label(name: str) -> str: + """Convert snake_case name to Title Case label.""" + return name.replace("_", " ").title() + + +# Template definitions for standard diffuser pipeline parameters +MELLON_PARAM_TEMPLATES = { + # Image I/O + "image": {"label": "Image", "type": "image", "display": "input", "required_block_params": ["image"]}, + "images": {"label": "Images", "type": "image", "display": "output", "required_block_params": ["images"]}, + "control_image": { + "label": "Control Image", + "type": "image", + "display": "input", + "required_block_params": ["control_image"], + }, + # Latents + "latents": {"label": "Latents", "type": "latents", "display": "input", "required_block_params": ["latents"]}, + "image_latents": { + "label": "Image Latents", + "type": "latents", + "display": "input", + "required_block_params": ["image_latents"], + }, + "first_frame_latents": { + "label": "First Frame Latents", + "type": "latents", + "display": "input", + "required_block_params": ["first_frame_latents"], + }, + "latents_preview": {"label": "Latents Preview", "type": "latent", "display": "output"}, + # Image Latents with Strength + "image_latents_with_strength": { + "name": "image_latents", # name is not same as template key + "label": "Image Latents", + "type": "latents", + "display": "input", + "onChange": {"false": ["height", "width"], "true": ["strength"]}, + "required_block_params": ["image_latents", "strength"], + }, + # Embeddings + "embeddings": {"label": "Text Embeddings", "type": "embeddings", "display": "output"}, + "image_embeds": { + "label": "Image Embeddings", + "type": "image_embeds", + "display": "output", + "required_block_params": ["image_embeds"], + }, + # Text inputs + "prompt": { + "label": "Prompt", + "type": "string", + "display": "textarea", + "default": "", + "required_block_params": 
["prompt"], + }, + "negative_prompt": { + "label": "Negative Prompt", + "type": "string", + "display": "textarea", + "default": "", + "required_block_params": ["negative_prompt"], + }, + # Numeric params + "guidance_scale": { + "label": "Guidance Scale", + "type": "float", + "display": "slider", + "default": 5.0, + "min": 1.0, + "max": 30.0, + "step": 0.1, + }, + "strength": { + "label": "Strength", + "type": "float", + "default": 0.5, + "min": 0.0, + "max": 1.0, + "step": 0.01, + "required_block_params": ["strength"], + }, + "height": { + "label": "Height", + "type": "int", + "default": 1024, + "min": 64, + "step": 8, + "required_block_params": ["height"], + }, + "width": { + "label": "Width", + "type": "int", + "default": 1024, + "min": 64, + "step": 8, + "required_block_params": ["width"], + }, + "seed": { + "label": "Seed", + "type": "int", + "default": 0, + "min": 0, + "max": 4294967295, + "display": "random", + "required_block_params": ["generator"], + }, + "num_inference_steps": { + "label": "Steps", + "type": "int", + "default": 25, + "min": 1, + "max": 100, + "display": "slider", + "required_block_params": ["num_inference_steps"], + }, + "num_frames": { + "label": "Frames", + "type": "int", + "default": 81, + "min": 1, + "max": 480, + "display": "slider", + "required_block_params": ["num_frames"], + }, + "layers": { + "label": "Layers", + "type": "int", + "default": 4, + "min": 1, + "max": 10, + "display": "slider", + "required_block_params": ["layers"], + }, + # ControlNet + "controlnet_conditioning_scale": { + "label": "Controlnet Conditioning Scale", + "type": "float", + "default": 0.5, + "min": 0.0, + "max": 1.0, + "step": 0.01, + "required_block_params": ["controlnet_conditioning_scale"], + }, + "control_guidance_start": { + "label": "Control Guidance Start", + "type": "float", + "default": 0.0, + "min": 0.0, + "max": 1.0, + "step": 0.01, + "required_block_params": ["control_guidance_start"], + }, + "control_guidance_end": { + "label": "Control 
Guidance End", + "type": "float", + "default": 1.0, + "min": 0.0, + "max": 1.0, + "step": 0.01, + "required_block_params": ["control_guidance_end"], + }, + # Video + "videos": {"label": "Videos", "type": "video", "display": "output", "required_block_params": ["videos"]}, + # Models + "vae": {"label": "VAE", "type": "diffusers_auto_model", "display": "input", "required_block_params": ["vae"]}, + "image_encoder": { + "label": "Image Encoder", + "type": "diffusers_auto_model", + "display": "input", + "required_block_params": ["image_encoder"], + }, + "unet": {"label": "Denoise Model", "type": "diffusers_auto_model", "display": "input"}, + "scheduler": {"label": "Scheduler", "type": "diffusers_auto_model", "display": "input"}, + "controlnet": { + "label": "ControlNet Model", + "type": "diffusers_auto_model", + "display": "input", + "required_block_params": ["controlnet"], + }, + "text_encoders": { + "label": "Text Encoders", + "type": "diffusers_auto_models", + "display": "input", + "required_block_params": ["text_encoder"], + }, + # Bundles/Custom + "controlnet_bundle": { + "label": "ControlNet", + "type": "custom_controlnet", + "display": "input", + "required_block_params": "controlnet_image", + }, + "ip_adapter": {"label": "IP Adapter", "type": "custom_ip_adapter", "display": "input"}, + "guider": { + "label": "Guider", + "type": "custom_guider", + "display": "input", + "onChange": {False: ["guidance_scale"], True: []}, + }, + "doc": {"label": "Doc", "type": "string", "display": "output"}, +} + + +class MellonParamMeta(type): + """Metaclass that enables MellonParam.template_name(**overrides) syntax.""" + + def __getattr__(cls, name: str): + if name in MELLON_PARAM_TEMPLATES: + + def factory(default=None, **overrides): + template = MELLON_PARAM_TEMPLATES[name] + # Use template's name if specified, otherwise use the key + params = {"name": template.get("name", name), **template, **overrides} + if default is not None: + params["default"] = default + return 
cls(**params) + + return factory + + raise AttributeError(f"type object 'MellonParam' has no attribute '{name}'") + + @dataclass(frozen=True) -class MellonParam: +class MellonParam(metaclass=MellonParamMeta): """ Parameter definition for Mellon nodes. - Use factory methods for common params (e.g., MellonParam.seed()) or create custom ones with - MellonParam(name="...", label="...", type="..."). - - Example: + Usage: ```python - # Custom param - MellonParam(name="my_param", label="My Param", type="float", default=0.5) - # Output in Mellon node definition: - # "my_param": {"label": "My Param", "type": "float", "default": 0.5} + # From template (standard diffuser params) + MellonParam.seed() + MellonParam.prompt(default="a cat") + MellonParam.latents(display="output") + + # Generic inputs (for custom blocks) + MellonParam.Input.slider("my_scale", default=1.0, min=0.0, max=2.0) + MellonParam.Input.dropdown("mode", options=["fast", "slow"]) + + # Generic outputs + MellonParam.Output.image("result_images") + + # Fully custom + MellonParam(name="custom", label="Custom", type="float", default=0.5) ``` """ @@ -53,577 +285,204 @@ class MellonParam: required_block_params: Optional[Union[str, List[str]]] = None def to_dict(self) -> Dict[str, Any]: - """Convert to dict for Mellon schema, excluding None values and name.""" + """Convert to dict for Mellon schema, excluding None values and internal fields.""" data = asdict(self) return {k: v for k, v in data.items() if v is not None and k not in ("name", "required_block_params")} - @classmethod - def image(cls) -> "MellonParam": - """ - Image input parameter. - - Mellon node definition: - "image": {"label": "Image", "type": "image", "display": "input"} - """ - return cls(name="image", label="Image", type="image", display="input", required_block_params=["image"]) - - @classmethod - def images(cls) -> "MellonParam": - """ - Images output parameter. 
- - Mellon node definition: - "images": {"label": "Images", "type": "image", "display": "output"} - """ - return cls(name="images", label="Images", type="image", display="output", required_block_params=["images"]) - - @classmethod - def control_image(cls, display: str = "input") -> "MellonParam": - """ - Control image parameter for ControlNet. - - Mellon node definition (display="input"): - "control_image": {"label": "Control Image", "type": "image", "display": "input"} - """ - return cls( - name="control_image", - label="Control Image", - type="image", - display=display, - required_block_params=["control_image"], - ) - - @classmethod - def latents(cls, display: str = "input") -> "MellonParam": - """ - Latents parameter. - - Mellon node definition (display="input"): - "latents": {"label": "Latents", "type": "latents", "display": "input"} - - Mellon node definition (display="output"): - "latents": {"label": "Latents", "type": "latents", "display": "output"} - """ - return cls(name="latents", label="Latents", type="latents", display=display, required_block_params=["latents"]) - - @classmethod - def image_latents(cls, display: str = "input") -> "MellonParam": - """ - Image latents parameter for img2img workflows. - - Mellon node definition (display="input"): - "image_latents": {"label": "Image Latents", "type": "latents", "display": "input"} - """ - return cls( - name="image_latents", - label="Image Latents", - type="latents", - display=display, - required_block_params=["image_latents"], - ) - - @classmethod - def first_frame_latents(cls, display: str = "input") -> "MellonParam": - """ - First frame latents for video generation. 
- - Mellon node definition (display="input"): - "first_frame_latents": {"label": "First Frame Latents", "type": "latents", "display": "input"} - """ - return cls( - name="first_frame_latents", - label="First Frame Latents", - type="latents", - display=display, - required_block_params=["first_frame_latents"], - ) - - @classmethod - def image_latents_with_strength(cls) -> "MellonParam": - """ - Image latents with strength-based onChange behavior. When connected, shows strength slider; when disconnected, - shows height/width. - - Mellon node definition: - "image_latents": { - "label": "Image Latents", "type": "latents", "display": "input", "onChange": {"false": ["height", - "width"], "true": ["strength"]} - } - """ - return cls( - name="image_latents", - label="Image Latents", - type="latents", - display="input", - onChange={"false": ["height", "width"], "true": ["strength"]}, - required_block_params=["image_latents", "strength"], - ) - - @classmethod - def latents_preview(cls) -> "MellonParam": - """ - Latents preview output for visualizing latents in the UI. - - Mellon node definition: - "latents_preview": {"label": "Latents Preview", "type": "latent", "display": "output"} - """ - return cls(name="latents_preview", label="Latents Preview", type="latent", display="output") - - @classmethod - def embeddings(cls, display: str = "output") -> "MellonParam": - """ - Text embeddings parameter. - - Mellon node definition (display="output"): - "embeddings": {"label": "Text Embeddings", "type": "embeddings", "display": "output"} - - Mellon node definition (display="input"): - "embeddings": {"label": "Text Embeddings", "type": "embeddings", "display": "input"} - """ - return cls(name="embeddings", label="Text Embeddings", type="embeddings", display=display) - - @classmethod - def image_embeds(cls, display: str = "output") -> "MellonParam": - """ - Image embeddings parameter for IP-Adapter workflows. 
- - Mellon node definition (display="output"): - "image_embeds": {"label": "Image Embeddings", "type": "image_embeds", "display": "output"} - """ - return cls( - name="image_embeds", - label="Image Embeddings", - type="image_embeds", - display=display, - required_block_params=["image_embeds"], - ) - - @classmethod - def controlnet_conditioning_scale(cls, default: float = 0.5) -> "MellonParam": - """ - ControlNet conditioning scale slider. - - Mellon node definition (default=0.5): - "controlnet_conditioning_scale": { - "label": "Controlnet Conditioning Scale", "type": "float", "default": 0.5, "min": 0.0, "max": 1.0, - "step": 0.01 - } - """ - return cls( - name="controlnet_conditioning_scale", - label="Controlnet Conditioning Scale", - type="float", - default=default, - min=0.0, - max=1.0, - step=0.01, - required_block_params=["controlnet_conditioning_scale"], - ) - - @classmethod - def control_guidance_start(cls, default: float = 0.0) -> "MellonParam": - """ - Control guidance start timestep. - - Mellon node definition (default=0.0): - "control_guidance_start": { - "label": "Control Guidance Start", "type": "float", "default": 0.0, "min": 0.0, "max": 1.0, "step": - 0.01 - } - """ - return cls( - name="control_guidance_start", - label="Control Guidance Start", - type="float", - default=default, - min=0.0, - max=1.0, - step=0.01, - required_block_params=["control_guidance_start"], - ) - - @classmethod - def control_guidance_end(cls, default: float = 1.0) -> "MellonParam": - """ - Control guidance end timestep. 
- - Mellon node definition (default=1.0): - "control_guidance_end": { - "label": "Control Guidance End", "type": "float", "default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01 - } - """ - return cls( - name="control_guidance_end", - label="Control Guidance End", - type="float", - default=default, - min=0.0, - max=1.0, - step=0.01, - required_block_params=["control_guidance_end"], - ) - - @classmethod - def prompt(cls, default: str = "") -> "MellonParam": - """ - Text prompt input as textarea. - - Mellon node definition (default=""): - "prompt": {"label": "Prompt", "type": "string", "default": "", "display": "textarea"} - """ - return cls( - name="prompt", - label="Prompt", - type="string", - default=default, - display="textarea", - required_block_params=["prompt"], - ) - - @classmethod - def negative_prompt(cls, default: str = "") -> "MellonParam": - """ - Negative prompt input as textarea. - - Mellon node definition (default=""): - "negative_prompt": {"label": "Negative Prompt", "type": "string", "default": "", "display": "textarea"} - """ - return cls( - name="negative_prompt", - label="Negative Prompt", - type="string", - default=default, - display="textarea", - required_block_params=["negative_prompt"], - ) - - @classmethod - def strength(cls, default: float = 0.5) -> "MellonParam": - """ - Denoising strength for img2img. - - Mellon node definition (default=0.5): - "strength": {"label": "Strength", "type": "float", "default": 0.5, "min": 0.0, "max": 1.0, "step": 0.01} - """ - return cls( - name="strength", - label="Strength", - type="float", - default=default, - min=0.0, - max=1.0, - step=0.01, - required_block_params=["strength"], - ) - - @classmethod - def guidance_scale(cls, default: float = 5.0) -> "MellonParam": - """ - CFG guidance scale slider. 
- - Mellon node definition (default=5.0): - "guidance_scale": { - "label": "Guidance Scale", "type": "float", "display": "slider", "default": 5.0, "min": 1.0, "max": - 30.0, "step": 0.1 - } - """ - return cls( - name="guidance_scale", - label="Guidance Scale", - type="float", - display="slider", - default=default, - min=1.0, - max=30.0, - step=0.1, - ) - - @classmethod - def height(cls, default: int = 1024) -> "MellonParam": - """ - Image height in pixels. - - Mellon node definition (default=1024): - "height": {"label": "Height", "type": "int", "default": 1024, "min": 64, "step": 8} - """ - return cls( - name="height", - label="Height", - type="int", - default=default, - min=64, - step=8, - required_block_params=["height"], - ) - - @classmethod - def width(cls, default: int = 1024) -> "MellonParam": - """ - Image width in pixels. - - Mellon node definition (default=1024): - "width": {"label": "Width", "type": "int", "default": 1024, "min": 64, "step": 8} - """ - return cls( - name="width", label="Width", type="int", default=default, min=64, step=8, required_block_params=["width"] - ) - - @classmethod - def seed(cls, default: int = 0) -> "MellonParam": - """ - Random seed with randomize button. - - Mellon node definition (default=0): - "seed": { - "label": "Seed", "type": "int", "default": 0, "min": 0, "max": 4294967295, "display": "random" - } - """ - return cls( - name="seed", - label="Seed", - type="int", - default=default, - min=0, - max=4294967295, - display="random", - required_block_params=["generator"], - ) - - @classmethod - def num_inference_steps(cls, default: int = 25) -> "MellonParam": - """ - Number of denoising steps slider. 
- - Mellon node definition (default=25): - "num_inference_steps": { - "label": "Steps", "type": "int", "default": 25, "min": 1, "max": 100, "display": "slider" - } - """ - return cls( - name="num_inference_steps", - label="Steps", - type="int", - default=default, - min=1, - max=100, - display="slider", - required_block_params=["num_inference_steps"], - ) - - @classmethod - def num_frames(cls, default: int = 81) -> "MellonParam": - """ - Number of video frames slider. - - Mellon node definition (default=81): - "num_frames": {"label": "Frames", "type": "int", "default": 81, "min": 1, "max": 480, "display": "slider"} - """ - return cls( - name="num_frames", - label="Frames", - type="int", - default=default, - min=1, - max=480, - display="slider", - required_block_params=["num_frames"], - ) - - @classmethod - def layers(cls, default: int = 4) -> "MellonParam": - """ - Number of layers slider (for layered diffusion). - - Mellon node definition (default=4): - "layers": {"label": "Layers", "type": "int", "default": 4, "min": 1, "max": 10, "display": "slider"} - """ - return cls( - name="layers", - label="Layers", - type="int", - default=default, - min=1, - max=10, - display="slider", - required_block_params=["layers"], - ) - - @classmethod - def videos(cls) -> "MellonParam": - """ - Video output parameter. - - Mellon node definition: - "videos": {"label": "Videos", "type": "video", "display": "output"} - """ - return cls(name="videos", label="Videos", type="video", display="output", required_block_params=["videos"]) - - @classmethod - def vae(cls) -> "MellonParam": - """ - VAE model input. - - Mellon node definition: - "vae": {"label": "VAE", "type": "diffusers_auto_model", "display": "input"} - - Note: The value received is a model info dict with keys like 'model_id', 'repo_id', 'execution_device'. Use - components.get_one(model_id) to retrieve the actual model. 
- """ - return cls( - name="vae", label="VAE", type="diffusers_auto_model", display="input", required_block_params=["vae"] - ) - - @classmethod - def image_encoder(cls) -> "MellonParam": - """ - Image encoder model input. - - Mellon node definition: - "image_encoder": {"label": "Image Encoder", "type": "diffusers_auto_model", "display": "input"} - - Note: The value received is a model info dict with keys like 'model_id', 'repo_id', 'execution_device'. Use - components.get_one(model_id) to retrieve the actual model. - """ - return cls( - name="image_encoder", - label="Image Encoder", - type="diffusers_auto_model", - display="input", - required_block_params=["image_encoder"], - ) - - @classmethod - def unet(cls) -> "MellonParam": - """ - Denoising model (UNet/Transformer) input. - - Mellon node definition: - "unet": {"label": "Denoise Model", "type": "diffusers_auto_model", "display": "input"} - - Note: The value received is a model info dict with keys like 'model_id', 'repo_id', 'execution_device'. Use - components.get_one(model_id) to retrieve the actual model. - """ - return cls(name="unet", label="Denoise Model", type="diffusers_auto_model", display="input") - - @classmethod - def scheduler(cls) -> "MellonParam": - """ - Scheduler model input. - - Mellon node definition: - "scheduler": {"label": "Scheduler", "type": "diffusers_auto_model", "display": "input"} - - Note: The value received is a model info dict with keys like 'model_id', 'repo_id'. Use - components.get_one(model_id) to retrieve the actual scheduler. - """ - return cls(name="scheduler", label="Scheduler", type="diffusers_auto_model", display="input") - - @classmethod - def controlnet(cls) -> "MellonParam": - """ - ControlNet model input. - - Mellon node definition: - "controlnet": {"label": "ControlNet Model", "type": "diffusers_auto_model", "display": "input"} - - Note: The value received is a model info dict with keys like 'model_id', 'repo_id', 'execution_device'. 
Use - components.get_one(model_id) to retrieve the actual model. - """ - return cls( - name="controlnet", - label="ControlNet Model", - type="diffusers_auto_model", - display="input", - required_block_params=["controlnet"], - ) - - @classmethod - def text_encoders(cls) -> "MellonParam": - """ - Text encoders dict input (multiple encoders). - - Mellon node definition: - "text_encoders": {"label": "Text Encoders", "type": "diffusers_auto_models", "display": "input"} - - Note: The value received is a dict of model info dicts: - { - 'text_encoder': {'model_id': ..., 'execution_device': ..., ...}, 'tokenizer': {'model_id': ..., ...}, - 'repo_id': '...' - } - Use components.get_one(model_id) to retrieve each model. - """ - return cls( - name="text_encoders", - label="Text Encoders", - type="diffusers_auto_models", - display="input", - required_block_params=["text_encoder"], - ) - - @classmethod - def controlnet_bundle(cls, display: str = "input") -> "MellonParam": - """ - ControlNet bundle containing model and processed control inputs. Output from ControlNet node, input to Denoise - node. 
- - Mellon node definition (display="input"): - "controlnet_bundle": {"label": "ControlNet", "type": "custom_controlnet", "display": "input"} + # ========================================================================= + # Input: Generic input parameter factories (for custom blocks) + # ========================================================================= + class Input: + """input UI elements for custom blocks.""" + + @classmethod + def image(cls, name: str) -> "MellonParam": + """image input.""" + return MellonParam(name=name, label=_name_to_label(name), type="image", display="input") + + @classmethod + def textbox(cls, name: str, default: str = "") -> "MellonParam": + """text input as textarea.""" + return MellonParam( + name=name, label=_name_to_label(name), type="string", display="textarea", default=default + ) - Mellon node definition (display="output"): - "controlnet_bundle": {"label": "ControlNet", "type": "custom_controlnet", "display": "output"} + @classmethod + def dropdown(cls, name: str, options: List[str] = None, default: str = None) -> "MellonParam": + """dropdown selection.""" + if options and not default: + default = options[0] + if not default: + default = "" + if not options: + options = [default] + return MellonParam(name=name, label=_name_to_label(name), type="string", options=options, value=default) + + @classmethod + def slider( + cls, name: str, default: float = 0, min: float = None, max: float = None, step: float = None + ) -> "MellonParam": + """slider input.""" + is_float = isinstance(default, float) or (step is not None and isinstance(step, float)) + param_type = "float" if is_float else "int" + if min is None: + min = default + if max is None: + max = default + if step is None: + step = 0.01 if is_float else 1 + return MellonParam( + name=name, + label=_name_to_label(name), + type=param_type, + display="slider", + default=default, + min=min, + max=max, + step=step, + ) - Note: The value is a dict containing: - { - 'controlnet': 
{'model_id': ..., ...}, # controlnet model info 'control_image': ..., # processed control - image/embeddings 'controlnet_conditioning_scale': ..., # and other denoise block inputs - } - """ - return cls( - name="controlnet_bundle", - label="ControlNet", - type="custom_controlnet", - display=display, - required_block_params="controlnet_image", - ) + @classmethod + def number( + cls, name: str, default: float = 0, min: float = None, max: float = None, step: float = None + ) -> "MellonParam": + """number input (no slider).""" + is_float = isinstance(default, float) or (step is not None and isinstance(step, float)) + param_type = "float" if is_float else "int" + return MellonParam( + name=name, label=_name_to_label(name), type=param_type, default=default, min=min, max=max, step=step + ) - @classmethod - def ip_adapter(cls) -> "MellonParam": - """ - IP-Adapter input. + @classmethod + def seed(cls, name: str = "seed", default: int = 0) -> "MellonParam": + """seed input with randomize button.""" + return MellonParam( + name=name, + label=_name_to_label(name), + type="int", + display="random", + default=default, + min=0, + max=4294967295, + ) - Mellon node definition: - "ip_adapter": {"label": "IP Adapter", "type": "custom_ip_adapter", "display": "input"} - """ - return cls(name="ip_adapter", label="IP Adapter", type="custom_ip_adapter", display="input") + @classmethod + def checkbox(cls, name: str, default: bool = False) -> "MellonParam": + """boolean checkbox.""" + return MellonParam(name=name, label=_name_to_label(name), type="boolean", value=default) + + @classmethod + def custom_type(cls, name: str, type: str) -> "MellonParam": + """custom type input for node connections.""" + return MellonParam(name=name, label=_name_to_label(name), type=type, display="input") + + @classmethod + def model(cls, name: str) -> "MellonParam": + """model input for diffusers components.""" + return MellonParam(name=name, label=_name_to_label(name), type="diffusers_auto_model", 
display="input") + + # ========================================================================= + # Output: Generic output parameter factories (for custom blocks) + # ========================================================================= + class Output: + """output UI elements for custom blocks.""" + + @classmethod + def image(cls, name: str) -> "MellonParam": + """image output.""" + return MellonParam(name=name, label=_name_to_label(name), type="image", display="output") + + @classmethod + def video(cls, name: str) -> "MellonParam": + """video output.""" + return MellonParam(name=name, label=_name_to_label(name), type="video", display="output") + + @classmethod + def text(cls, name: str) -> "MellonParam": + """text output.""" + return MellonParam(name=name, label=_name_to_label(name), type="string", display="output") + + @classmethod + def custom_type(cls, name: str, type: str) -> "MellonParam": + """custom type output for node connections.""" + return MellonParam(name=name, label=_name_to_label(name), type=type, display="output") + + @classmethod + def model(cls, name: str) -> "MellonParam": + """model output for diffusers components.""" + return MellonParam(name=name, label=_name_to_label(name), type="diffusers_auto_model", display="output") + + +def input_param_to_mellon_param(input_param: "InputParam") -> MellonParam: + """ + Convert an InputParam to a MellonParam using metadata. - @classmethod - def guider(cls) -> "MellonParam": - """ - Custom guider input. When connected, hides the guidance_scale slider. + Args: + input_param: An InputParam with optional metadata containing either: + - {"mellon": ""} for simple types (image, textbox, slider, etc.) 
+ - {"mellon": MellonParam(...)} for full control over UI configuration - Mellon node definition: - "guider": { - "label": "Guider", "type": "custom_guider", "display": "input", "onChange": {false: ["guidance_scale"], - true: []} - } - """ - return cls( - name="guider", - label="Guider", - type="custom_guider", - display="input", - onChange={False: ["guidance_scale"], True: []}, - ) + Returns: + MellonParam instance + """ + name = input_param.name + metadata = input_param.metadata + mellon_value = metadata.get("mellon") if metadata else None + default = input_param.default + + # If it's already a MellonParam, return it directly + if isinstance(mellon_value, MellonParam): + return mellon_value + + mellon_type = mellon_value + + if mellon_type == "image": + return MellonParam.Input.image(name) + elif mellon_type == "textbox": + return MellonParam.Input.textbox(name, default=default or "") + elif mellon_type == "dropdown": + return MellonParam.Input.dropdown(name, default=default or "") + elif mellon_type == "slider": + return MellonParam.Input.slider(name, default=default or 0) + elif mellon_type == "number": + return MellonParam.Input.number(name, default=default or 0) + elif mellon_type == "seed": + return MellonParam.Input.seed(name, default=default or 0) + elif mellon_type == "checkbox": + return MellonParam.Input.checkbox(name, default=default or False) + elif mellon_type == "model": + return MellonParam.Input.model(name) + else: + # None or unknown -> custom + return MellonParam.Input.custom_type(name, type="custom") + + +def output_param_to_mellon_param(output_param: "OutputParam") -> MellonParam: + """ + Convert an OutputParam to a MellonParam using metadata. - @classmethod - def doc(cls) -> "MellonParam": - """ - Documentation output for inspecting the underlying modular pipeline. + Args: + output_param: An OutputParam with optional metadata={"mellon": "<type>"} where <type> is one of: + image, video, text, model. If metadata is None or unknown, maps to "custom".
- Mellon node definition: - "doc": {"label": "Doc", "type": "string", "display": "output"} - """ - return cls(name="doc", label="Doc", type="string", display="output") + Returns: + MellonParam instance + """ + name = output_param.name + metadata = output_param.metadata + mellon_type = metadata.get("mellon") if metadata else None + + if mellon_type == "image": + return MellonParam.Output.image(name) + elif mellon_type == "video": + return MellonParam.Output.video(name) + elif mellon_type == "text": + return MellonParam.Output.text(name) + elif mellon_type == "model": + return MellonParam.Output.model(name) + else: + # None or unknown -> custom + return MellonParam.Output.custom_type(name, type="custom") DEFAULT_NODE_SPECS = { @@ -804,10 +663,15 @@ def node_spec_to_mellon_dict(node_spec: Dict[str, Any], node_type: str) -> Dict[ params[p.name] = param_dict model_input_names.append(p.name) - # Process outputs + # Process outputs: add a prefix to the output name if it already exists as an input for p in node_spec.get("outputs", []): - params[p.name] = p.to_dict() - output_names.append(p.name) + if p.name in input_names: + # rename to out_ + output_name = f"out_{p.name}" + else: + output_name = p.name + params[output_name] = p.to_dict() + output_names.append(output_name) return { "params": params, @@ -959,7 +823,7 @@ def from_json_file(cls, json_file_path: Union[str, os.PathLike]) -> "MellonPipel return cls.from_dict(data) def save(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): - """Save the pipeline config to a directory.""" + """Save the mellon pipeline config to a directory.""" if os.path.isfile(save_directory): raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") @@ -975,15 +839,14 @@ def save(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = Fals token = kwargs.pop("token", None) repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) repo_id = 
create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id - subfolder = kwargs.pop("subfolder", None) - upload_folder( + upload_file( + path_or_fileobj=output_path, + path_in_repo=self.config_name, repo_id=repo_id, - folder_path=save_directory, token=token, commit_message=commit_message or "Upload MellonPipelineConfig", create_pr=create_pr, - path_in_repo=subfolder, ) logger.info(f"Pipeline config pushed to hub: {repo_id}") @@ -1150,3 +1013,83 @@ def filter_spec_for_block(template_spec: Dict[str, Any], block) -> Optional[Dict default_repo=default_repo, default_dtype=default_dtype, ) + + @classmethod + def from_custom_block( + cls, + block, + node_label: str = None, + input_types: Optional[Dict[str, str]] = None, + output_types: Optional[Dict[str, str]] = None, + ) -> "MellonPipelineConfig": + """ + Create a MellonPipelineConfig from a custom block. + + Args: + block: A block instance with `inputs`, `outputs`, and `expected_components`/`component_names` properties. + Each InputParam/OutputParam should have metadata={"mellon": "<type>"} where <type> is one of: image, + video, text, checkbox, number, slider, dropdown, model. If metadata is None, maps to "custom". + node_label: The display label for the node. Defaults to block class name with spaces. + input_types: + Optional dict mapping input param names to mellon types. Overrides the block's metadata if provided. + Example: {"prompt": "textbox", "image": "image"} + output_types: + Optional dict mapping output param names to mellon types. Overrides the block's metadata if provided.
+ Example: {"prompt": "text", "images": "image"} + + Returns: + MellonPipelineConfig instance + """ + if node_label is None: + class_name = block.__class__.__name__ + node_label = "".join([" " + c if c.isupper() else c for c in class_name]).strip() + + if input_types is None: + input_types = {} + if output_types is None: + output_types = {} + + inputs = [] + model_inputs = [] + outputs = [] + + # Process block inputs + for input_param in block.inputs: + if input_param.name is None: + continue + if input_param.name in input_types: + input_param = copy.copy(input_param) + input_param.metadata = {"mellon": input_types[input_param.name]} + print(f" processing input: {input_param.name}, metadata: {input_param.metadata}") + inputs.append(input_param_to_mellon_param(input_param)) + + # Process block outputs + for output_param in block.outputs: + if output_param.name is None: + continue + if output_param.name in output_types: + output_param = copy.copy(output_param) + output_param.metadata = {"mellon": output_types[output_param.name]} + outputs.append(output_param_to_mellon_param(output_param)) + + # Process expected components (all map to model inputs) + component_names = block.component_names + for component_name in component_names: + model_inputs.append(MellonParam.Input.model(component_name)) + + # Always add doc output + outputs.append(MellonParam.doc()) + + node_spec = { + "inputs": inputs, + "model_inputs": model_inputs, + "outputs": outputs, + "required_inputs": [], + "required_model_inputs": [], + "block_name": "custom", + } + + return cls( + node_specs={"custom": node_spec}, + label=node_label, + ) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index f3b12d716160..5481790a9405 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -520,6 +520,7 @@ class InputParam: required: bool = False 
description: str = "" kwargs_type: str = None + metadata: Dict[str, Any] = None def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" @@ -553,6 +554,7 @@ class OutputParam: type_hint: Any = None description: str = "" kwargs_type: str = None + metadata: Dict[str, Any] = None def __repr__(self): return (