diff --git a/comfy/sd.py b/comfy/sd.py
index bc63d6ced30e..bc940740568e 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -976,7 +976,7 @@ def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=N
if overlap is not None:
args["overlap"] = overlap
- if dims == 1:
+ if dims == 1 or self.extra_1d_channel is not None:
args.pop("tile_y")
output = self.decode_tiled_1d(samples, **args)
elif dims == 2:
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index 74e62733eb39..00dd5ba908ef 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -3,6 +3,7 @@
from comfy import sd1_clip
import torch
import math
+import yaml
import comfy.utils
@@ -125,14 +126,43 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_06b", tokenizer=Qwen3Tokenizer)
+    def _metas_to_cot(self, *, return_yaml: bool = False, **kwargs) -> str:
+        """Render the user-supplied song metadata as a YAML block.
+
+        Only the keys ``bpm``, ``duration``, ``keyscale``, ``timesignature``,
+        ``language`` and ``caption`` are read from ``kwargs``; anything else
+        is ignored.  Entries whose value is ``None`` or the string
+        ``"unspecified"`` are dropped, a time signature written as ``"N/4"``
+        is shortened to ``"N"``, and strings consisting only of decimal
+        digits are emitted as integers.
+
+        Args:
+            return_yaml: When true, return the bare YAML text; otherwise the
+                text is wrapped in one leading and one trailing newline.
+            **kwargs: Metadata values keyed by the names listed above.
+
+        Returns:
+            The YAML text with sorted keys.  When no entry survives the
+            filtering this is ``""`` (``return_yaml=True``) or ``"\n\n"``.
+        """
+        meta_keys = ("bpm", "duration", "keyscale", "timesignature", "language", "caption")
+        user_metas = {k: kwargs[k] for k in meta_keys if k in kwargs}
+
+        timesignature = user_metas.get("timesignature")
+        if isinstance(timesignature, str) and timesignature.endswith("/4"):
+            user_metas["timesignature"] = timesignature.rsplit("/", 1)[0]
+
+        cleaned = {}
+        for key, value in user_metas.items():
+            # Explicit None/str checks (rather than membership in a set) so
+            # that unhashable values such as lists do not raise TypeError.
+            if value is None:
+                continue
+            if isinstance(value, str):
+                if value == "unspecified":
+                    continue
+                # isdecimal() only matches characters int() can parse;
+                # isdigit() also matches e.g. superscript digits, for which
+                # int() raises ValueError.
+                if value.isdecimal():
+                    value = int(value)
+            cleaned[key] = value
+
+        if cleaned:
+            meta_yaml = yaml.dump(cleaned, allow_unicode=True, sort_keys=True).strip()
+        else:
+            meta_yaml = ""
+        return meta_yaml if return_yaml else f"\n{meta_yaml}\n"
+
+    def _metas_to_cap(self, **kwargs) -> str:
+        """Format bpm, duration, keyscale and timesignature as a bulleted list.
+
+        Missing keys are shown as ``"N/A"``, except for ``duration``, which
+        falls back to ``"30 seconds"``.  A supplied duration is converted
+        with ``float()``, rounded up and suffixed with ``" seconds"``.
+
+        Returns:
+            One ``"- key: value"`` line per field, joined with newlines.
+
+        Raises:
+            TypeError: If ``duration`` is given but is not a str, int or float.
+        """
+        fields = ("bpm", "duration", "keyscale", "timesignature")
+        values = {}
+        for name in fields:
+            values[name] = kwargs.pop(name, "N/A")
+
+        raw_duration = values["duration"]
+        if raw_duration == "N/A":
+            values["duration"] = "30 seconds"
+        elif not isinstance(raw_duration, (str, int, float)):
+            raise TypeError("Unexpected type for duration key, must be str, int or float")
+        else:
+            values["duration"] = f"{math.ceil(float(raw_duration))} seconds"
+
+        return "\n".join([f"- {name}: {values[name]}" for name in fields])
+
def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
out = {}
lyrics = kwargs.get("lyrics", "")
- bpm = kwargs.get("bpm", 120)
duration = kwargs.get("duration", 120)
- keyscale = kwargs.get("keyscale", "C major")
- timesignature = kwargs.get("timesignature", 2)
- language = kwargs.get("language", "en")
+ language = kwargs.get("language")
seed = kwargs.get("seed", 0)
generate_audio_codes = kwargs.get("generate_audio_codes", True)
@@ -141,16 +171,20 @@ def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
top_p = kwargs.get("top_p", 0.9)
top_k = kwargs.get("top_k", 0.0)
+
duration = math.ceil(duration)
- meta_lm = 'bpm: {}\nduration: {}\nkeyscale: {}\ntimesignature: {}'.format(bpm, duration, keyscale, timesignature)
- lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n{}\n<|im_end|>\n<|im_start|>assistant\n\n{}\n\n\n<|im_end|>\n"
+ kwargs["duration"] = duration
+
+ cot_text = self._metas_to_cot(caption = text, **kwargs)
+ meta_cap = self._metas_to_cap(**kwargs)
+
+ lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n# Lyric\n{}\n<|im_end|>\n<|im_start|>assistant\n{}\n<|im_end|>\n"
- meta_cap = '- bpm: {}\n- timesignature: {}\n- keyscale: {}\n- duration: {}\n'.format(bpm, timesignature, keyscale, duration)
- out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, meta_lm), disable_weights=True)
- out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, ""), disable_weights=True)
+ out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, cot_text), disable_weights=True)
+ out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, "\n"), disable_weights=True)
- out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric{}<|endoftext|><|endoftext|>".format(language, lyrics), return_word_ids, disable_weights=True, **kwargs)
- out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}# Metas\n{}<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
+ out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric\n{}<|endoftext|><|endoftext|>".format(language if language is not None else "", lyrics), return_word_ids, disable_weights=True, **kwargs)
+ out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}\n# Metas\n{}\n<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
out["lm_metadata"] = {"min_tokens": duration * 5,
"seed": seed,
"generate_audio_codes": generate_audio_codes,
diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py
index bef723dce10c..b63dd8e9717e 100644
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -94,6 +94,19 @@ def execute(cls, vae, audio) -> IO.NodeOutput:
encode = execute # TODO: remove
+def vae_decode_audio(vae, samples, tile=None, overlap=None):
+    """Decode an audio latent into a waveform dict, attenuating loud output.
+
+    Args:
+        vae: Object providing ``decode`` and ``decode_tiled``; its optional
+            ``audio_sample_rate`` attribute is the fallback sample rate
+            (44100 when the attribute is absent).
+        samples: Latent dict; ``samples["samples"]`` is decoded and an
+            optional ``samples["sample_rate"]`` takes precedence over the
+            VAE's sample rate.
+        tile: Tile size for tiled decoding, or ``None`` to decode in one pass.
+        overlap: Overlap forwarded to ``decode_tiled``; unused when ``tile``
+            is ``None``.
+
+    Returns:
+        ``{"waveform": <decoded tensor>, "sample_rate": <rate>}``.
+    """
+    latent = samples["samples"]
+    if tile is None:
+        decoded = vae.decode(latent)
+    else:
+        # The 1D branch of decode_tiled pops "tile_y" before calling
+        # decode_tiled_1d, so a size passed only as tile_y is discarded.
+        # Pass it as tile_x too; tile_y stays because that pop has no default.
+        # NOTE(review): assumes decode_tiled forwards tile_x to
+        # decode_tiled_1d - confirm against comfy/sd.py.
+        decoded = vae.decode_tiled(latent, tile_x=tile, tile_y=tile, overlap=overlap)
+    audio = decoded.movedim(-1, 1)
+
+    # Divide by 5x the standard deviation over dims 1 and 2, but never by
+    # less than 1, so quiet audio is left unchanged.
+    std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
+    std[std < 1.0] = 1.0
+    audio /= std
+
+    if "sample_rate" in samples:
+        sample_rate = samples["sample_rate"]
+    else:
+        sample_rate = getattr(vae, "audio_sample_rate", 44100)
+    return {"waveform": audio, "sample_rate": sample_rate}
+
+
class VAEDecodeAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
@@ -111,16 +124,33 @@ def define_schema(cls):
@classmethod
def execute(cls, vae, samples) -> IO.NodeOutput:
- audio = vae.decode(samples["samples"]).movedim(-1, 1)
- std = torch.std(audio, dim=[1,2], keepdim=True) * 5.0
- std[std < 1.0] = 1.0
- audio /= std
- vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
- return IO.NodeOutput({"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]})
+ return IO.NodeOutput(vae_decode_audio(vae, samples))
decode = execute # TODO: remove
+class VAEDecodeAudioTiled(IO.ComfyNode):
+    """Node that decodes an audio latent in tiles via ``vae_decode_audio``."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node: latent, VAE, tile size and overlap in; audio out."""
+        node_inputs = [
+            IO.Latent.Input("samples"),
+            IO.Vae.Input("vae"),
+            IO.Int.Input("tile_size", default=512, min=32, max=8192, step=8),
+            IO.Int.Input("overlap", default=64, min=0, max=1024, step=8),
+        ]
+        node_outputs = [IO.Audio.Output()]
+        return IO.Schema(
+            node_id="VAEDecodeAudioTiled",
+            search_aliases=["latent to audio"],
+            display_name="VAE Decode Audio (Tiled)",
+            category="latent/audio",
+            inputs=node_inputs,
+            outputs=node_outputs,
+        )
+
+    @classmethod
+    def execute(cls, vae, samples, tile_size, overlap) -> IO.NodeOutput:
+        """Decode ``samples`` with ``vae`` using the given tile size and overlap."""
+        decoded = vae_decode_audio(vae, samples, tile=tile_size, overlap=overlap)
+        return IO.NodeOutput(decoded)
+
+
class SaveAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
@@ -675,6 +705,7 @@ async def get_node_list(self) -> list[type[IO.ComfyNode]]:
EmptyLatentAudio,
VAEEncodeAudio,
VAEDecodeAudio,
+ VAEDecodeAudioTiled,
SaveAudio,
SaveAudioMP3,
SaveAudioOpus,
diff --git a/comfy_extras/nodes_toolkit.py b/comfy_extras/nodes_toolkit.py
new file mode 100644
index 000000000000..71faf7226792
--- /dev/null
+++ b/comfy_extras/nodes_toolkit.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
+
+
+class CreateList(io.ComfyNode):
+    """Node that concatenates its auto-growing inputs into a single list."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node: a growing set of same-typed inputs, one list output."""
+        item_type = io.MatchType.Template("type")
+        growing_inputs = io.Autogrow.TemplatePrefix(
+            input=io.MatchType.Input("input", template=item_type),
+            prefix="input",
+        )
+        node_inputs = [io.Autogrow.Input("inputs", template=growing_inputs)]
+        list_output = io.MatchType.Output(
+            template=item_type,
+            is_output_list=True,
+            display_name="list",
+        )
+        return io.Schema(
+            node_id="CreateList",
+            display_name="Create List",
+            category="logic",
+            is_input_list=True,
+            search_aliases=["Image Iterator", "Text Iterator", "Iterator"],
+            inputs=node_inputs,
+            outputs=[list_output],
+        )
+
+    @classmethod
+    def execute(cls, inputs: io.Autogrow.Type) -> io.NodeOutput:
+        """Flatten every value of ``inputs``, in order, into one output list."""
+        merged = [item for group in inputs.values() for item in group]
+        return io.NodeOutput(merged)
+
+
+class ToolkitExtension(ComfyExtension):
+    """Extension exposing the node classes defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        """Return the node classes contributed by this extension."""
+        node_classes = [CreateList]
+        return node_classes
+
+
+async def comfy_entrypoint() -> ToolkitExtension:
+    """Create and return this module's ``ToolkitExtension`` instance."""
+    extension = ToolkitExtension()
+    return extension
diff --git a/comfyui_version.py b/comfyui_version.py
index 5d296cd1b680..706b3776332c 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
-__version__ = "0.12.2"
+__version__ = "0.12.3"
diff --git a/nodes.py b/nodes.py
index e11a8ed802c8..91de7a9d74ae 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2433,7 +2433,8 @@ async def init_builtin_extra_nodes():
"nodes_image_compare.py",
"nodes_zimage.py",
"nodes_lora_debug.py",
- "nodes_color.py"
+ "nodes_color.py",
+ "nodes_toolkit.py",
]
import_failed = []
diff --git a/pyproject.toml b/pyproject.toml
index 1ddcc3596b2c..f7925b92ad8a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
-version = "0.12.2"
+version = "0.12.3"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"
diff --git a/requirements.txt b/requirements.txt
index 0c401873a0cb..41cc9174b1c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.37.11
+comfyui-frontend-package==1.38.13
comfyui-workflow-templates==0.8.31
comfyui-embedded-docs==0.4.0
torch