From 9cf299a9f9488e4cb9b3f7cef3bc94c185c19f73 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 23 Jan 2026 16:50:48 -0800
Subject: [PATCH 1/2] Make regular empty latent node work properly on flux 2
 variants. (#12050)

---
 comfy/latent_formats.py              |  3 +++
 comfy/sample.py                      | 12 +++++++++---
 comfy_extras/nodes_custom_sampler.py |  6 ++++--
 comfy_extras/nodes_sd3.py            |  2 +-
 nodes.py                             |  5 +++--
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index cb4f52ce11fa..5600825ede08 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -8,6 +8,7 @@ class LatentFormat:
     latent_rgb_factors_bias = None
     latent_rgb_factors_reshape = None
     taesd_decoder_name = None
+    spacial_downscale_ratio = 8
 
     def process_in(self, latent):
         return latent * self.scale_factor
@@ -181,6 +182,7 @@ def process_out(self, latent):
 
 class Flux2(LatentFormat):
     latent_channels = 128
+    spacial_downscale_ratio = 16
 
     def __init__(self):
         self.latent_rgb_factors =[
@@ -749,6 +751,7 @@ class ACEAudio(LatentFormat):
 
 class ChromaRadiance(LatentFormat):
     latent_channels = 3
+    spacial_downscale_ratio = 1
 
     def __init__(self):
         self.latent_rgb_factors = [
diff --git a/comfy/sample.py b/comfy/sample.py
index 2f8f3a51c5fc..a2a39b527a41 100644
--- a/comfy/sample.py
+++ b/comfy/sample.py
@@ -37,12 +37,18 @@ def prepare_noise(latent_image, seed, noise_inds=None):
 
     return noises
 
-def fix_empty_latent_channels(model, latent_image):
+def fix_empty_latent_channels(model, latent_image, downscale_ratio_spacial=None):
     if latent_image.is_nested:
         return latent_image
     latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels
-    if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0:
-        latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
+    if torch.count_nonzero(latent_image) == 0:
+        if latent_format.latent_channels != latent_image.shape[1]:
+            latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
+        if downscale_ratio_spacial is not None:
+            if downscale_ratio_spacial != latent_format.spacial_downscale_ratio:
+                ratio = downscale_ratio_spacial / latent_format.spacial_downscale_ratio
+                latent_image = comfy.utils.common_upscale(latent_image, round(latent_image.shape[-1] * ratio), round(latent_image.shape[-2] * ratio), "nearest-exact", crop="disabled")
+
     if latent_format.latent_dimensions == 3 and latent_image.ndim == 4:
         latent_image = latent_image.unsqueeze(2)
     return latent_image
diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py
index 3eb40e93726d..a4d84ddf7509 100644
--- a/comfy_extras/nodes_custom_sampler.py
+++ b/comfy_extras/nodes_custom_sampler.py
@@ -741,7 +741,7 @@ def execute(cls, model, add_noise, noise_seed, cfg, positive, negative, sampler,
         latent = latent_image
         latent_image = latent["samples"]
         latent = latent.copy()
-        latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image)
+        latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None))
         latent["samples"] = latent_image
 
         if not add_noise:
@@ -760,6 +760,7 @@ def execute(cls, model, add_noise, noise_seed, cfg, positive, negative, sampler,
         samples = comfy.sample.sample_custom(model, noise, cfg, sampler, sigmas, positive, negative, latent_image, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=noise_seed)
 
         out = latent.copy()
+        out.pop("downscale_ratio_spacial", None)
         out["samples"] = samples
         if "x0" in x0_output:
             x0_out = model.model.process_latent_out(x0_output["x0"].cpu())
@@ -939,7 +940,7 @@ def execute(cls, noise, guider, sampler, sigmas, latent_image) -> io.NodeOutput:
         latent = latent_image
         latent_image = latent["samples"]
         latent = latent.copy()
-        latent_image = comfy.sample.fix_empty_latent_channels(guider.model_patcher, latent_image)
+        latent_image = comfy.sample.fix_empty_latent_channels(guider.model_patcher, latent_image, latent.get("downscale_ratio_spacial", None))
         latent["samples"] = latent_image
 
         noise_mask = None
@@ -954,6 +955,7 @@ def execute(cls, noise, guider, sampler, sigmas, latent_image) -> io.NodeOutput:
         samples = samples.to(comfy.model_management.intermediate_device())
 
         out = latent.copy()
+        out.pop("downscale_ratio_spacial", None)
         out["samples"] = samples
         if "x0" in x0_output:
             x0_out = guider.model_patcher.model.process_latent_out(x0_output["x0"].cpu())
diff --git a/comfy_extras/nodes_sd3.py b/comfy_extras/nodes_sd3.py
index 02e5e7dd807c..736213a473ed 100644
--- a/comfy_extras/nodes_sd3.py
+++ b/comfy_extras/nodes_sd3.py
@@ -55,7 +55,7 @@ def define_schema(cls):
     @classmethod
     def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
         latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=comfy.model_management.intermediate_device())
-        return io.NodeOutput({"samples":latent})
+        return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 8})
 
     generate = execute  # TODO: remove
 
diff --git a/nodes.py b/nodes.py
index 158106686996..b75247665c07 100644
--- a/nodes.py
+++ b/nodes.py
@@ -1230,7 +1230,7 @@ def INPUT_TYPES(s):
 
     def generate(self, width, height, batch_size=1):
         latent = torch.zeros([batch_size, 4, height // 8, width // 8], device=self.device)
-        return ({"samples":latent}, )
+        return ({"samples": latent, "downscale_ratio_spacial": 8}, )
 
 
 class LatentFromBatch:
@@ -1538,7 +1538,7 @@ def set_mask(self, samples, mask):
 
 def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False):
     latent_image = latent["samples"]
-    latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image)
+    latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None))
 
     if disable_noise:
         noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu")
@@ -1556,6 +1556,7 @@ def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive,
                                   denoise=denoise, disable_noise=disable_noise, start_step=start_step, last_step=last_step,
                                   force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
     out = latent.copy()
+    out.pop("downscale_ratio_spacial", None)
     out["samples"] = samples
     return (out, )
 

From 4e6a1b66a93ef91848bc4bbf2a84e0ea98efcfc9 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Fri, 23 Jan 2026 16:56:14 -0800
Subject: [PATCH 2/2] speed up and reduce VRAM of QWEN VAE and WAN (less so)
 (#12036)

* ops: introduce autopad for conv3d

This works around PyTorch's missing ability to apply causal padding as
part of the kernel and avoids massive weight duplication for padding.

* wan-vae: rework causal padding

This currently uses F.pad, which makes a full deep copy and is liable to
be the VRAM peak. Instead, hand spatial padding back to the op and
consolidate the temporal padding with the concatenation used for the
cache.

* wan-vae: implement zero pad fast path

The WAN VAE is also the QWEN VAE, where it is used on single images.
These convolutions are, however, zero-padded 3D convolutions, which
means that for a single frame the VAE is effectively just 2D, using only
the last element of the conv weight in the temporal dimension. Fast-path
this to avoid adding zeros that just evaporate in the convolution math
but still cost computation.
---
 comfy/ldm/wan/vae.py | 27 +++++++++++++++++----------
 comfy/ops.py         | 10 ++++++----
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py
index 08315f1a8d3e..40e76721300b 100644
--- a/comfy/ldm/wan/vae.py
+++ b/comfy/ldm/wan/vae.py
@@ -5,7 +5,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from comfy.ldm.modules.diffusionmodules.model import vae_attention
+from comfy.ldm.modules.diffusionmodules.model import vae_attention, torch_cat_if_needed
 
 import comfy.ops
 ops = comfy.ops.disable_weight_init
@@ -20,22 +20,29 @@ class CausalConv3d(ops.Conv3d):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self._padding = (self.padding[2], self.padding[2], self.padding[1],
-                         self.padding[1], 2 * self.padding[0], 0)
-        self.padding = (0, 0, 0)
+        self._padding = 2 * self.padding[0]
+        self.padding = (0, self.padding[1], self.padding[2])
 
     def forward(self, x, cache_x=None, cache_list=None, cache_idx=None):
         if cache_list is not None:
             cache_x = cache_list[cache_idx]
             cache_list[cache_idx] = None
 
-        padding = list(self._padding)
-        if cache_x is not None and self._padding[4] > 0:
-            cache_x = cache_x.to(x.device)
-            x = torch.cat([cache_x, x], dim=2)
-            padding[4] -= cache_x.shape[2]
+        if cache_x is None and x.shape[2] == 1:
+            #Fast path - the op will pad for us by truncating the weight
+            #and save math on a pile of zeros.
+            return super().forward(x, autopad="causal_zero")
+
+        if self._padding > 0:
+            padding_needed = self._padding
+            if cache_x is not None:
+                cache_x = cache_x.to(x.device)
+                padding_needed = max(0, padding_needed - cache_x.shape[2])
+            padding_shape = list(x.shape)
+            padding_shape[2] = padding_needed
+            padding = torch.zeros(padding_shape, device=x.device, dtype=x.dtype)
+            x = torch_cat_if_needed([padding, cache_x, x], dim=2)
             del cache_x
-        x = F.pad(x, padding)
 
         return super().forward(x)
 
diff --git a/comfy/ops.py b/comfy/ops.py
index 415c39e9202b..e406ba7edeab 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -203,7 +203,9 @@ class Conv3d(torch.nn.Conv3d, CastWeightBiasOp):
         def reset_parameters(self):
             return None
 
-        def _conv_forward(self, input, weight, bias, *args, **kwargs):
+        def _conv_forward(self, input, weight, bias, autopad=None, *args, **kwargs):
+            if autopad == "causal_zero":
+                weight = weight[:, :, -input.shape[2]:, :, :]
             if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16):
                 out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True)
                 if bias is not None:
@@ -212,15 +214,15 @@ def _conv_forward(self, input, weight, bias, *args, **kwargs):
             else:
                 return super()._conv_forward(input, weight, bias, *args, **kwargs)
 
-        def forward_comfy_cast_weights(self, input):
+        def forward_comfy_cast_weights(self, input, autopad=None):
             weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
-            x = self._conv_forward(input, weight, bias)
+            x = self._conv_forward(input, weight, bias, autopad=autopad)
             uncast_bias_weight(self, weight, bias, offload_stream)
             return x
 
         def forward(self, *args, **kwargs):
             run_every_op()
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0 or "autopad" in kwargs:
                 return self.forward_comfy_cast_weights(*args, **kwargs)
             else:
                 return super().forward(*args, **kwargs)