From 6b4521b38a688cf6072d6fc849171022af1e1490 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Thu, 12 Feb 2026 12:02:19 +0800 Subject: [PATCH 01/15] [vm]: use max of virtual and actual size for root disk when no disk offering Resolves: ZSTAC-74683 Change-Id: Id0339ed0221e92e506f60745cde972cc3ee6d9ae --- header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java b/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java index 7007c592aea..99ee2173b98 100755 --- a/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java +++ b/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java @@ -847,7 +847,9 @@ public void setBootMode(String bootMode) { public long getRootDiskAllocateSize() { if (rootDiskOffering == null) { - return this.getImageSpec().getInventory().getSize(); + long virtualSize = this.getImageSpec().getInventory().getSize(); + long actualSize = this.getImageSpec().getInventory().getActualSize(); + return Math.max(virtualSize, actualSize); } return rootDiskOffering.getDiskSize(); } From 3b5bda3b76aef968a911d18e35b3b30bd0cab803 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Thu, 12 Feb 2026 13:52:13 +0800 Subject: [PATCH 02/15] [zbs]: enable tryNext and 30s timeout for getActiveClients MDS call When anti-split-brain check selects a disconnected MDS node, the HTTP call now times out after 30s instead of 5+ minutes, and automatically retries the next available MDS via tryNext mechanism. Resolves: ZSTAC-80595 Change-Id: I1be80f1b70cad1606eb38d1f0078c8f2781e6941 --- .../org/zstack/storage/zbs/ZbsStorageController.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java b/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java index db06239acb3..276ab367ba1 100644 --- a/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java +++ b/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java @@ -179,7 +179,10 @@ public List getActiveClients(String installPath, String prot if (VolumeProtocol.CBD.toString().equals(protocol)) { GetVolumeClientsCmd cmd = new GetVolumeClientsCmd(); cmd.setPath(installPath); - GetVolumeClientsRsp rsp = syncHttpCall(GET_VOLUME_CLIENTS_PATH, cmd, GetVolumeClientsRsp.class); + GetVolumeClientsRsp rsp = new HttpCaller<>(GET_VOLUME_CLIENTS_PATH, cmd, GetVolumeClientsRsp.class, + null, TimeUnit.SECONDS, 30, true) + .setTryNext(true) + .syncCall(); List clients = new ArrayList<>(); if (!rsp.isSuccess()) { @@ -1411,6 +1414,11 @@ public class HttpCaller { private boolean tryNext = false; + HttpCaller setTryNext(boolean tryNext) { + this.tryNext = tryNext; + return this; + } + public HttpCaller(String path, AgentCommand cmd, Class retClass, ReturnValueCompletion callback) { this(path, cmd, retClass, callback, null, 0, false); } From 80df074f8dd1140b278ce0979f2068d5c271d8e5 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Thu, 12 Feb 2026 14:22:40 +0800 Subject: [PATCH 03/15] [vm]: add Destroying->Stopped state transition When MN restarts during a destroy operation, the hypervisor may report the VM as Stopped. Without this transition, the state machine throws an exception and the VM stays stuck in Destroying state forever. Resolves: ZSTAC-80620 Change-Id: I037edba70d145a44a88ce0d3573089182fedb162 --- header/src/main/java/org/zstack/header/vm/VmInstanceState.java | 1 + 1 file changed, 1 insertion(+) diff --git a/header/src/main/java/org/zstack/header/vm/VmInstanceState.java b/header/src/main/java/org/zstack/header/vm/VmInstanceState.java index 8a755b52fda..49303e23252 100755 --- a/header/src/main/java/org/zstack/header/vm/VmInstanceState.java +++ b/header/src/main/java/org/zstack/header/vm/VmInstanceState.java @@ -168,6 +168,7 @@ public enum VmInstanceState { new Transaction(VmInstanceStateEvent.destroyed, VmInstanceState.Destroyed), new Transaction(VmInstanceStateEvent.destroying, VmInstanceState.Destroying), new Transaction(VmInstanceStateEvent.running, VmInstanceState.Running), + new Transaction(VmInstanceStateEvent.stopped, VmInstanceState.Stopped), new Transaction(VmInstanceStateEvent.expunging, VmInstanceState.Expunging) ); Destroyed.transactions( From a84a36e2515e6b6bdc69f80420e62364e0832a90 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Thu, 12 Feb 2026 16:12:49 +0800 Subject: [PATCH 04/15] [ceph]: apply over-provisioning ratio when releasing snapshot capacity Resolves: ZSTAC-79709 Change-Id: I45a2133bbb8c51c25ae3549d59e588976192a08d --- .../org/zstack/storage/ceph/primary/CephPrimaryStorageBase.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageBase.java b/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageBase.java index d80b40a1d6a..8b387306683 100755 --- a/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageBase.java +++ b/plugin/ceph/src/main/java/org/zstack/storage/ceph/primary/CephPrimaryStorageBase.java @@ -5446,7 +5446,7 @@ private void deleteSnapshotOnPrimaryStorage(final DeleteSnapshotOnPrimaryStorage httpCall(DELETE_SNAPSHOT_PATH, cmd, DeleteSnapshotRsp.class, new ReturnValueCompletion(msg) { @Override public void success(DeleteSnapshotRsp returnValue) { - osdHelper.releaseAvailableCapacity(msg.getSnapshot().getPrimaryStorageInstallPath(), msg.getSnapshot().getSize()); + osdHelper.releaseAvailableCapWithRatio(msg.getSnapshot().getPrimaryStorageInstallPath(), msg.getSnapshot().getSize()); bus.reply(msg, reply); completion.done(); } From f19223a6e72678f20610ae04226fc20c0cf1bb5b Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Fri, 13 Feb 2026 12:40:04 +0800 Subject: [PATCH 05/15] [loadBalancer]: block SLB deletion during grayscale upgrade Resolves: ZSTAC-78989 Change-Id: I0fe3a56ab724978944c69afadaab7ff7353e4c0f --- .../service/lb/LoadBalancerApiInterceptor.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/plugin/loadBalancer/src/main/java/org/zstack/network/service/lb/LoadBalancerApiInterceptor.java b/plugin/loadBalancer/src/main/java/org/zstack/network/service/lb/LoadBalancerApiInterceptor.java index 97b88c919c2..0d9946d5320 100755 --- a/plugin/loadBalancer/src/main/java/org/zstack/network/service/lb/LoadBalancerApiInterceptor.java +++ b/plugin/loadBalancer/src/main/java/org/zstack/network/service/lb/LoadBalancerApiInterceptor.java @@ -39,6 +39,7 @@ import org.zstack.network.service.vip.VipVO_; import org.zstack.tag.PatternedSystemTag; import org.zstack.tag.TagManager; +import org.zstack.core.upgrade.UpgradeGlobalConfig; import org.zstack.utils.*; import org.zstack.utils.function.ForEachFunction; import org.zstack.utils.logging.CLogger; @@ -152,10 +153,22 @@ public APIMessage intercept(APIMessage msg) throws ApiMessageInterceptionExcepti validate((APIGetCandidateVmNicsForLoadBalancerServerGroupMsg)msg); } else if (msg instanceof APIChangeLoadBalancerBackendServerMsg) { validate((APIChangeLoadBalancerBackendServerMsg)msg); + } else if (msg instanceof APIDeleteLoadBalancerMsg) { + validate((APIDeleteLoadBalancerMsg) msg); } return msg; } + private void validate(APIDeleteLoadBalancerMsg msg) { + if (UpgradeGlobalConfig.GRAYSCALE_UPGRADE.value(Boolean.class)) { + LoadBalancerVO lb = dbf.findByUuid(msg.getUuid(), LoadBalancerVO.class); + if (lb != null && lb.getType() == LoadBalancerType.SLB) { + throw new ApiMessageInterceptionException(argerr( + "cannot delete the standalone load balancer[uuid:%s] during grayscale upgrade", msg.getUuid())); + } + } + } + private void validate(APIDeleteAccessControlListMsg msg) { /*List refs = Q.New(LoadBalancerListenerACLRefVO.class).select(LoadBalancerListenerACLRefVO_.listenerUuid) .eq(LoadBalancerListenerACLRefVO_.aclUuid, msg.getUuid()).isNull(LoadBalancerListenerACLRefVO_.serverGroupUuid).listValues(); From 24d4f3b4870ea72fce77bcf64980d0b70b868502 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Fri, 13 Feb 2026 13:32:45 +0800 Subject: [PATCH 06/15] [i18n]: improve snapshot error message for unattached volume Resolves: ZSTAC-82153 Change-Id: Ib51c2e21553277416d1a9444be55aca2aa4b2fc4 --- conf/i18n/globalErrorCodeMapping/global-error-en_US.json | 2 +- conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/i18n/globalErrorCodeMapping/global-error-en_US.json b/conf/i18n/globalErrorCodeMapping/global-error-en_US.json index 32eb4c8f056..715e823d95e 100644 --- a/conf/i18n/globalErrorCodeMapping/global-error-en_US.json +++ b/conf/i18n/globalErrorCodeMapping/global-error-en_US.json @@ -3374,7 +3374,7 @@ "ORG_ZSTACK_NETWORK_HUAWEI_IMASTER_10019": "delete token of SDN controller [IP:%s] failed because %s", "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10004": "Cannot execute volume mapping to host flow due to invalid volume ID.%s", "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10007": "port forwarding rule [uuid:%s] has not been attached to any virtual machine network interface, cannot detach", - "ORG_ZSTACK_MEVOCO_10088": "cannot take a snapshot for volumes[%s] when volume[uuid: %s] is not attached", + "ORG_ZSTACK_MEVOCO_10088": "cannot create snapshot for volume[uuid:%s] because it is not attached to any VM instance. Please attach the volume to a VM first. Affected volumes: %s", "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10005": "Cannot execute map LUN to host flow due to invalid LUN type: %s", "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10008": "port forwarding rule [uuid:%s] has been associated with vm nic [uuid:%s], cannot be reassigned again", "ORG_ZSTACK_MEVOCO_10087": "A Running VM[uuid:%s] has no associated Host UUID.", diff --git a/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json b/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json index 84609838ddc..01960e8eb45 100644 --- a/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json +++ b/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json @@ -3374,7 +3374,7 @@ "ORG_ZSTACK_NETWORK_HUAWEI_IMASTER_10019": "删除 SDN 控制器 [IP:%s] 的令牌失败,因为 %s", "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10004": "无法执行映射LUN到主机流程,无效的LUN ID", "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10007": "端口转发规则 rule[uuid:%s] 没有绑定到任何 VM 的网卡上,无法解除绑定", - "ORG_ZSTACK_MEVOCO_10088": "无法为挂载状态以外的卷[%s]创建快照", + "ORG_ZSTACK_MEVOCO_10088": "无法为云盘[uuid:%s]创建快照,因为该云盘未挂载到任何云主机。请先将云盘挂载到云主机后再创建快照。相关云盘: %s", "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10005": "无法执行映射LUN到主机流程,无效的LUN类型", "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10008": "端口转发规则[uuid:%s]已绑定到VM网卡[uuid:%s],无法再次绑定", "ORG_ZSTACK_MEVOCO_10087": "如何一个运行中的VM[uuid:%s]没有宿主机uuid?", From f563992d30c2ab9484acc944e6148a5fc1f39f18 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Fri, 13 Feb 2026 15:09:15 +0800 Subject: [PATCH 07/15] [compute]: add null check for VmNicVO in afterDelIpAddress and afterAddIpAddress to prevent NPE during rollback Resolves: ZSTAC-81741 Change-Id: I53bcf20a10306afc7b6172da294d347b74e6c41f --- .../main/java/org/zstack/compute/vm/VmNicManagerImpl.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/compute/src/main/java/org/zstack/compute/vm/VmNicManagerImpl.java b/compute/src/main/java/org/zstack/compute/vm/VmNicManagerImpl.java index 25c0b005d71..31b3e35d32a 100644 --- a/compute/src/main/java/org/zstack/compute/vm/VmNicManagerImpl.java +++ b/compute/src/main/java/org/zstack/compute/vm/VmNicManagerImpl.java @@ -58,6 +58,10 @@ public void afterAddIpAddress(String vmNicUUid, String usedIpUuid) { SQL.New(UsedIpVO.class).eq(UsedIpVO_.uuid, usedIpUuid).set(UsedIpVO_.vmNicUuid, vmNicUUid).update(); VmNicVO nic = Q.New(VmNicVO.class).eq(VmNicVO_.uuid, vmNicUUid).find(); + if (nic == null) { + logger.debug(String.format("VmNic[uuid:%s] not found, skip afterAddIpAddress", vmNicUUid)); + return; + } UsedIpVO temp = null; /* if there is ipv4 addresses, we put the first attached ipv4 address to VmNic.ip @@ -88,6 +92,10 @@ public void afterAddIpAddress(String vmNicUUid, String usedIpUuid) { @Override public void afterDelIpAddress(String vmNicUUid, String usedIpUuid) { VmNicVO nic = Q.New(VmNicVO.class).eq(VmNicVO_.uuid, vmNicUUid).find(); + if (nic == null) { + logger.debug(String.format("VmNic[uuid:%s] not found, skip afterDelIpAddress", vmNicUUid)); + return; + } if (nic.getUsedIpUuid() != null && !nic.getUsedIpUuid().equals(usedIpUuid)) { return; } From 65453500d7614d0ebdb86bf5c601dcab08f360a0 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Fri, 13 Feb 2026 15:21:44 +0800 Subject: [PATCH 08/15] [network]: filter reserved IPs from GetFreeIp API results Resolves: ZSTAC-81182 Change-Id: Id1bb642154dc66ae9995dcc4d9fc00cdce9bcaf8 --- .../main/java/org/zstack/network/l3/L3BasicNetwork.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/network/src/main/java/org/zstack/network/l3/L3BasicNetwork.java b/network/src/main/java/org/zstack/network/l3/L3BasicNetwork.java index 5536a5fc487..b1b0b92d497 100755 --- a/network/src/main/java/org/zstack/network/l3/L3BasicNetwork.java +++ b/network/src/main/java/org/zstack/network/l3/L3BasicNetwork.java @@ -1075,6 +1075,13 @@ private void handle(APIGetFreeIpMsg msg) { } limit -= freeIpInventorys.size(); } + + Set reservedIpRanges = self.getReservedIpRanges(); + if (reservedIpRanges != null && !reservedIpRanges.isEmpty()) { + freeIpInventorys.removeIf(freeIp -> reservedIpRanges.stream().anyMatch( + r -> NetworkUtils.isInRange(freeIp.getIp(), r.getStartIp(), r.getEndIp()))); + } + reply.setInventories(freeIpInventorys); bus.reply(msg, reply); From 26b8b1a82959fd616637cf63cf6a46b725872173 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Thu, 12 Feb 2026 13:17:20 +0800 Subject: [PATCH 09/15] [mn]: synchronize hash ring operations to prevent dual-MN task stalling In dual management node scenarios, concurrent modifications to the consistent hash ring from heartbeat reconciliation and canonical event callbacks can cause NodeHash/Nodes inconsistency, leading to message routing failures and task timeouts. Fix: (1) synchronized all ResourceDestinationMakerImpl methods to ensure atomic nodeHash+nodes updates, (2) added lifecycleLock in ManagementNodeManagerImpl to serialize heartbeat reconciliation with event callbacks, (3) added two-round delayed confirmation before removing nodes from hash ring to avoid race with NodeJoin events. Resolves: ZSTAC-77711 Change-Id: I3d33d53595dd302784dff17417a5b25f2d0f3426 --- .../ResourceDestinationMakerImpl.java | 28 ++--- .../ManagementNodeManagerImpl.java | 101 ++++++++++++------ 2 files changed, 82 insertions(+), 47 deletions(-) diff --git a/core/src/main/java/org/zstack/core/cloudbus/ResourceDestinationMakerImpl.java b/core/src/main/java/org/zstack/core/cloudbus/ResourceDestinationMakerImpl.java index 08a776f1db2..9ead578395d 100755 --- a/core/src/main/java/org/zstack/core/cloudbus/ResourceDestinationMakerImpl.java +++ b/core/src/main/java/org/zstack/core/cloudbus/ResourceDestinationMakerImpl.java @@ -27,27 +27,27 @@ public class ResourceDestinationMakerImpl implements ManagementNodeChangeListene private DatabaseFacade dbf; @Override - public void nodeJoin(ManagementNodeInventory inv) { + public synchronized void nodeJoin(ManagementNodeInventory inv) { nodeHash.add(inv.getUuid()); nodes.put(inv.getUuid(), new NodeInfo(inv)); } @Override - public void nodeLeft(ManagementNodeInventory inv) { + public synchronized void nodeLeft(ManagementNodeInventory inv) { String nodeId = inv.getUuid(); nodeHash.remove(nodeId); nodes.remove(nodeId); } @Override - public void iAmDead(ManagementNodeInventory inv) { + public synchronized void iAmDead(ManagementNodeInventory inv) { String nodeId = inv.getUuid(); nodeHash.remove(nodeId); nodes.remove(nodeId); } @Override - public void iJoin(ManagementNodeInventory inv) { + public synchronized void iJoin(ManagementNodeInventory inv) { List lst = Q.New(ManagementNodeVO.class).list(); lst.forEach((ManagementNodeVO node) -> { nodeHash.add(node.getUuid()); @@ -56,7 +56,7 @@ public void iJoin(ManagementNodeInventory inv) { } @Override - public String makeDestination(String resourceUuid) { + public synchronized String makeDestination(String resourceUuid) { String nodeUuid = nodeHash.get(resourceUuid); if (nodeUuid == null) { throw new CloudRuntimeException("Cannot find any available management node to send message"); @@ -66,18 +66,18 @@ public String makeDestination(String resourceUuid) { } @Override - public boolean isManagedByUs(String resourceUuid) { + public synchronized boolean isManagedByUs(String resourceUuid) { String nodeUuid = makeDestination(resourceUuid); return nodeUuid.equals(Platform.getManagementServerId()); } @Override - public Collection getManagementNodesInHashRing() { - return nodeHash.getNodes(); + public synchronized Collection getManagementNodesInHashRing() { + return new ArrayList<>(nodeHash.getNodes()); } @Override - public NodeInfo getNodeInfo(String nodeUuid) { + public synchronized NodeInfo getNodeInfo(String nodeUuid) { NodeInfo info = nodes.get(nodeUuid); if (info == null) { ManagementNodeVO vo = dbf.findByUuid(nodeUuid, ManagementNodeVO.class); @@ -93,17 +93,17 @@ public NodeInfo getNodeInfo(String nodeUuid) { } @Override - public Collection getAllNodeInfo() { - return nodes.values(); + public synchronized Collection getAllNodeInfo() { + return new ArrayList<>(nodes.values()); } @Override - public int getManagementNodeCount() { - return nodes.values().size(); + public synchronized int getManagementNodeCount() { + return nodes.size(); } - public boolean isNodeInCircle(String nodeId) { + public synchronized boolean isNodeInCircle(String nodeId) { return nodeHash.hasNode(nodeId); } } diff --git a/portal/src/main/java/org/zstack/portal/managementnode/ManagementNodeManagerImpl.java b/portal/src/main/java/org/zstack/portal/managementnode/ManagementNodeManagerImpl.java index a945ab77274..4ece718ff52 100755 --- a/portal/src/main/java/org/zstack/portal/managementnode/ManagementNodeManagerImpl.java +++ b/portal/src/main/java/org/zstack/portal/managementnode/ManagementNodeManagerImpl.java @@ -74,6 +74,7 @@ import java.sql.SQLException; import java.sql.Timestamp; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -107,6 +108,15 @@ public class ManagementNodeManagerImpl extends AbstractService implements Manage // A dictionary (nodeId -> ManagementNodeInventory) of joined management Node final private Map joinedManagementNodes = new ConcurrentHashMap<>(); + // Lock to serialize lifecycle events from heartbeat reconciliation and canonical event callbacks, + // preventing race conditions where a nodeJoin event is immediately followed by a stale nodeLeft + // from the heartbeat thread, or vice versa. See ZSTAC-77711. + private final Object lifecycleLock = new Object(); + + // Track nodes found in hash ring but missing from DB. Only call nodeLeft after a node + // is missing for two consecutive heartbeat cycles, to avoid removing nodes that just joined. + private final Set suspectedMissingFromDb = new HashSet<>(); + private static int NODE_STARTING = 0; private static int NODE_RUNNING = 1; private static int NODE_FAILED = -1; @@ -368,12 +378,16 @@ protected void run(Map tokens, Object data) { ManagementNodeLifeCycleData d = (ManagementNodeLifeCycleData) data; - if (LifeCycle.NodeJoin.toString().equals(d.getLifeCycle())) { - nodeLifeCycle.nodeJoin(d.getInventory()); - } else if (LifeCycle.NodeLeft.toString().equals(d.getLifeCycle())) { - nodeLifeCycle.nodeLeft(d.getInventory()); - } else { - throw new CloudRuntimeException(String.format("unknown lifecycle[%s]", d.getLifeCycle())); + synchronized (lifecycleLock) { + if (LifeCycle.NodeJoin.toString().equals(d.getLifeCycle())) { + // Clear from suspected set since the node is confirmed alive + suspectedMissingFromDb.remove(d.getInventory().getUuid()); + nodeLifeCycle.nodeJoin(d.getInventory()); + } else if (LifeCycle.NodeLeft.toString().equals(d.getLifeCycle())) { + nodeLifeCycle.nodeLeft(d.getInventory()); + } else { + throw new CloudRuntimeException(String.format("unknown lifecycle[%s]", d.getLifeCycle())); + } } } }; @@ -860,34 +874,55 @@ private void checkAllNodesHealth() { Set nodeUuidsInDb = nodesInDb.stream().map(ManagementNodeVO::getUuid).collect(Collectors.toSet()); - // When a node is dying, we may not receive the the dead notification because the message bus may be also dead - // at that moment. By checking if the node UUID is still in our hash ring, we know what nodes should be kicked out - destinationMaker.getManagementNodesInHashRing().forEach(nodeUuid -> { - if (!nodeUuidsInDb.contains(nodeUuid)) { - logger.warn(String.format("found that a management node[uuid:%s] had no heartbeat in database but still in our hash ring," + - "notify that it's dead", nodeUuid)); - ManagementNodeInventory inv = new ManagementNodeInventory(); - inv.setUuid(nodeUuid); - inv.setHostName(destinationMaker.getNodeInfo(nodeUuid).getNodeIP()); - - nodeLifeCycle.nodeLeft(inv); - } - }); - - // check if any node missing in our hash ring - nodesInDb.forEach(n -> { - if (n.getUuid().equals(node().getUuid()) || suspects.contains(n)) { - return; - } - - new Runnable() { - @Override - @AsyncThread - public void run() { - nodeLifeCycle.nodeJoin(ManagementNodeInventory.valueOf(n)); + // Reconcile hash ring with DB under lifecycleLock to prevent race with + // canonical event callbacks (nodeJoin/nodeLeft). See ZSTAC-77711. + synchronized (lifecycleLock) { + // When a node is dying, we may not receive the dead notification because the message bus may be also dead + // at that moment. By checking if the node UUID is still in our hash ring, we know what nodes should be kicked out. + // Use two-round confirmation: first round marks as suspected, second round actually removes. + Set currentSuspected = new HashSet<>(); + destinationMaker.getManagementNodesInHashRing().forEach(nodeUuid -> { + if (!nodeUuidsInDb.contains(nodeUuid)) { + if (suspectedMissingFromDb.contains(nodeUuid)) { + // Second consecutive detection — confirmed missing, remove from hash ring + logger.warn(String.format("management node[uuid:%s] confirmed missing from database for two consecutive" + + " heartbeat cycles, removing from hash ring", nodeUuid)); + ManagementNodeInventory inv = new ManagementNodeInventory(); + inv.setUuid(nodeUuid); + try { + inv.setHostName(destinationMaker.getNodeInfo(nodeUuid).getNodeIP()); + } catch (Exception e) { + logger.warn(String.format("cannot get node info for node[uuid:%s], use empty hostname", nodeUuid)); + } + + nodeLifeCycle.nodeLeft(inv); + } else { + // First detection — mark as suspected, defer removal to next cycle + logger.warn(String.format("management node[uuid:%s] not found in database but still in hash ring," + + " marking as suspected (will remove on next heartbeat if still missing)", nodeUuid)); + currentSuspected.add(nodeUuid); + } } - }.run(); - }); + }); + // Update suspected set: only keep nodes that are newly suspected this round + suspectedMissingFromDb.clear(); + suspectedMissingFromDb.addAll(currentSuspected); + + // check if any node missing in our hash ring + nodesInDb.forEach(n -> { + if (n.getUuid().equals(node().getUuid()) || suspects.contains(n)) { + return; + } + + new Runnable() { + @Override + @AsyncThread + public void run() { + nodeLifeCycle.nodeJoin(ManagementNodeInventory.valueOf(n)); + } + }.run(); + }); + } } @Override From aaeaf39c323da0013c01cab8ff58d0fbb3163c9e Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Fri, 13 Feb 2026 19:25:07 +0800 Subject: [PATCH 10/15] [storage]: desensitize mdsUrls in ExternalPrimaryStorageInventory The mdsUrls field in ExternalPrimaryStorage config contains user:password@host format credentials. Add desensitization to mask credentials as ***@host in API/CLI output. Resolves: ZSTAC-80664 Change-Id: I94bdede5a1b52eb039de70efb5458693484405f7 --- .../ExternalPrimaryStorageInventory.java | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/header/src/main/java/org/zstack/header/storage/addon/primary/ExternalPrimaryStorageInventory.java b/header/src/main/java/org/zstack/header/storage/addon/primary/ExternalPrimaryStorageInventory.java index 7808c227623..a15ed211307 100644 --- a/header/src/main/java/org/zstack/header/storage/addon/primary/ExternalPrimaryStorageInventory.java +++ b/header/src/main/java/org/zstack/header/storage/addon/primary/ExternalPrimaryStorageInventory.java @@ -4,8 +4,10 @@ import org.zstack.header.storage.primary.PrimaryStorageInventory; import org.zstack.utils.gson.JSONObjectUtil; +import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.stream.Collectors; @Inventory(mappingVOClass = ExternalPrimaryStorageVO.class) @@ -59,6 +61,7 @@ public ExternalPrimaryStorageInventory(ExternalPrimaryStorageVO lvo) { super(lvo); identity = lvo.getIdentity(); config = JSONObjectUtil.toObject(lvo.getConfig(), LinkedHashMap.class); + desensitizeConfig(config); addonInfo = JSONObjectUtil.toObject(lvo.getAddonInfo(), LinkedHashMap.class); outputProtocols = lvo.getOutputProtocols().stream().map(PrimaryStorageOutputProtocolRefVO::getOutputProtocol).collect(Collectors.toList()); defaultProtocol = lvo.getDefaultProtocol(); @@ -68,6 +71,35 @@ public static ExternalPrimaryStorageInventory valueOf(ExternalPrimaryStorageVO l return new ExternalPrimaryStorageInventory(lvo); } + private static void desensitizeConfig(Map config) { + if (config == null) return; + desensitizeUrlList(config, "mdsUrls"); + desensitizeUrlList(config, "mdsInfos"); + } + + private static void desensitizeUrlList(Map config, String key) { + Object urls = config.get(key); + if (urls instanceof List) { + List desensitized = new ArrayList<>(); + for (Object url : (List) urls) { + desensitized.add(desensitizeUrl(String.valueOf(url))); + } + config.put(key, desensitized); + } + } + + private static String desensitizeUrl(String url) { + int atIndex = url.lastIndexOf('@'); + if (atIndex > 0) { + int schemeIndex = url.indexOf("://"); + if (schemeIndex >= 0 && schemeIndex < atIndex) { + return url.substring(0, schemeIndex + 3) + "***" + url.substring(atIndex); + } + return "***" + url.substring(atIndex); + } + return url; + } + public String getIdentity() { return identity; } From f41558d0f404210562a7e13e05a64b928932cb4f Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Mon, 16 Feb 2026 17:36:51 +0800 Subject: [PATCH 11/15] [volumebackup]: add backup cancel timeout error code Add ORG_ZSTACK_STORAGE_BACKUP_CANCEL_TIMEOUT constant to CloudOperationsErrorCode for use in premium volumebackup module. Resolves: ZSTAC-82195 Change-Id: Ibc405876e1171b637cf76b91a6822574fb6e7811 --- .../zstack/utils/clouderrorcode/CloudOperationsErrorCode.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/src/main/java/org/zstack/utils/clouderrorcode/CloudOperationsErrorCode.java b/utils/src/main/java/org/zstack/utils/clouderrorcode/CloudOperationsErrorCode.java index a0f09d4f1e9..a550fb7d673 100644 --- a/utils/src/main/java/org/zstack/utils/clouderrorcode/CloudOperationsErrorCode.java +++ b/utils/src/main/java/org/zstack/utils/clouderrorcode/CloudOperationsErrorCode.java @@ -6274,6 +6274,8 @@ public class CloudOperationsErrorCode { public static final String ORG_ZSTACK_STORAGE_BACKUP_10133 = "ORG_ZSTACK_STORAGE_BACKUP_10133"; + public static final String ORG_ZSTACK_STORAGE_BACKUP_CANCEL_TIMEOUT = "ORG_ZSTACK_STORAGE_BACKUP_CANCEL_TIMEOUT"; + public static final String ORG_ZSTACK_COMPUTE_10000 = "ORG_ZSTACK_COMPUTE_10000"; public static final String ORG_ZSTACK_COMPUTE_10001 = "ORG_ZSTACK_COMPUTE_10001"; From 7f53f5a5ae1064721fba6ad1c4ab914b5862d3e7 Mon Sep 17 00:00:00 2001 From: "yaohua.wu" Date: Mon, 16 Feb 2026 21:25:44 +0800 Subject: [PATCH 12/15] [thread]: guard Context.current() with telemetry check SyncTaskFuture constructor calls Context.current() unconditionally, triggering ServiceLoader for ContextStorageProvider even when telemetry is disabled. If sentry-opentelemetry-bootstrap jar is on classpath, ServiceLoader fails with "not a subtype" due to ClassLoader isolation in Tomcat, throwing ServiceConfigurationError (extends Error) that escapes all catch(Exception) blocks. 1. Why is this change necessary? MN startup crashes with ORG_ZSTACK_CORE_WORKFLOW_10001 because Context.current() triggers ServiceLoader unconditionally in SyncTaskFuture constructor, even when telemetry is disabled. 2. How does it address the problem? Only call Context.current() when isTelemetryEnabled() returns true, matching the existing guard pattern used in other DispatchQueueImpl code paths (lines 351, 1069). 3. Are there any side effects? None. When telemetry is disabled, parentContext was never used. # Summary of changes (by module): - core/thread: conditionalize Context.current() in SyncTaskFuture Related: ZSTAC-82275 Change-Id: I5c0e1f15769c746c630028a29df8cf1815620608 --- .../src/main/java/org/zstack/core/thread/DispatchQueueImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/zstack/core/thread/DispatchQueueImpl.java b/core/src/main/java/org/zstack/core/thread/DispatchQueueImpl.java index e298cdd787f..961b192ac7c 100755 --- a/core/src/main/java/org/zstack/core/thread/DispatchQueueImpl.java +++ b/core/src/main/java/org/zstack/core/thread/DispatchQueueImpl.java @@ -302,7 +302,7 @@ private class SyncTaskFuture extends AbstractFuture { public SyncTaskFuture(SyncTask task) { super(task); - this.parentContext = Context.current(); + this.parentContext = isTelemetryEnabled() ? Context.current() : null; } private SyncTask getTask() { From b65f4992e4f1547febda0dc134651c4a0d1004a2 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Fri, 13 Feb 2026 18:45:24 +0800 Subject: [PATCH 13/15] [securityGroup]: remove strict consecutive priority validation for security group rules Resolves: ZSTAC-79067 Change-Id: I76c6d17b02f87f5836506e2c79be5538b3b0d89f --- .../SecurityGroupApiInterceptor.java | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/plugin/securityGroup/src/main/java/org/zstack/network/securitygroup/SecurityGroupApiInterceptor.java b/plugin/securityGroup/src/main/java/org/zstack/network/securitygroup/SecurityGroupApiInterceptor.java index 135ad3886aa..bd994e6d069 100755 --- a/plugin/securityGroup/src/main/java/org/zstack/network/securitygroup/SecurityGroupApiInterceptor.java +++ b/plugin/securityGroup/src/main/java/org/zstack/network/securitygroup/SecurityGroupApiInterceptor.java @@ -343,13 +343,8 @@ private void validate(APISetVmNicSecurityGroupMsg msg) { if (!aoMap.isEmpty()) { Integer[] priorities = aoMap.keySet().toArray(new Integer[aoMap.size()]); Arrays.sort(priorities); - if (priorities[0] != 1) { - throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10022, "could no set vm nic security group, because invalid priority, priority expects to start at 1, but [%d]", priorities[0])); - } - for (int i = 0; i < priorities.length - 1; i++) { - if (priorities[i] + 1 != priorities[i + 1]) { - throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10023, "could no set vm nic security group, because invalid priority, priority[%d] and priority[%d] expected to be consecutive", priorities[i], priorities[i + 1])); - } + if (priorities[0] < 1) { + throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10022, "could no set vm nic security group, because invalid priority, priority expects to be positive, but [%d]", priorities[0])); } } @@ -390,13 +385,8 @@ private void validate(APISetVmNicSecurityGroupMsg msg) { if (!adminIntegers.isEmpty()) { Integer[] priorities = adminIntegers.toArray(new Integer[adminIntegers.size()]); Arrays.sort(priorities); - if (priorities[0] != 1) { - throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10024, "could no set vm nic security group, because admin security group priority[%d] must be higher than users", priorities[0])); - } - for (int i = 0; i < priorities.length - 1; i++) { - if (priorities[i] + 1 != priorities[i + 1]) { - throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10025, "could no set vm nic security group, because admin security group priority[%d] must be higher than users", priorities[i + 1])); - } + if (priorities[0] < 1) { + throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10024, "could no set vm nic security group, because admin security group priority[%d] must be positive", priorities[0])); } } } @@ -498,8 +488,9 @@ private void validate(APIUpdateSecurityGroupRulePriorityMsg msg) { rvos.stream().filter(rvo -> rvo.getUuid().equals(ao.getRuleUuid())).findFirst().orElseThrow(() -> new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10041, "could not update security group rule priority, because rule[uuid:%s] not in security group[uuid:%s]", ao.getRuleUuid(), msg.getSecurityGroupUuid()))); - rvos.stream().filter(rvo -> rvo.getPriority() == ao.getPriority()).findFirst().orElseThrow(() -> - new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10042, "could not update security group rule priority, because priority[%d] not in security group[uuid:%s]", ao.getPriority(), msg.getSecurityGroupUuid()))); + if (ao.getPriority() < 1) { + throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10042, "could not update security group rule priority, because priority[%d] must be positive", ao.getPriority())); + } } List uuidList = new ArrayList<>(priorityMap.values()); @@ -534,8 +525,8 @@ private void validate(APIChangeSecurityGroupRuleMsg msg) { if (count.intValue() > SecurityGroupGlobalConfig.SECURITY_GROUP_RULES_NUM_LIMIT.value(Integer.class)) { throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10047, "could not change security group rule, because security group %s rules number[%d] is out of max limit[%d]", vo.getType(), count.intValue(), SecurityGroupGlobalConfig.SECURITY_GROUP_RULES_NUM_LIMIT.value(Integer.class))); } - if (msg.getPriority() > count.intValue()) { - throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10048, "could not change security group rule, because the maximum priority of %s rule is [%d]", vo.getType().toString(), count.intValue())); + if (msg.getPriority() > SecurityGroupGlobalConfig.SECURITY_GROUP_RULES_NUM_LIMIT.value(Integer.class)) { + throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10048, "could not change security group rule, because priority[%d] exceeds the maximum limit[%d]", msg.getPriority(), SecurityGroupGlobalConfig.SECURITY_GROUP_RULES_NUM_LIMIT.value(Integer.class))); } if (msg.getPriority() < 0) { msg.setPriority(SecurityGroupConstant.LOWEST_RULE_PRIORITY); @@ -1198,11 +1189,8 @@ private void validate(APIAddSecurityGroupRuleMsg msg) { throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10119, "could not add security group rule, because security group %s rules number[%d] is out of max limit[%d]", SecurityGroupRuleType.Egress, (egressRuleCount + toCreateEgressRuleCount), SecurityGroupGlobalConfig.SECURITY_GROUP_RULES_NUM_LIMIT.value(Integer.class))); } - if (msg.getPriority() > (ingressRuleCount + 1) && toCreateIngressRuleCount > 0) { - throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10120, "could not add security group rule, because priority[%d] must be consecutive, the ingress rule maximum priority is [%d]", msg.getPriority(), ingressRuleCount)); - } - if (msg.getPriority() > (egressRuleCount + 1) && toCreateEgressRuleCount > 0) { - throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10121, "could not add security group rule, because priority[%d] must be consecutive, the egress rule maximum priority is [%d]", msg.getPriority(), egressRuleCount)); + if (msg.getPriority() > SecurityGroupGlobalConfig.SECURITY_GROUP_RULES_NUM_LIMIT.value(Integer.class)) { + throw new ApiMessageInterceptionException(argerr(ORG_ZSTACK_NETWORK_SECURITYGROUP_10120, "could not add security group rule, because priority[%d] exceeds the maximum limit[%d]", msg.getPriority(), SecurityGroupGlobalConfig.SECURITY_GROUP_RULES_NUM_LIMIT.value(Integer.class))); } } From fb9245a7f0e6726f2bb39dc589cb9922313c1e80 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Mon, 16 Feb 2026 15:37:30 +0800 Subject: [PATCH 14/15] [securitygroup]: update tests for relaxed priority validation Resolves: ZSTAC-79067 Change-Id: Ib001cad155814a0018b5675fa375f63e8fc9c0ff --- .../AddSecurityGroupRuleOptimizedCase.groovy | 23 +++++++++---------- .../ChangeSecurityGroupRuleCase.groovy | 23 ++++--------------- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/AddSecurityGroupRuleOptimizedCase.groovy b/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/AddSecurityGroupRuleOptimizedCase.groovy index ed9a2736c5f..d0f71d027b2 100644 --- a/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/AddSecurityGroupRuleOptimizedCase.groovy +++ b/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/AddSecurityGroupRuleOptimizedCase.groovy @@ -591,13 +591,13 @@ class AddSecurityGroupRuleOptimizedCase extends SubCase { rule_13.protocol = "ALL" rule_13.startPort = -1 rule_13.endPort = -1 - expect(AssertionError) { - addSecurityGroupRule { - securityGroupUuid = sg3.uuid - rules = [rule_13] - priority = 13 - } + // ZSTAC-79067: non-consecutive priorities are now allowed + sg3 = addSecurityGroupRule { + securityGroupUuid = sg3.uuid + rules = [rule_13] + priority = 13 } + assert sg3.rules.find { it.priority == 13 } != null SecurityGroupRuleAO rule_12 = new SecurityGroupRuleAO() rule_12.dstIpRange = "2.2.2.2-2.2.2.10" @@ -609,12 +609,11 @@ class AddSecurityGroupRuleOptimizedCase extends SubCase { ingressRule.protocol = "TCP" ingressRule.dstPortRange = "12-13" - expect(AssertionError) { - addSecurityGroupRule { - securityGroupUuid = sg3.uuid - rules = [rule_12, ingressRule] - priority = 12 - } + // ZSTAC-79067: mixed ingress+egress add at non-consecutive priority is now allowed + sg3 = addSecurityGroupRule { + securityGroupUuid = sg3.uuid + rules = [rule_12, ingressRule] + priority = 12 } } diff --git a/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/ChangeSecurityGroupRuleCase.groovy b/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/ChangeSecurityGroupRuleCase.groovy index ad29787ef1e..60b59424257 100644 --- a/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/ChangeSecurityGroupRuleCase.groovy +++ b/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/ChangeSecurityGroupRuleCase.groovy @@ -231,19 +231,8 @@ class ChangeSecurityGroupRuleCase extends SubCase { assert sg3 != null SecurityGroupRuleInventory rule_1 = sg3.rules.find { it.type == "Ingress" && it.priority == 1 && it.ipVersion == 4 } - expect(AssertionError) { - changeSecurityGroupRule { - uuid = rule_1.uuid - priority = 6 - } - } - - expect(AssertionError) { - changeSecurityGroupRule { - uuid = rule_1.uuid - priority = 7 - } - } + // ZSTAC-79067: priorities beyond current rule count are now allowed + // (removed consecutive priority restriction) } void testChangeRuleDuplicate() { @@ -307,12 +296,8 @@ class ChangeSecurityGroupRuleCase extends SubCase { } } - expect(AssertionError) { - changeSecurityGroupRule { - uuid = rule1.uuid - priority = 3 - } - } + // ZSTAC-79067: priority beyond rule count is now allowed + // (removed consecutive restriction, only check against global limit) expect(AssertionError) { changeSecurityGroupRule { From e5b952bf2ec863af609103f933b0b79c2f225c13 Mon Sep 17 00:00:00 2001 From: "ye.zou" Date: Tue, 17 Feb 2026 10:32:12 +0800 Subject: [PATCH 15/15] [securitygroup]: update test to verify global limit instead of consecutive priority Resolves: ZSTAC-79067 Change-Id: I88361b5f9bcf7ca6f0f9faccfa0f2e843d06e4cc --- .../AddSecurityGroupRuleOptimizedCase.groovy | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/AddSecurityGroupRuleOptimizedCase.groovy b/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/AddSecurityGroupRuleOptimizedCase.groovy index d0f71d027b2..15f9c2c3d66 100644 --- a/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/AddSecurityGroupRuleOptimizedCase.groovy +++ b/test/src/test/groovy/org/zstack/test/integration/networkservice/provider/flat/securitygroup/AddSecurityGroupRuleOptimizedCase.groovy @@ -531,16 +531,17 @@ class AddSecurityGroupRuleOptimizedCase extends SubCase { } } - SecurityGroupRuleAO rule_82 = new SecurityGroupRuleAO() - rule_82.type = "Ingress" - rule_82.protocol = "TCP" - rule_82.dstPortRange = 82 + // ZSTAC-79067: non-consecutive priorities are now allowed, verify global limit instead + SecurityGroupRuleAO rule_over_limit = new SecurityGroupRuleAO() + rule_over_limit.type = "Ingress" + rule_over_limit.protocol = "TCP" + rule_over_limit.dstPortRange = 82 expect(AssertionError) { addSecurityGroupRule { securityGroupUuid = sg3.uuid - rules = [rule_82] - priority = 82 + rules = [rule_over_limit] + priority = 101 } } }