diff --git a/compute/src/main/java/org/zstack/compute/vm/VmInstanceBase.java b/compute/src/main/java/org/zstack/compute/vm/VmInstanceBase.java
index d8c8be6532..80ba38486f 100755
--- a/compute/src/main/java/org/zstack/compute/vm/VmInstanceBase.java
+++ b/compute/src/main/java/org/zstack/compute/vm/VmInstanceBase.java
@@ -7226,6 +7226,7 @@ protected void scripts() {
                         self.setZoneUuid(spec.getDestHost().getZoneUuid());
                     }
                 }.execute());
+                syncVmDevicesAddressInfo(self.getUuid());
                 logger.debug(String.format("vm[uuid:%s] is running ..", self.getUuid()));
                 VmInstanceInventory inv = VmInstanceInventory.valueOf(self);
                 extEmitter.afterStartVm(inv);
@@ -7495,6 +7496,9 @@ public void run(FlowTrigger trigger, Map data) {
                     self.setHypervisorType(spec.getDestHost().getHypervisorType());
                     self.setRootVolumeUuid(spec.getDestRootVolume().getUuid());
                 });
+                if (struct.getStrategy() == VmCreationStrategy.InstantStart) {
+                    syncVmDevicesAddressInfo(self.getUuid());
+                }
                 logger.debug(String.format("vm[uuid:%s] is started ..", self.getUuid()));
                 VmInstanceInventory inv = VmInstanceInventory.valueOf(self);
                 extEmitter.afterStartNewCreatedVm(inv);
@@ -7931,6 +7935,7 @@ public void handle(Map data) {
                     public void done() {
                         self = changeVmStateInDb(VmInstanceStateEvent.running,
                                 () -> self.setHostUuid(originalCopy.getHostUuid()));
+                        syncVmDevicesAddressInfo(self.getUuid());
                         VmInstanceInventory inv = VmInstanceInventory.valueOf(self);
                         extEmitter.afterRebootVm(inv);
                         new StaticIpOperator().deleteIpChange(self.getUuid());
@@ -8347,6 +8352,7 @@ protected void resumeVm(final Message msg, Completion completion) {
             @Override
             public void handle(Map Data) {
                 self = changeVmStateInDb(VmInstanceStateEvent.running);
+                syncVmDevicesAddressInfo(self.getUuid());
                 completion.success();
             }
         }).error(new FlowErrorHandler(completion) {
@@ -8467,6 +8473,38 @@ public String getName() {
         });
     }
 
+    /**
+     * Sends a SyncVmDeviceInfoMsg for the given VM to the host recorded on {@code self}.
+     * Best effort: the reply is only logged, and nothing is sent when no host is recorded.
+     *
+     * @param vmUuid uuid of the VM placed in the message
+     */
+    private void syncVmDevicesAddressInfo(String vmUuid) {
+        // read the host uuid once so the null check, the message target and the log all agree
+        final String hostUuid = self.getHostUuid();
+        if (hostUuid == null) {
+            return;
+        }
+
+        SyncVmDeviceInfoMsg msg = new SyncVmDeviceInfoMsg();
+        msg.setVmInstanceUuid(vmUuid);
+        msg.setHostUuid(hostUuid);
+        bus.makeTargetServiceIdByResourceUuid(msg, HostConstant.SERVICE_ID, hostUuid);
+        bus.send(msg, new CloudBusCallBack(msg) {
+            @Override
+            public void run(MessageReply reply) {
+                if (!reply.isSuccess()) {
+                    logger.warn(String.format("Failed to sync vm device info for vm[uuid:%s], %s",
+                            vmUuid, reply.getError()));
+                } else {
+                    // self may have been reassigned since the send; log the host that was actually targeted
+                    logger.debug(String.format("Sent SyncVmDeviceInfoMsg for vm[uuid:%s] on host[uuid:%s]",
+                            vmUuid, hostUuid));
+                }
+            }
+        });
+    }
+
     private void deleteVmCdRom(String cdRomUuid, Completion completion) {
         boolean exist = dbf.isExist(cdRomUuid, VmCdRomVO.class);
         if (!exist) {
diff --git a/conf/i18n/globalErrorCodeMapping/global-error-en_US.json b/conf/i18n/globalErrorCodeMapping/global-error-en_US.json
index 32eb4c8f05..715e823d95 100644
--- a/conf/i18n/globalErrorCodeMapping/global-error-en_US.json
+++ b/conf/i18n/globalErrorCodeMapping/global-error-en_US.json
@@ -3374,7 +3374,7 @@
     "ORG_ZSTACK_NETWORK_HUAWEI_IMASTER_10019": "delete token of SDN controller [IP:%s] failed because %s",
     "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10004": "Cannot execute volume mapping to host flow due to invalid volume ID.%s",
     "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10007": "port forwarding rule [uuid:%s] has not been attached to any virtual machine network interface, cannot detach",
-    "ORG_ZSTACK_MEVOCO_10088": "cannot take a snapshot for volumes[%s] when volume[uuid: %s] is not attached",
+    "ORG_ZSTACK_MEVOCO_10088": "cannot create snapshot for volume[uuid:%s] because it is not attached to any VM instance. Please attach the volume to a VM first. Affected volumes: %s",
     "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10005": "Cannot execute map LUN to host flow due to invalid LUN type: %s",
     "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10008": "port forwarding rule [uuid:%s] has been associated with vm nic [uuid:%s], cannot be reassigned again",
     "ORG_ZSTACK_MEVOCO_10087": "A Running VM[uuid:%s] has no associated Host UUID.",
diff --git a/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json b/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json
index 84609838dd..01960e8eb4 100644
--- a/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json
+++ b/conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json
@@ -3374,7 +3374,7 @@
     "ORG_ZSTACK_NETWORK_HUAWEI_IMASTER_10019": "删除 SDN 控制器 [IP:%s] 的令牌失败,因为 %s",
     "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10004": "无法执行映射LUN到主机流程,无效的LUN ID",
     "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10007": "端口转发规则 rule[uuid:%s] 没有绑定到任何 VM 的网卡上,无法解除绑定",
-    "ORG_ZSTACK_MEVOCO_10088": "无法为挂载状态以外的卷[%s]创建快照",
+    "ORG_ZSTACK_MEVOCO_10088": "无法为云盘[uuid:%s]创建快照,因为该云盘未挂载到任何云主机。请先将云盘挂载到云主机后再创建快照。相关云盘: %s",
     "ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10005": "无法执行映射LUN到主机流程,无效的LUN类型",
     "ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10008": "端口转发规则[uuid:%s]已绑定到VM网卡[uuid:%s],无法再次绑定",
     "ORG_ZSTACK_MEVOCO_10087": "如何一个运行中的VM[uuid:%s]没有宿主机uuid?",
diff --git a/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java b/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java
index 7007c592ae..99ee2173b9 100755
--- a/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java
+++ b/header/src/main/java/org/zstack/header/vm/VmInstanceSpec.java
@@ -847,7 +847,10 @@ public void setBootMode(String bootMode) {
 
     public long getRootDiskAllocateSize() {
         if (rootDiskOffering == null) {
-            return this.getImageSpec().getInventory().getSize();
+            long virtualSize = this.getImageSpec().getInventory().getSize();
+            // NOTE(review): kept boxed so an unset actual size cannot NPE on unboxing; confirm getActualSize() is nullable
+            Long actualSize = this.getImageSpec().getInventory().getActualSize();
+            return actualSize == null ? virtualSize : Math.max(virtualSize, actualSize);
         }
         return rootDiskOffering.getDiskSize();
     }
diff --git a/header/src/main/java/org/zstack/header/vm/VmInstanceState.java b/header/src/main/java/org/zstack/header/vm/VmInstanceState.java
index 8a755b52fd..49303e2325 100755
--- a/header/src/main/java/org/zstack/header/vm/VmInstanceState.java
+++ b/header/src/main/java/org/zstack/header/vm/VmInstanceState.java
@@ -168,6 +168,7 @@ public enum VmInstanceState {
                 new Transaction(VmInstanceStateEvent.destroyed, VmInstanceState.Destroyed),
                 new Transaction(VmInstanceStateEvent.destroying, VmInstanceState.Destroying),
                 new Transaction(VmInstanceStateEvent.running, VmInstanceState.Running),
+                new Transaction(VmInstanceStateEvent.stopped, VmInstanceState.Stopped),
                 new Transaction(VmInstanceStateEvent.expunging, VmInstanceState.Expunging)
         );
         Destroyed.transactions(
diff --git a/plugin/kvm/src/main/java/org/zstack/kvm/KvmVmSyncPingTask.java b/plugin/kvm/src/main/java/org/zstack/kvm/KvmVmSyncPingTask.java
index b02525b99d..d1bb913390 100755
--- a/plugin/kvm/src/main/java/org/zstack/kvm/KvmVmSyncPingTask.java
+++ b/plugin/kvm/src/main/java/org/zstack/kvm/KvmVmSyncPingTask.java
@@ -69,6 +69,12 @@ public class KvmVmSyncPingTask extends VmTracer implements KVMPingAgentNoFailure
     private List skipVmTracerReplies = new ArrayList<>();
     private Map vmInShutdownMap = new ConcurrentHashMap<>();
 
+    // Orphaned skip entries from departed MN nodes. Key=vmUuid, Value=timestamp when orphaned.
+    // These VMs remain in skip-trace state for ORPHAN_TTL_MS to avoid false HA triggers
+    // when a MN restarts and its in-flight VM operations haven't completed yet. See ZSTAC-80821.
+    private final ConcurrentHashMap<String, Long> orphanedSkipVms = new ConcurrentHashMap<>();
+    private static final long ORPHAN_TTL_MS = 10 * 60 * 1000; // 10 minutes
+
     {
         getReflections().getTypesAnnotatedWith(SkipVmTracer.class).forEach(clz -> {
             skipVmTracerMessages.add(clz.asSubclass(Message.class));
@@ -196,8 +202,13 @@ private void syncVm(final HostInventory host, final Completion completion) {
         // Get vms to skip before send command to host to confirm the vm will be skipped after sync command finished.
         // The problem is if one vm-sync skipped operation is started and finished during vm sync command's handling
         // vm state would still be sync to mn
+        // ZSTAC-80821: clean up expired orphaned entries each sync cycle
+        cleanupExpiredOrphanedSkipVms();
+
         Set vmsToSkipSetHostSide = new HashSet<>();
         vmsToSkip.values().forEach(vmsToSkipSetHostSide::addAll);
+        // ZSTAC-80821: also skip VMs from departed MN nodes that are still within TTL
+        vmsToSkipSetHostSide.addAll(orphanedSkipVms.keySet());
 
         // if the vm is not running on host when sync command executing but started as soon as possible
         // before response handling of vm sync, mgmtSideStates will including the running vm but not result in
@@ -228,6 +239,8 @@ public void run(MessageReply reply) {
 
                 // Get vms to skip after sync result returned.
                 vmsToSkip.values().forEach(vmsToSkipSetHostSide::addAll);
+                // ZSTAC-80821: include orphaned entries from departed MN nodes
+                vmsToSkipSetHostSide.addAll(orphanedSkipVms.keySet());
 
                 Collection vmUuidsInDeleteVmGC = DeleteVmGC.queryVmInGC(host.getUuid(), ret.getStates().keySet());
 
@@ -446,7 +459,19 @@ public void nodeJoin(ManagementNodeInventory inv) {
     @Override
     public void nodeLeft(ManagementNodeInventory inv) {
         vmApis.remove(inv.getUuid());
-        vmsToSkip.remove(inv.getUuid());
+
+        // ZSTAC-80821: Instead of immediately removing skip list entries, move them
+        // to the orphaned set with a TTL. This prevents false HA triggers for VMs that
+        // are still being started by kvmagent but whose controlling MN has restarted.
+        Set<String> skippedVms = vmsToSkip.remove(inv.getUuid());
+        if (skippedVms != null && !skippedVms.isEmpty()) {
+            long now = System.currentTimeMillis();
+            for (String vmUuid : skippedVms) {
+                orphanedSkipVms.put(vmUuid, now);
+                logger.info(String.format("moved VM[uuid:%s] from departed MN[uuid:%s] skip list to orphaned set" +
+                        " (will expire in %d minutes)", vmUuid, inv.getUuid(), ORPHAN_TTL_MS / 60000));
+            }
+        }
     }
 
     @Override
@@ -460,6 +485,39 @@ public void iJoin(ManagementNodeInventory inv) {
     }
 
     public boolean isVmDoNotNeedToTrace(String vmUuid) {
-        return vmsToSkip.values().stream().anyMatch(vmsToSkipSet -> vmsToSkipSet.contains(vmUuid));
+        if (vmsToSkip.values().stream().anyMatch(vmsToSkipSet -> vmsToSkipSet.contains(vmUuid))) {
+            return true;
+        }
+
+        // ZSTAC-80821: Also check orphaned skip entries from departed MN nodes
+        Long orphanedAt = orphanedSkipVms.get(vmUuid);
+        if (orphanedAt != null) {
+            if (System.currentTimeMillis() - orphanedAt < ORPHAN_TTL_MS) {
+                logger.debug(String.format("VM[uuid:%s] is in orphaned skip set, skipping trace", vmUuid));
+                return true;
+            } else {
+                // Expired, clean up
+                orphanedSkipVms.remove(vmUuid, orphanedAt);
+                logger.info(String.format("orphaned skip entry for VM[uuid:%s] expired after %d minutes, resuming trace",
+                        vmUuid, ORPHAN_TTL_MS / 60000));
+            }
+        }
+
+        return false;
+    }
+
+    // Periodically clean up expired orphaned entries. Called from VM sync cycle.
+    private void cleanupExpiredOrphanedSkipVms() {
+        if (orphanedSkipVms.isEmpty()) {
+            return;
+        }
+
+        long now = System.currentTimeMillis();
+        for (Map.Entry<String, Long> entry : orphanedSkipVms.entrySet()) {
+            if (now - entry.getValue() >= ORPHAN_TTL_MS) {
+                orphanedSkipVms.remove(entry.getKey(), entry.getValue());
+                logger.info(String.format("cleaned up expired orphaned skip entry for VM[uuid:%s]", entry.getKey()));
+            }
+        }
     }
 }
diff --git a/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java b/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java
index db06239acb..276ab367ba 100644
--- a/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java
+++ b/plugin/zbs/src/main/java/org/zstack/storage/zbs/ZbsStorageController.java
@@ -179,7 +179,10 @@ public List getActiveClients(String installPath, String prot
         if (VolumeProtocol.CBD.toString().equals(protocol)) {
             GetVolumeClientsCmd cmd = new GetVolumeClientsCmd();
             cmd.setPath(installPath);
-            GetVolumeClientsRsp rsp = syncHttpCall(GET_VOLUME_CLIENTS_PATH, cmd, GetVolumeClientsRsp.class);
+            GetVolumeClientsRsp rsp = new HttpCaller<>(GET_VOLUME_CLIENTS_PATH, cmd, GetVolumeClientsRsp.class,
+                    null, TimeUnit.SECONDS, 30, true)
+                    .setTryNext(true)
+                    .syncCall();
             List clients = new ArrayList<>();
 
             if (!rsp.isSuccess()) {
@@ -1411,6 +1414,11 @@ public class HttpCaller {
 
         private boolean tryNext = false;
 
+        HttpCaller setTryNext(boolean tryNext) {
+            this.tryNext = tryNext;
+            return this;
+        }
+
         public HttpCaller(String path, AgentCommand cmd, Class retClass, ReturnValueCompletion callback) {
             this(path, cmd, retClass, callback, null, 0, false);
         }