From 960a419792e1161fb7241e465b7349efe4a10137 Mon Sep 17 00:00:00 2001 From: Janis Date: Thu, 14 May 2026 04:47:20 +0200 Subject: [PATCH] fix: force-delete hanging pods and remove failed instances from registry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When pod deletion failed, instances remained in the registry with state=DEAD, preventing scale-down since the avgLoad calculation counted them. Now: - Use withGracePeriod(0) for immediate pod deletion instead of the 30s wait (prevents cascade when nodes are down or pods are stuck terminating) - Remove the instance from the registry on deletion failure anyway (prevents dead instances from blocking scale-down via avgLoad) This breaks the cycle: failed deletions → scaleUp → max replicas → more failures → more stuck instances blocking recovery. --- .../de/nowchess/coordinator/service/HealthMonitor.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala index e4452b2..d288b0e 100644 --- a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala +++ b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala @@ -185,14 +185,15 @@ class HealthMonitor: pods.find(pod => pod.getMetadata.getName.contains(instanceId)) match case Some(pod) => val podName = pod.getMetadata.getName - kube.pods().inNamespace(config.k8sNamespace).withName(podName).delete() + kube.pods().inNamespace(config.k8sNamespace).withName(podName).withGracePeriod(0L).delete() meterRegistry.counter("nowchess.coordinator.pods.deleted").increment() - log.infof("Deleted pod %s for dead instance %s", podName, instanceId) + log.infof("Force-deleted pod %s for dead instance %s", podName, instanceId) case None => log.debugf("No pod found for instance %s, skipping deletion", instanceId) catch case 
ex: Exception => - log.warnf(ex, "Failed to delete pod for instance %s", instanceId) + log.warnf(ex, "Failed to delete pod for instance %s — removing from registry to prevent blocking scale-down", instanceId) + instanceRegistry.removeInstance(instanceId) private def validateStartupInstances(timeoutMs: Long): Unit = Thread.sleep(timeoutMs)