From 960a419792e1161fb7241e465b7349efe4a10137 Mon Sep 17 00:00:00 2001
From: Janis <janis.e.20@gmx.de>
Date: Thu, 14 May 2026 04:47:20 +0200
Subject: [PATCH] fix: force-delete hanging pods and remove failed instances
 from registry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When pod deletion failed, instances remained in the registry with
state=DEAD, preventing scale-down because the avgLoad calculation
still counted them. Now:

- Use withGracePeriod(0L) for immediate pod deletion instead of the
  30s wait (prevents a cascade when nodes are down or pods are stuck
  terminating)
- Remove the instance from the registry even when pod deletion fails
  (prevents dead instances from blocking scale-down via avgLoad)

This breaks the cycle: failed deletions → scaleUp → max replicas →
more failures → more stuck instances blocking recovery.
---
 .../de/nowchess/coordinator/service/HealthMonitor.scala    | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala
index e4452b2..d288b0e 100644
--- a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala
+++ b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala
@@ -185,14 +185,15 @@ class HealthMonitor:
           pods.find(pod => pod.getMetadata.getName.contains(instanceId)) match
             case Some(pod) =>
               val podName = pod.getMetadata.getName
-              kube.pods().inNamespace(config.k8sNamespace).withName(podName).delete()
+              kube.pods().inNamespace(config.k8sNamespace).withName(podName).withGracePeriod(0L).delete()
               meterRegistry.counter("nowchess.coordinator.pods.deleted").increment()
-              log.infof("Deleted pod %s for dead instance %s", podName, instanceId)
+              log.infof("Force-deleted pod %s for dead instance %s", podName, instanceId)
             case None =>
               log.debugf("No pod found for instance %s, skipping deletion", instanceId)
         catch
           case ex: Exception =>
-            log.warnf(ex, "Failed to delete pod for instance %s", instanceId)
+            log.warnf(ex, "Failed to delete pod for instance %s — removing from registry to prevent blocking scale-down", instanceId)
+            instanceRegistry.removeInstance(instanceId)
 
   private def validateStartupInstances(timeoutMs: Long): Unit =
     Thread.sleep(timeoutMs)