From 43184d296da5a6a7b760ac90c2b739220d86bce3 Mon Sep 17 00:00:00 2001 From: Janis Date: Wed, 13 May 2026 17:16:25 +0200 Subject: [PATCH] fix: remove corrupted instances immediately and evict dead instances Problem: Dead instances pile up indefinitely. Failed metadata parsing leaves stale data in the registry. No cleanup mechanism exists other than eviction by heartbeat age. Changes: 1. Remove instance from registry on parse failure (corrupted metadata = unrecoverable) 2. Evict instances with state="DEAD" on next health check (previously evicted only by heartbeat age) This prevents: - Memory leak from accumulating dead/corrupted instances - Stale data persisting after parse failures - Dead instances blocking resources indefinitely Co-Authored-By: Claude Haiku 4.5 --- .../coordinator/service/InstanceRegistry.scala | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/InstanceRegistry.scala b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/InstanceRegistry.scala index da91967..541f0cd 100644 --- a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/InstanceRegistry.scala +++ b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/InstanceRegistry.scala @@ -92,7 +92,9 @@ class InstanceRegistry: Uni.createFrom().item(()) catch case ex: Exception => - log.warnf(ex, "Failed to parse instance metadata for %s", instanceId) + log.warnf(ex, "Failed to parse instance metadata for %s — removing from registry", instanceId) + instances.remove(instanceId) + meterRegistry.counter("nowchess.coordinator.instances.removed").increment() Uni.createFrom().item(()) } .onFailure() @@ -112,15 +114,21 @@ class InstanceRegistry: val stale = instances.asScala .collect { case (id, inst) => try - if Instant.parse(inst.lastHeartbeat).isBefore(cutoff) then Some(id) - else None + val isHeartbeatStale = Instant.parse(inst.lastHeartbeat).isBefore(cutoff) + val isDead = inst.state == "DEAD" + if 
isHeartbeatStale || isDead then Some(id) else None catch case _: Exception => None } .flatten .toList stale.foreach { id => - instances.remove(id) + val inst = Option(instances.remove(id)) meterRegistry.counter("nowchess.coordinator.instances.evicted").increment() - log.warnf("Evicted stale instance %s (heartbeat older than %s)", id, maxAge) + inst.foreach { i => + if i.state == "DEAD" then + log.warnf("Evicted dead instance %s", id) + else + log.warnf("Evicted stale instance %s (heartbeat older than %s)", id, maxAge) + } } stale