fix: remove corrupted instances immediately and evict dead instances

Problem: Dead instances pile up indefinitely, and failed metadata parsing leaves stale data in the registry. Heartbeat-age eviction is the only existing cleanup; nothing removes corrupted entries or evicts by state.

Changes:
1. Remove instance from registry on parse failure (corrupted metadata = unrecoverable)
2. Evict instances with state="DEAD" on the next health check (previously, eviction was based only on heartbeat age)

This prevents:
- Memory leak from accumulating dead/corrupted instances
- Stale data persisting after parse failures
- Dead instances blocking resources indefinitely

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 17:16:25 +02:00
parent 67802ee1b8
commit 43184d296d
@@ -92,7 +92,9 @@ class InstanceRegistry:
Uni.createFrom().item(())
catch
case ex: Exception =>
log.warnf(ex, "Failed to parse instance metadata for %s", instanceId)
log.warnf(ex, "Failed to parse instance metadata for %s — removing from registry", instanceId)
instances.remove(instanceId)
meterRegistry.counter("nowchess.coordinator.instances.removed").increment()
Uni.createFrom().item(())
}
.onFailure()
@@ -112,15 +114,21 @@ class InstanceRegistry:
val stale = instances.asScala
.collect { case (id, inst) =>
try
if Instant.parse(inst.lastHeartbeat).isBefore(cutoff) then Some(id)
else None
val isHeartbeatStale = Instant.parse(inst.lastHeartbeat).isBefore(cutoff)
val isDead = inst.state == "DEAD"
if isHeartbeatStale || isDead then Some(id) else None
catch case _: Exception => None
}
.flatten
.toList
stale.foreach { id =>
instances.remove(id)
val inst = Option(instances.remove(id))
meterRegistry.counter("nowchess.coordinator.instances.evicted").increment()
log.warnf("Evicted stale instance %s (heartbeat older than %s)", id, maxAge)
inst.foreach { i =>
if i.state == "DEAD" then
log.warnf("Evicted dead instance %s", id)
else
log.warnf("Evicted stale instance %s (heartbeat older than %s)", id, maxAge)
}
}
stale