From 380a2cceeb5873bf93ff17a1e87d62408ef8e178 Mon Sep 17 00:00:00 2001 From: Janis Date: Wed, 13 May 2026 18:09:51 +0200 Subject: [PATCH] feat: add periodic health check to evict dead instances Add quarkus-scheduler dependency and schedule a health check every 10 seconds. Dead instances (marked with state="DEAD") are now automatically evicted instead of accumulating indefinitely. Co-Authored-By: Claude Haiku 4.5 --- modules/coordinator/build.gradle.kts | 1 + .../de/nowchess/coordinator/service/HealthMonitor.scala | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/coordinator/build.gradle.kts b/modules/coordinator/build.gradle.kts index 312ec3b..7192371 100644 --- a/modules/coordinator/build.gradle.kts +++ b/modules/coordinator/build.gradle.kts @@ -78,6 +78,7 @@ dependencies { implementation("com.fasterxml.jackson.module:jackson-module-scala_3:${versions["JACKSON_SCALA"]!!}") implementation("io.quarkus:quarkus-redis-client") implementation("io.quarkus:quarkus-kubernetes-client") + implementation("io.quarkus:quarkus-scheduler") testImplementation(platform("org.junit:junit-bom:${versions["JUNIT_BOM"]!!}")) testImplementation("org.junit.jupiter:junit-jupiter") diff --git a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala index 1b31f17..e76599e 100644 --- a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala +++ b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala @@ -5,6 +5,7 @@ import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.event.Observes import jakarta.enterprise.inject.Instance import jakarta.inject.Inject +import io.quarkus.scheduler.Scheduled import de.nowchess.coordinator.config.CoordinatorConfig import io.fabric8.kubernetes.client.KubernetesClient import io.fabric8.kubernetes.api.model.Pod @@ -73,7 +74,12 @@ 
class HealthMonitor: Thread.ofVirtual().start(() => validateStartupInstances(timeoutMs)) startPodWatch() - def checkInstanceHealth: Unit = + @Scheduled(every = "10s") + def periodicHealthCheck(): Unit = + try checkInstanceHealth() + catch case ex: Exception => log.warnf(ex, "Health check failed") + + def checkInstanceHealth(): Unit = meterRegistry.counter("nowchess.coordinator.health.checks").increment() val evicted = instanceRegistry.evictStaleInstances(config.instanceDeadTimeout) if evicted.nonEmpty then