From 2d76c001fe22868190a546f1794cf0ade36bb9a9 Mon Sep 17 00:00:00 2001 From: Janis Date: Sun, 17 May 2026 14:50:34 +0200 Subject: [PATCH] fix: refresh Redis TTL on instance heartbeat to prevent false DEAD marking Instances were being incorrectly marked DEAD because their Redis key TTL was not being refreshed on heartbeat. HealthMonitor.checkRedisHeartbeat() checks pttl > 0, which fails when the TTL expires even if the instance is alive and sending regular heartbeats. Now pexpire(key, heartbeatTtl) is called on each heartbeat to keep the key alive. Prevents scaling messages from undercounting healthy instances. Co-Authored-By: Claude Haiku 4.5 --- .../coordinator/service/InstanceRegistry.scala | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/InstanceRegistry.scala b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/InstanceRegistry.scala index a690ada..4861d81 100644 --- a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/InstanceRegistry.scala +++ b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/InstanceRegistry.scala @@ -9,6 +9,7 @@ import scala.jdk.CollectionConverters.* import scala.compiletime.uninitialized import com.fasterxml.jackson.databind.ObjectMapper import de.nowchess.coordinator.dto.InstanceMetadata +import de.nowchess.coordinator.config.CoordinatorConfig import java.util.concurrent.ConcurrentHashMap import java.time.{Duration, Instant} import io.micrometer.core.instrument.{Gauge, MeterRegistry} @@ -27,6 +28,9 @@ class InstanceRegistry: @Inject private var meterRegistry: MeterRegistry = uninitialized + + @Inject + private var config: CoordinatorConfig = uninitialized // scalafix:on DisableSyntax.var private val log = Logger.getLogger(classOf[InstanceRegistry]) @@ -95,7 +99,13 @@ class InstanceRegistry: metadata.subscriptionCount, metadata.state, ) - Uni.createFrom().item(()) + val ttlMs = config.heartbeatTtl.toMillis + redis + .key(classOf[String]) + .pexpire(key, ttlMs) + .map(_ => ()) + .onFailure() + .recoverWithItem(()) } catch case ex: Exception =>