fix: refresh Redis TTL on instance heartbeat to prevent false DEAD marking

Instances were being incorrectly marked DEAD because their Redis key TTL was
not being refreshed on heartbeat. HealthMonitor.checkRedisHeartbeat() checks
pttl > 0, which fails when the TTL expires even if the instance is alive and
sending regular heartbeats.

Now pexpire(key, heartbeatTtl) is called on each heartbeat to keep the key
alive. This prevents scaling messages from undercounting healthy instances.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-05-17 14:50:34 +02:00
parent b58bbbc782
commit 2d76c001fe
@@ -9,6 +9,7 @@ import scala.jdk.CollectionConverters.*
import scala.compiletime.uninitialized
import com.fasterxml.jackson.databind.ObjectMapper
import de.nowchess.coordinator.dto.InstanceMetadata
import de.nowchess.coordinator.config.CoordinatorConfig
import java.util.concurrent.ConcurrentHashMap
import java.time.{Duration, Instant}
import io.micrometer.core.instrument.{Gauge, MeterRegistry}
@@ -27,6 +28,9 @@ class InstanceRegistry:
@Inject
private var meterRegistry: MeterRegistry = uninitialized
@Inject
private var config: CoordinatorConfig = uninitialized
// scalafix:on DisableSyntax.var
private val log = Logger.getLogger(classOf[InstanceRegistry])
@@ -95,7 +99,13 @@ class InstanceRegistry:
metadata.subscriptionCount,
metadata.state,
)
Uni.createFrom().item(())
val ttlMs = config.heartbeatTtl.toMillis
redis
.key(classOf[String])
.pexpire(key, ttlMs)
.map(_ => ())
.onFailure()
.recoverWithItem(())
}
catch
case ex: Exception =>