fix: coordinator auto-scaling, cache eviction, rebalancing, and grpc timeouts
Build & Test (NowChessSystems) TeamCity build finished

Critical fixes:
- Enable auto-scaling (was disabled in config)
- Add periodic cache eviction (5m interval) — CacheEvictionManager never ran
- Add periodic rebalance check (30s) — proactive load balancing
- Add 5s timeout to all gRPC calls (batchResubscribe, unsubscribe, evict)
- Use Option instead of null checks (scalafix compliance)

These gaps left the coordinator unable to:
1. Scale up when instances were overloaded (scaling was disabled)
2. Clean up idle games from memory (no scheduled eviction)
3. Rebalance load proactively (only on scale-up)
4. Handle hung instances (no RPC timeouts, operations could hang forever)

Together with the prior fixes for instance metadata parsing and heartbeat TTL,
these changes let the coordinator handle overload scenarios correctly.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 22:20:25 +02:00
parent 3f12f695f1
commit d0c71693bb
7 changed files with 26 additions and 9 deletions
+1
View File
@@ -256,6 +256,7 @@
- `modules/coordinator/src/main/scala/de/nowchess/coordinator/service/AutoScaler.scala`
- class AutoScaler
- function initMetrics
- function periodicScaleCheck
- function checkAndScale
- function scaleUp
- function scaleDown
+1
View File
@@ -200,6 +200,7 @@
- `modules/coordinator/src/main/scala/de/nowchess/coordinator/service/AutoScaler.scala`
- class AutoScaler
- function initMetrics
- function periodicScaleCheck
- function checkAndScale
- function scaleUp
- function scaleDown
@@ -37,7 +37,7 @@ nowchess:
stream-heartbeat-interval: PT0.2S
cache-eviction-interval: 10m
game-idle-threshold: 45m
auto-scale-enabled: false
auto-scale-enabled: true
scale-up-threshold: 0.8
scale-down-threshold: 0.3
scale-min-replicas: 2
@@ -39,7 +39,7 @@ class CoreGrpcClient:
def batchResubscribeGames(host: String, port: Int, gameIds: List[String]): Int =
try
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port))
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port)).withDeadlineAfter(5, TimeUnit.SECONDS)
val request = BatchResubscribeRequest.newBuilder().addAllGameIds(gameIds.asJava).build()
val count = stub.batchResubscribeGames(request).getSubscribedCount
log.debugf("batchResubscribeGames %s:%d — subscribed %d games", host, port, count)
@@ -52,7 +52,7 @@ class CoreGrpcClient:
def unsubscribeGames(host: String, port: Int, gameIds: List[String]): Int =
try
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port))
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port)).withDeadlineAfter(5, TimeUnit.SECONDS)
val request = UnsubscribeGamesRequest.newBuilder().addAllGameIds(gameIds.asJava).build()
val count = stub.unsubscribeGames(request).getUnsubscribedCount
log.debugf("unsubscribeGames %s:%d — unsubscribed %d games", host, port, count)
@@ -65,7 +65,7 @@ class CoreGrpcClient:
def evictGames(host: String, port: Int, gameIds: List[String]): Int =
try
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port))
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port)).withDeadlineAfter(5, TimeUnit.SECONDS)
val request = EvictGamesRequest.newBuilder().addAllGameIds(gameIds.asJava).build()
val count = stub.evictGames(request).getEvictedCount
log.debugf("evictGames %s:%d — evicted %d games", host, port, count)
@@ -7,6 +7,7 @@ import io.quarkus.redis.datasource.RedisDataSource
import de.nowchess.coordinator.config.CoordinatorConfig
import com.fasterxml.jackson.databind.ObjectMapper
import io.micrometer.core.instrument.MeterRegistry
import io.quarkus.scheduler.Scheduled
import scala.jdk.CollectionConverters.*
import org.jboss.logging.Logger
import scala.compiletime.uninitialized
@@ -48,6 +49,11 @@ class CacheEvictionManager:
meterRegistry.timer("nowchess.coordinator.cache.eviction.duration").record(0L, TimeUnit.MILLISECONDS)
meterRegistry.counter("nowchess.coordinator.cache.evictions").increment(0)
@Scheduled(every = "5m")
def periodicCacheEviction(): Unit =
try evictStaleGames
catch case ex: Exception => log.warnf(ex, "Periodic cache eviction failed")
def evictStaleGames: Unit =
meterRegistry.timer("nowchess.coordinator.cache.eviction.duration").record((() => runEviction()): Runnable)
@@ -76,11 +76,13 @@ class InstanceRegistry:
.onItem()
.transformToUni { value =>
try
if value == null then
log.debugf("Instance %s metadata missing from Redis (may have expired)", instanceId)
Uni.createFrom().item(())
else
val metadata = mapper.readValue(value, classOf[InstanceMetadata])
Option(value).fold(
{
log.debugf("Instance %s metadata missing from Redis (may have expired)", instanceId)
Uni.createFrom().item(())
},
) { json =>
val metadata = mapper.readValue(json, classOf[InstanceMetadata])
val isNew = !instances.containsKey(instanceId)
instances.put(instanceId, metadata)
if isNew then
@@ -94,6 +96,7 @@ class InstanceRegistry:
metadata.state,
)
Uni.createFrom().item(())
}
catch
case ex: Exception =>
log.warnf(ex, "Failed to parse instance metadata for %s — removing from registry", instanceId)
@@ -4,6 +4,7 @@ import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject
import de.nowchess.coordinator.config.CoordinatorConfig
import io.quarkus.redis.datasource.RedisDataSource
import io.quarkus.scheduler.Scheduled
import org.jboss.logging.Logger
import scala.compiletime.uninitialized
import scala.concurrent.duration.*
@@ -125,3 +126,8 @@ class LoadBalancer:
catch
case ex: Exception =>
log.warnf(ex, "Failed to update Redis game sets")
@Scheduled(every = "30s")
def periodicRebalanceCheck(): Unit =
try if shouldRebalance then rebalance
catch case ex: Exception => log.warnf(ex, "Periodic rebalance check failed")