fix: coordinator auto-scaling, cache eviction, rebalancing, and grpc timeouts
Build & Test (NowChessSystems) TeamCity build finished

Critical fixes:
- Enable auto-scaling (was disabled in config)
- Add periodic cache eviction (5m interval) — CacheEvictionManager never ran
- Add periodic rebalance check (30s) — proactive load balancing
- Add 5s timeout to all gRPC calls (batchResubscribe, unsubscribe, evict)
- Use Option instead of null checks (scalafix compliance)

These gaps left the coordinator unable to:
1. Scale up when instances were overloaded (scaling was disabled)
2. Clean up idle games from memory (no scheduled eviction)
3. Rebalance load proactively (only on scale-up)
4. Handle hung instances (no RPC timeouts, operations could hang forever)

Together with the prior fixes for instance metadata parsing and heartbeat TTL,
these changes let the coordinator handle overload scenarios correctly.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 22:20:25 +02:00
parent 3f12f695f1
commit d0c71693bb
7 changed files with 26 additions and 9 deletions
+1
View File
@@ -256,6 +256,7 @@
- `modules/coordinator/src/main/scala/de/nowchess/coordinator/service/AutoScaler.scala`
- class AutoScaler
- function initMetrics
- function periodicScaleCheck
- function checkAndScale
- function scaleUp
- function scaleDown
+1
View File
@@ -200,6 +200,7 @@
- `modules/coordinator/src/main/scala/de/nowchess/coordinator/service/AutoScaler.scala`
- class AutoScaler
- function initMetrics
- function periodicScaleCheck
- function checkAndScale
- function scaleUp
- function scaleDown
@@ -37,7 +37,7 @@ nowchess:
stream-heartbeat-interval: PT0.2S
cache-eviction-interval: 10m
game-idle-threshold: 45m
auto-scale-enabled: false
auto-scale-enabled: true
scale-up-threshold: 0.8
scale-down-threshold: 0.3
scale-min-replicas: 2
@@ -39,7 +39,7 @@ class CoreGrpcClient:
def batchResubscribeGames(host: String, port: Int, gameIds: List[String]): Int =
try
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port))
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port)).withDeadlineAfter(5, TimeUnit.SECONDS)
val request = BatchResubscribeRequest.newBuilder().addAllGameIds(gameIds.asJava).build()
val count = stub.batchResubscribeGames(request).getSubscribedCount
log.debugf("batchResubscribeGames %s:%d — subscribed %d games", host, port, count)
@@ -52,7 +52,7 @@ class CoreGrpcClient:
def unsubscribeGames(host: String, port: Int, gameIds: List[String]): Int =
try
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port))
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port)).withDeadlineAfter(5, TimeUnit.SECONDS)
val request = UnsubscribeGamesRequest.newBuilder().addAllGameIds(gameIds.asJava).build()
val count = stub.unsubscribeGames(request).getUnsubscribedCount
log.debugf("unsubscribeGames %s:%d — unsubscribed %d games", host, port, count)
@@ -65,7 +65,7 @@ class CoreGrpcClient:
def evictGames(host: String, port: Int, gameIds: List[String]): Int =
try
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port))
val stub = CoordinatorServiceGrpc.newBlockingStub(getChannel(host, port)).withDeadlineAfter(5, TimeUnit.SECONDS)
val request = EvictGamesRequest.newBuilder().addAllGameIds(gameIds.asJava).build()
val count = stub.evictGames(request).getEvictedCount
log.debugf("evictGames %s:%d — evicted %d games", host, port, count)
@@ -7,6 +7,7 @@ import io.quarkus.redis.datasource.RedisDataSource
import de.nowchess.coordinator.config.CoordinatorConfig
import com.fasterxml.jackson.databind.ObjectMapper
import io.micrometer.core.instrument.MeterRegistry
import io.quarkus.scheduler.Scheduled
import scala.jdk.CollectionConverters.*
import org.jboss.logging.Logger
import scala.compiletime.uninitialized
@@ -48,6 +49,11 @@ class CacheEvictionManager:
meterRegistry.timer("nowchess.coordinator.cache.eviction.duration").record(0L, TimeUnit.MILLISECONDS)
meterRegistry.counter("nowchess.coordinator.cache.evictions").increment(0)
@Scheduled(every = "5m")
def periodicCacheEviction(): Unit =
try evictStaleGames
catch case ex: Exception => log.warnf(ex, "Periodic cache eviction failed")
def evictStaleGames: Unit =
meterRegistry.timer("nowchess.coordinator.cache.eviction.duration").record((() => runEviction()): Runnable)
@@ -76,11 +76,13 @@ class InstanceRegistry:
.onItem()
.transformToUni { value =>
try
if value == null then
log.debugf("Instance %s metadata missing from Redis (may have expired)", instanceId)
Uni.createFrom().item(())
else
val metadata = mapper.readValue(value, classOf[InstanceMetadata])
Option(value).fold(
{
log.debugf("Instance %s metadata missing from Redis (may have expired)", instanceId)
Uni.createFrom().item(())
},
) { json =>
val metadata = mapper.readValue(json, classOf[InstanceMetadata])
val isNew = !instances.containsKey(instanceId)
instances.put(instanceId, metadata)
if isNew then
@@ -94,6 +96,7 @@ class InstanceRegistry:
metadata.state,
)
Uni.createFrom().item(())
}
catch
case ex: Exception =>
log.warnf(ex, "Failed to parse instance metadata for %s — removing from registry", instanceId)
@@ -4,6 +4,7 @@ import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject
import de.nowchess.coordinator.config.CoordinatorConfig
import io.quarkus.redis.datasource.RedisDataSource
import io.quarkus.scheduler.Scheduled
import org.jboss.logging.Logger
import scala.compiletime.uninitialized
import scala.concurrent.duration.*
@@ -125,3 +126,8 @@ class LoadBalancer:
catch
case ex: Exception =>
log.warnf(ex, "Failed to update Redis game sets")
@Scheduled(every = "30s")
def periodicRebalanceCheck(): Unit =
try if shouldRebalance then rebalance
catch case ex: Exception => log.warnf(ex, "Periodic rebalance check failed")