feat: implement periodic scaling checks and enhance instance management in AutoScaler
Build & Test (NowChessSystems) TeamCity build failed

This commit is contained in:
2026-05-13 22:08:22 +02:00
parent 0a3c494fa8
commit 3f12f695f1
4 changed files with 85 additions and 50 deletions
@@ -8,6 +8,7 @@ import de.nowchess.coordinator.config.CoordinatorConfig
import io.fabric8.kubernetes.api.model.GenericKubernetesResource
import io.fabric8.kubernetes.client.KubernetesClient
import io.micrometer.core.instrument.{Gauge, MeterRegistry}
import io.quarkus.scheduler.Scheduled
import org.jboss.logging.Logger
import java.util.concurrent.atomic.AtomicReference
@@ -25,6 +26,12 @@ class AutoScaler:
@Inject
private var instanceRegistry: InstanceRegistry = uninitialized
@Inject
private var loadBalancer: LoadBalancer = uninitialized
@Inject
private var failoverService: FailoverService = uninitialized
@Inject
private var meterRegistry: MeterRegistry = uninitialized
// scalafix:on DisableSyntax.var
@@ -51,6 +58,11 @@ class AutoScaler:
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment(0)
()
/** Scheduler entry point: invokes `checkAndScale` on a fixed 10-second interval.
  *
  * Any `Exception` thrown by the check is logged at warn level and swallowed here
  * instead of being propagated to the caller.
  */
@Scheduled(every = "10s")
def periodicScaleCheck(): Unit =
  try checkAndScale
  catch
    case failure: Exception =>
      log.warnf(failure, "Auto-scale check failed")
// scalafix:off DisableSyntax.asInstanceOf
private def rolloutSpec(rollout: GenericKubernetesResource): Option[java.util.Map[String, AnyRef]] =
Option(rollout.get[AnyRef]("spec")).collect { case m: java.util.Map[?, ?] =>
@@ -105,6 +117,7 @@ class AutoScaler:
currentReplicas,
currentReplicas + 1,
)
loadBalancer.rebalance
else log.infof("Already at max replicas %d for %s", maxReplicas, config.k8sRolloutName)
case _ => ()
}
@@ -116,43 +129,61 @@ class AutoScaler:
def scaleDown(): Unit =
log.info("Scaling down Argo Rollout")
kubeClientOpt match
case None =>
log.warn("Kubernetes client not available, cannot scale")
case Some(kube) =>
try
Option(
kube
.genericKubernetesResources(argoApiVersion, argoKind)
.inNamespace(config.k8sNamespace)
.withName(config.k8sRolloutName)
.get(),
).foreach { rollout =>
rolloutSpec(rollout).foreach { spec =>
spec.get("replicas") match
case replicas: Integer =>
val currentReplicas = replicas.intValue()
val minReplicas = config.scaleMinReplicas
val underloadedInstance = instanceRegistry.getAllInstances
.filter(_.state == "HEALTHY")
.minByOption(_.subscriptionCount)
if currentReplicas > minReplicas then
spec.put("replicas", Integer.valueOf(currentReplicas - 1))
underloadedInstance.foreach { inst =>
log.infof("Draining instance %s before scale-down", inst.instanceId)
failoverService
.onInstanceStreamDropped(inst.instanceId)
.subscribe()
.`with`(
_ =>
kubeClientOpt match
case None =>
log.warn("Kubernetes client not available, cannot scale")
case Some(kube) =>
try
Option(
kube
.genericKubernetesResources(argoApiVersion, argoKind)
.inNamespace(config.k8sNamespace)
.resource(rollout)
.update()
meterRegistry.counter("nowchess.coordinator.scale.events", "direction", "down").increment()
log.infof(
"Scaled down %s from %d to %d replicas",
config.k8sRolloutName,
currentReplicas,
currentReplicas - 1,
)
else log.infof("Already at min replicas %d for %s", minReplicas, config.k8sRolloutName)
case _ => ()
}
}
catch
case ex: Exception =>
.withName(config.k8sRolloutName)
.get(),
).foreach { rollout =>
rolloutSpec(rollout).foreach { spec =>
spec.get("replicas") match
case replicas: Integer =>
val currentReplicas = replicas.intValue()
val minReplicas = config.scaleMinReplicas
if currentReplicas > minReplicas then
spec.put("replicas", Integer.valueOf(currentReplicas - 1))
kube
.genericKubernetesResources(argoApiVersion, argoKind)
.inNamespace(config.k8sNamespace)
.resource(rollout)
.update()
meterRegistry.counter("nowchess.coordinator.scale.events", "direction", "down").increment()
log.infof(
"Scaled down %s from %d to %d replicas",
config.k8sRolloutName,
currentReplicas,
currentReplicas - 1,
)
else log.infof("Already at min replicas %d for %s", minReplicas, config.k8sRolloutName)
case _ => ()
}
}
catch
case ex: Exception =>
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName),
ex =>
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName)
log.warnf(ex, "Failed to drain instance %s before scale-down", inst.instanceId),
)
}
if underloadedInstance.isEmpty then log.warn("No healthy instances found for scale-down")
@@ -76,20 +76,24 @@ class InstanceRegistry:
.onItem()
.transformToUni { value =>
try
val metadata = mapper.readValue(value, classOf[InstanceMetadata])
val isNew = !instances.containsKey(instanceId)
instances.put(instanceId, metadata)
if isNew then
meterRegistry.counter("nowchess.coordinator.instances.joined").increment()
log.infof("Instance %s joined registry (subscriptions=%d)", instanceId, metadata.subscriptionCount)
if value == null then
log.debugf("Instance %s metadata missing from Redis (may have expired)", instanceId)
Uni.createFrom().item(())
else
log.debugf(
"Instance %s updated (subscriptions=%d state=%s)",
instanceId,
metadata.subscriptionCount,
metadata.state,
)
Uni.createFrom().item(())
val metadata = mapper.readValue(value, classOf[InstanceMetadata])
val isNew = !instances.containsKey(instanceId)
instances.put(instanceId, metadata)
if isNew then
meterRegistry.counter("nowchess.coordinator.instances.joined").increment()
log.infof("Instance %s joined registry (subscriptions=%d)", instanceId, metadata.subscriptionCount)
else
log.debugf(
"Instance %s updated (subscriptions=%d state=%s)",
instanceId,
metadata.subscriptionCount,
metadata.state,
)
Uni.createFrom().item(())
catch
case ex: Exception =>
log.warnf(ex, "Failed to parse instance metadata for %s — removing from registry", instanceId)
@@ -60,8 +60,8 @@ class GameEngine(
@SuppressWarnings(Array("DisableSyntax.var"))
private var pendingTakebackRequest: Option[Color] = initialTakebackRequest
GameEngine.activeGamesCount.incrementAndGet()
meterRegistry.foreach { reg =>
GameEngine.activeGamesCount.incrementAndGet()
reg.counter("nowchess.games.started").increment()
}
private def gamesCompletedCounter(result: String): Counter =
@@ -199,7 +199,7 @@ class InstanceHeartbeatService:
val json = mapper.writeValueAsString(metadata)
reactiveRedis
.value(classOf[String])
.setex(key, 5L, json)
.setex(key, 15L, json)
.subscribe()
.`with`(
_ => redisHeartbeatPending.set(false),