feat: implement periodic scaling checks and enhance instance management in AutoScaler
Build & Test (NowChessSystems) TeamCity build failed

This commit is contained in:
2026-05-13 22:08:22 +02:00
parent 0a3c494fa8
commit 3f12f695f1
4 changed files with 85 additions and 50 deletions
@@ -8,6 +8,7 @@ import de.nowchess.coordinator.config.CoordinatorConfig
import io.fabric8.kubernetes.api.model.GenericKubernetesResource import io.fabric8.kubernetes.api.model.GenericKubernetesResource
import io.fabric8.kubernetes.client.KubernetesClient import io.fabric8.kubernetes.client.KubernetesClient
import io.micrometer.core.instrument.{Gauge, MeterRegistry} import io.micrometer.core.instrument.{Gauge, MeterRegistry}
import io.quarkus.scheduler.Scheduled
import org.jboss.logging.Logger import org.jboss.logging.Logger
import java.util.concurrent.atomic.AtomicReference import java.util.concurrent.atomic.AtomicReference
@@ -25,6 +26,12 @@ class AutoScaler:
@Inject @Inject
private var instanceRegistry: InstanceRegistry = uninitialized private var instanceRegistry: InstanceRegistry = uninitialized
@Inject
private var loadBalancer: LoadBalancer = uninitialized
@Inject
private var failoverService: FailoverService = uninitialized
@Inject @Inject
private var meterRegistry: MeterRegistry = uninitialized private var meterRegistry: MeterRegistry = uninitialized
// scalafix:on DisableSyntax.var // scalafix:on DisableSyntax.var
@@ -51,6 +58,11 @@ class AutoScaler:
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment(0) meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment(0)
() ()
/** Scheduled entry point for the auto-scale check.
  *
  * Annotated to fire every 10 seconds. Delegates to `checkAndScale`; any `Exception` raised by
  * that call is logged at WARN level and not rethrown, so a failed check does not propagate to
  * the caller of this method.
  */
@Scheduled(every = "10s")
def periodicScaleCheck(): Unit =
  try checkAndScale
  catch
    case failure: Exception =>
      log.warnf(failure, "Auto-scale check failed")
// scalafix:off DisableSyntax.asInstanceOf // scalafix:off DisableSyntax.asInstanceOf
private def rolloutSpec(rollout: GenericKubernetesResource): Option[java.util.Map[String, AnyRef]] = private def rolloutSpec(rollout: GenericKubernetesResource): Option[java.util.Map[String, AnyRef]] =
Option(rollout.get[AnyRef]("spec")).collect { case m: java.util.Map[?, ?] => Option(rollout.get[AnyRef]("spec")).collect { case m: java.util.Map[?, ?] =>
@@ -105,6 +117,7 @@ class AutoScaler:
currentReplicas, currentReplicas,
currentReplicas + 1, currentReplicas + 1,
) )
loadBalancer.rebalance
else log.infof("Already at max replicas %d for %s", maxReplicas, config.k8sRolloutName) else log.infof("Already at max replicas %d for %s", maxReplicas, config.k8sRolloutName)
case _ => () case _ => ()
} }
@@ -116,6 +129,17 @@ class AutoScaler:
def scaleDown(): Unit = def scaleDown(): Unit =
log.info("Scaling down Argo Rollout") log.info("Scaling down Argo Rollout")
val underloadedInstance = instanceRegistry.getAllInstances
.filter(_.state == "HEALTHY")
.minByOption(_.subscriptionCount)
underloadedInstance.foreach { inst =>
log.infof("Draining instance %s before scale-down", inst.instanceId)
failoverService
.onInstanceStreamDropped(inst.instanceId)
.subscribe()
.`with`(
_ =>
kubeClientOpt match kubeClientOpt match
case None => case None =>
log.warn("Kubernetes client not available, cannot scale") log.warn("Kubernetes client not available, cannot scale")
@@ -155,4 +179,11 @@ class AutoScaler:
catch catch
case ex: Exception => case ex: Exception =>
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment() meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName) log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName),
ex =>
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
log.warnf(ex, "Failed to drain instance %s before scale-down", inst.instanceId),
)
}
if underloadedInstance.isEmpty then log.warn("No healthy instances found for scale-down")
@@ -76,6 +76,10 @@ class InstanceRegistry:
.onItem() .onItem()
.transformToUni { value => .transformToUni { value =>
try try
if value == null then
log.debugf("Instance %s metadata missing from Redis (may have expired)", instanceId)
Uni.createFrom().item(())
else
val metadata = mapper.readValue(value, classOf[InstanceMetadata]) val metadata = mapper.readValue(value, classOf[InstanceMetadata])
val isNew = !instances.containsKey(instanceId) val isNew = !instances.containsKey(instanceId)
instances.put(instanceId, metadata) instances.put(instanceId, metadata)
@@ -60,8 +60,8 @@ class GameEngine(
@SuppressWarnings(Array("DisableSyntax.var")) @SuppressWarnings(Array("DisableSyntax.var"))
private var pendingTakebackRequest: Option[Color] = initialTakebackRequest private var pendingTakebackRequest: Option[Color] = initialTakebackRequest
meterRegistry.foreach { reg =>
GameEngine.activeGamesCount.incrementAndGet() GameEngine.activeGamesCount.incrementAndGet()
meterRegistry.foreach { reg =>
reg.counter("nowchess.games.started").increment() reg.counter("nowchess.games.started").increment()
} }
private def gamesCompletedCounter(result: String): Counter = private def gamesCompletedCounter(result: String): Counter =
@@ -199,7 +199,7 @@ class InstanceHeartbeatService:
val json = mapper.writeValueAsString(metadata) val json = mapper.writeValueAsString(metadata)
reactiveRedis reactiveRedis
.value(classOf[String]) .value(classOf[String])
.setex(key, 5L, json) .setex(key, 15L, json)
.subscribe() .subscribe()
.`with`( .`with`(
_ => redisHeartbeatPending.set(false), _ => redisHeartbeatPending.set(false),