feat: implement periodic scaling checks and enhance instance management in AutoScaler
Build & Test (NowChessSystems) TeamCity build failed
Build & Test (NowChessSystems) TeamCity build failed
This commit is contained in:
+32
-1
@@ -8,6 +8,7 @@ import de.nowchess.coordinator.config.CoordinatorConfig
|
|||||||
import io.fabric8.kubernetes.api.model.GenericKubernetesResource
|
import io.fabric8.kubernetes.api.model.GenericKubernetesResource
|
||||||
import io.fabric8.kubernetes.client.KubernetesClient
|
import io.fabric8.kubernetes.client.KubernetesClient
|
||||||
import io.micrometer.core.instrument.{Gauge, MeterRegistry}
|
import io.micrometer.core.instrument.{Gauge, MeterRegistry}
|
||||||
|
import io.quarkus.scheduler.Scheduled
|
||||||
import org.jboss.logging.Logger
|
import org.jboss.logging.Logger
|
||||||
|
|
||||||
import java.util.concurrent.atomic.AtomicReference
|
import java.util.concurrent.atomic.AtomicReference
|
||||||
@@ -25,6 +26,12 @@ class AutoScaler:
|
|||||||
@Inject
|
@Inject
|
||||||
private var instanceRegistry: InstanceRegistry = uninitialized
|
private var instanceRegistry: InstanceRegistry = uninitialized
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
private var loadBalancer: LoadBalancer = uninitialized
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
private var failoverService: FailoverService = uninitialized
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
private var meterRegistry: MeterRegistry = uninitialized
|
private var meterRegistry: MeterRegistry = uninitialized
|
||||||
// scalafix:on DisableSyntax.var
|
// scalafix:on DisableSyntax.var
|
||||||
@@ -51,6 +58,11 @@ class AutoScaler:
|
|||||||
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment(0)
|
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment(0)
|
||||||
()
|
()
|
||||||
|
|
||||||
|
/** Scheduled entry point that triggers the auto-scale check.
  *
  * Annotated to run every 10 seconds. Any `Exception` raised by `checkAndScale`
  * is logged at warn level instead of being rethrown to the caller.
  */
@Scheduled(every = "10s")
def periodicScaleCheck(): Unit =
  try checkAndScale
  catch
    case failure: Exception =>
      // Best-effort: record the failure and return normally.
      log.warnf(failure, "Auto-scale check failed")
|
||||||
|
|
||||||
// scalafix:off DisableSyntax.asInstanceOf
|
// scalafix:off DisableSyntax.asInstanceOf
|
||||||
private def rolloutSpec(rollout: GenericKubernetesResource): Option[java.util.Map[String, AnyRef]] =
|
private def rolloutSpec(rollout: GenericKubernetesResource): Option[java.util.Map[String, AnyRef]] =
|
||||||
Option(rollout.get[AnyRef]("spec")).collect { case m: java.util.Map[?, ?] =>
|
Option(rollout.get[AnyRef]("spec")).collect { case m: java.util.Map[?, ?] =>
|
||||||
@@ -105,6 +117,7 @@ class AutoScaler:
|
|||||||
currentReplicas,
|
currentReplicas,
|
||||||
currentReplicas + 1,
|
currentReplicas + 1,
|
||||||
)
|
)
|
||||||
|
loadBalancer.rebalance
|
||||||
else log.infof("Already at max replicas %d for %s", maxReplicas, config.k8sRolloutName)
|
else log.infof("Already at max replicas %d for %s", maxReplicas, config.k8sRolloutName)
|
||||||
case _ => ()
|
case _ => ()
|
||||||
}
|
}
|
||||||
@@ -116,6 +129,17 @@ class AutoScaler:
|
|||||||
|
|
||||||
def scaleDown(): Unit =
|
def scaleDown(): Unit =
|
||||||
log.info("Scaling down Argo Rollout")
|
log.info("Scaling down Argo Rollout")
|
||||||
|
val underloadedInstance = instanceRegistry.getAllInstances
|
||||||
|
.filter(_.state == "HEALTHY")
|
||||||
|
.minByOption(_.subscriptionCount)
|
||||||
|
|
||||||
|
underloadedInstance.foreach { inst =>
|
||||||
|
log.infof("Draining instance %s before scale-down", inst.instanceId)
|
||||||
|
failoverService
|
||||||
|
.onInstanceStreamDropped(inst.instanceId)
|
||||||
|
.subscribe()
|
||||||
|
.`with`(
|
||||||
|
_ =>
|
||||||
kubeClientOpt match
|
kubeClientOpt match
|
||||||
case None =>
|
case None =>
|
||||||
log.warn("Kubernetes client not available, cannot scale")
|
log.warn("Kubernetes client not available, cannot scale")
|
||||||
@@ -155,4 +179,11 @@ class AutoScaler:
|
|||||||
catch
|
catch
|
||||||
case ex: Exception =>
|
case ex: Exception =>
|
||||||
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
|
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
|
||||||
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName)
|
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName),
|
||||||
|
ex =>
|
||||||
|
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
|
||||||
|
log.warnf(ex, "Failed to drain instance %s before scale-down", inst.instanceId),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if underloadedInstance.isEmpty then log.warn("No healthy instances found for scale-down")
|
||||||
|
|||||||
+4
@@ -76,6 +76,10 @@ class InstanceRegistry:
|
|||||||
.onItem()
|
.onItem()
|
||||||
.transformToUni { value =>
|
.transformToUni { value =>
|
||||||
try
|
try
|
||||||
|
if value == null then
|
||||||
|
log.debugf("Instance %s metadata missing from Redis (may have expired)", instanceId)
|
||||||
|
Uni.createFrom().item(())
|
||||||
|
else
|
||||||
val metadata = mapper.readValue(value, classOf[InstanceMetadata])
|
val metadata = mapper.readValue(value, classOf[InstanceMetadata])
|
||||||
val isNew = !instances.containsKey(instanceId)
|
val isNew = !instances.containsKey(instanceId)
|
||||||
instances.put(instanceId, metadata)
|
instances.put(instanceId, metadata)
|
||||||
|
|||||||
@@ -60,8 +60,8 @@ class GameEngine(
|
|||||||
@SuppressWarnings(Array("DisableSyntax.var"))
|
@SuppressWarnings(Array("DisableSyntax.var"))
|
||||||
private var pendingTakebackRequest: Option[Color] = initialTakebackRequest
|
private var pendingTakebackRequest: Option[Color] = initialTakebackRequest
|
||||||
|
|
||||||
meterRegistry.foreach { reg =>
|
|
||||||
GameEngine.activeGamesCount.incrementAndGet()
|
GameEngine.activeGamesCount.incrementAndGet()
|
||||||
|
meterRegistry.foreach { reg =>
|
||||||
reg.counter("nowchess.games.started").increment()
|
reg.counter("nowchess.games.started").increment()
|
||||||
}
|
}
|
||||||
private def gamesCompletedCounter(result: String): Counter =
|
private def gamesCompletedCounter(result: String): Counter =
|
||||||
|
|||||||
+1
-1
@@ -199,7 +199,7 @@ class InstanceHeartbeatService:
|
|||||||
val json = mapper.writeValueAsString(metadata)
|
val json = mapper.writeValueAsString(metadata)
|
||||||
reactiveRedis
|
reactiveRedis
|
||||||
.value(classOf[String])
|
.value(classOf[String])
|
||||||
.setex(key, 5L, json)
|
.setex(key, 15L, json)
|
||||||
.subscribe()
|
.subscribe()
|
||||||
.`with`(
|
.`with`(
|
||||||
_ => redisHeartbeatPending.set(false),
|
_ => redisHeartbeatPending.set(false),
|
||||||
|
|||||||
Reference in New Issue
Block a user