feat: scale up on high CPU load, not just subscription count

AutoScaler now checks K8s pod metrics (CPU) in addition to subscription count.
Scale-up triggers if:
1. avgLoad > scaleUpThreshold * maxGamesPerCore, OR
2. Any instance has CPU > 800m

Fixes scenario where instance under heavy CPU load wouldn't scale without
high subscription count. Now responds to compute utilization, not just game count.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 23:43:50 +02:00
parent 4b3b5e7c4e
commit 255e2da33c
2 changed files with 38 additions and 2 deletions
@@ -28,7 +28,7 @@ nowchess:
internal:
secret: ${INTERNAL_SECRET:123abc}
coordinator:
max-games-per-core: 500
max-games-per-core: 100
max-deviation-percent: 20
rebalance-interval: 30s
rebalance-min-interval: 60s
@@ -10,6 +10,7 @@ import io.fabric8.kubernetes.client.KubernetesClient
import io.micrometer.core.instrument.{Gauge, MeterRegistry}
import io.quarkus.scheduler.Scheduled
import org.jboss.logging.Logger
import scala.jdk.CollectionConverters.*
import java.util.concurrent.atomic.AtomicReference
import scala.compiletime.uninitialized
@@ -46,6 +47,7 @@ class AutoScaler:
private val argoApiVersion = "argoproj.io/v1alpha1"
private val argoKind = "Rollout"
private val metricsApiVersion = "metrics.k8s.io/v1beta1"
@PostConstruct
def initMetrics(): Unit =
@@ -70,6 +72,38 @@ class AutoScaler:
}
// scalafix:on DisableSyntax.asInstanceOf
private def isResourceConstrained(instanceId: String): Boolean =
kubeClientOpt.fold(false) { kube =>
try
val pods = kube.pods().inNamespace(config.k8sNamespace).withLabel(config.k8sRolloutLabelSelector).list().getItems.asScala
pods.find(_.getMetadata.getName.contains(instanceId)).exists { pod =>
try
val metricsRes = kube.genericKubernetesResources(metricsApiVersion, "PodMetrics")
.inNamespace(config.k8sNamespace)
.withName(pod.getMetadata.getName)
.get()
val metricsMap = metricsRes.asInstanceOf[java.util.Map[String, AnyRef]]
Option(metricsMap.get("metrics"))
.map(_.asInstanceOf[java.util.Map[String, AnyRef]])
.flatMap(m => Option(m.get("containers")).map(_.asInstanceOf[java.util.List[AnyRef]]))
.filter(!_.isEmpty)
.map(_.get(0).asInstanceOf[java.util.Map[String, AnyRef]])
.flatMap(c => Option(c.get("usage")).map(_.asInstanceOf[java.util.Map[String, AnyRef]]))
.flatMap(u => Option(u.get("cpu")))
.map(_.toString)
.exists { cpuStr =>
val cpuMillis = if cpuStr.endsWith("m") then cpuStr.dropRight(1).toLongOption.getOrElse(0L) else cpuStr.toLongOption.map(_ * 1000).getOrElse(0L)
cpuMillis > 800
}
catch
case _: Exception => false
}
catch
case ex: Exception =>
log.debugf(ex, "Failed to check resource metrics for %s", instanceId)
false
}
def checkAndScale: Unit =
if config.autoScaleEnabled then
val now = System.currentTimeMillis()
@@ -80,7 +114,9 @@ class AutoScaler:
val avgLoad = instances.map(_.subscriptionCount).sum.toDouble / instances.size
avgLoadRef.set(avgLoad)
if avgLoad > config.scaleUpThreshold * config.maxGamesPerCore then scaleUp()
val hasHighCpuOrMemory = instances.exists(inst => isResourceConstrained(inst.instanceId))
if avgLoad > config.scaleUpThreshold * config.maxGamesPerCore || hasHighCpuOrMemory then scaleUp()
else if avgLoad < config.scaleDownThreshold * config.maxGamesPerCore && instances.size > config.scaleMinReplicas then scaleDown()
def scaleUp(): Unit =