refactor: resource-based scaling only; remove health-check-triggered scaling

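Scale up: only when an instance is resource-constrained (high CPU/memory); average game load alone no longer triggers a scale-up
Scale down: only when NO instance is resource-constrained AND average game load is below the scale-down threshold (and replicas exceed the configured minimum)
Remove: scale-up triggered by unexpected instance evictions or health-check failures
Keep: health monitoring (mark dead, delete pod, fail over games), but it no longer triggers scaling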

Prevents cascading scale-ups caused by transient health-check failures.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-05-18 20:18:56 +02:00
parent b4c75e2a0f
commit 32a12737e3
2 changed files with 3 additions and 9 deletions
@@ -182,16 +182,14 @@ class AutoScaler:
val hasHighCpuOrMemory = constrainedInstance.isDefined val hasHighCpuOrMemory = constrainedInstance.isDefined
log.infof( log.infof(
"Scale check: instances=%d avgLoad=%.1f scaleUpAt=%.1f scaleDownAt=%.1f resourceConstrained=%s", "Scale check: instances=%d avgLoad=%.1f resourceConstrained=%s",
instances.size, instances.size,
avgLoad, avgLoad,
scaleUpLoad,
scaleDownLoad,
constrainedInstance.map(_.instanceId).getOrElse("none"), constrainedInstance.map(_.instanceId).getOrElse("none"),
) )
if avgLoad > scaleUpLoad || hasHighCpuOrMemory then scaleUp() if hasHighCpuOrMemory then scaleUp()
else if avgLoad < scaleDownLoad && instances.size > config.scaleMinReplicas if !hasHighCpuOrMemory && avgLoad < scaleDownLoad && instances.size > config.scaleMinReplicas
then scaleDown() then scaleDown()
private def patchRolloutReplicas( private def patchRolloutReplicas(
@@ -88,9 +88,7 @@ class HealthMonitor:
if evicted.nonEmpty then if evicted.nonEmpty then
log.warnf("Evicted %d stale instances: %s", evicted.size, evicted.mkString(", ")) log.warnf("Evicted %d stale instances: %s", evicted.size, evicted.mkString(", "))
evicted.foreach(deleteK8sPod) evicted.foreach(deleteK8sPod)
val unexpectedEvictions = evicted.filterNot(autoScaler.isDrainingForScaleDown)
evicted.foreach(autoScaler.clearDraining) evicted.foreach(autoScaler.clearDraining)
if unexpectedEvictions.nonEmpty then autoScaler.scaleUp()
val instances = instanceRegistry.getAllInstances val instances = instanceRegistry.getAllInstances
val failed = instances.collect { inst => val failed = instances.collect { inst =>
val isHealthy = checkHealth(inst.instanceId) val isHealthy = checkHealth(inst.instanceId)
@@ -101,8 +99,6 @@ class HealthMonitor:
Some(inst.instanceId) Some(inst.instanceId)
else None else None
}.flatten }.flatten
val unexpectedFailures = failed.filterNot(autoScaler.isDrainingForScaleDown)
if unexpectedFailures.nonEmpty then autoScaler.scaleUp()
private def checkHealth(instanceId: String): Boolean = private def checkHealth(instanceId: String): Boolean =
val redisHealthy = checkRedisHeartbeat(instanceId) val redisHealthy = checkRedisHeartbeat(instanceId)