From 32a12737e3b47500072d5246638d059cc626f21d Mon Sep 17 00:00:00 2001 From: Janis Date: Mon, 18 May 2026 20:18:56 +0200 Subject: [PATCH] refactor: resource-based scaling only, remove health-check triggered scaling Scale up: only if resource constrained (CPU/memory) Scale down: only if NOT resource constrained AND game load low Remove: triggering scale-up on unexpected instance failures Keep: health monitoring (mark dead, delete pod, failover games) but no scaling Prevents cascade scaling from transient health check failures. Co-Authored-By: Claude Haiku 4.5 --- .../de/nowchess/coordinator/service/AutoScaler.scala | 8 +++----- .../de/nowchess/coordinator/service/HealthMonitor.scala | 4 ---- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/AutoScaler.scala b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/AutoScaler.scala index c2ae167..376d4b8 100644 --- a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/AutoScaler.scala +++ b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/AutoScaler.scala @@ -182,16 +182,14 @@ class AutoScaler: val hasHighCpuOrMemory = constrainedInstance.isDefined log.infof( - "Scale check: instances=%d avgLoad=%.1f scaleUpAt=%.1f scaleDownAt=%.1f resourceConstrained=%s", + "Scale check: instances=%d avgLoad=%.1f resourceConstrained=%s", instances.size, avgLoad, - scaleUpLoad, - scaleDownLoad, constrainedInstance.map(_.instanceId).getOrElse("none"), ) - if avgLoad > scaleUpLoad || hasHighCpuOrMemory then scaleUp() - else if avgLoad < scaleDownLoad && instances.size > config.scaleMinReplicas + if hasHighCpuOrMemory then scaleUp() + if !hasHighCpuOrMemory && avgLoad < scaleDownLoad && instances.size > config.scaleMinReplicas then scaleDown() private def patchRolloutReplicas( diff --git a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala index 9b2de02..63dc79c 100644 --- a/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala +++ b/modules/coordinator/src/main/scala/de/nowchess/coordinator/service/HealthMonitor.scala @@ -88,9 +88,7 @@ class HealthMonitor: if evicted.nonEmpty then log.warnf("Evicted %d stale instances: %s", evicted.size, evicted.mkString(", ")) evicted.foreach(deleteK8sPod) - val unexpectedEvictions = evicted.filterNot(autoScaler.isDrainingForScaleDown) evicted.foreach(autoScaler.clearDraining) - if unexpectedEvictions.nonEmpty then autoScaler.scaleUp() val instances = instanceRegistry.getAllInstances val failed = instances.collect { inst => val isHealthy = checkHealth(inst.instanceId) @@ -101,8 +99,6 @@ class HealthMonitor: Some(inst.instanceId) else None }.flatten - val unexpectedFailures = failed.filterNot(autoScaler.isDrainingForScaleDown) - if unexpectedFailures.nonEmpty then autoScaler.scaleUp() private def checkHealth(instanceId: String): Boolean = val redisHealthy = checkRedisHeartbeat(instanceId)