Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c65a1393b9 | |||
| 4a36096a55 | |||
| 960a419792 | |||
| 68d6c1d36f | |||
| b991878214 | |||
| 43525d41a3 | |||
| 6bf1013710 | |||
| 255e2da33c | |||
| 4b3b5e7c4e | |||
| 1d121c727c |
@@ -264,6 +264,7 @@
|
||||
- class CacheEvictionManager
|
||||
- function setRedisPrefix
|
||||
- function initializeMetrics
|
||||
- function periodicCacheEviction
|
||||
- function evictStaleGames
|
||||
- `modules/coordinator/src/main/scala/de/nowchess/coordinator/service/FailoverService.scala`
|
||||
- class FailoverService
|
||||
@@ -289,6 +290,7 @@
|
||||
- function setRedisPrefix
|
||||
- function shouldRebalance
|
||||
- function rebalance
|
||||
- function periodicRebalanceCheck
|
||||
- `modules/core/src/main/scala/de/nowchess/chess/adapter/RuleSetRestAdapter.scala`
|
||||
- class RuleSetRestAdapter
|
||||
- function candidateMoves
|
||||
|
||||
@@ -208,6 +208,7 @@
|
||||
- class CacheEvictionManager
|
||||
- function setRedisPrefix
|
||||
- function initializeMetrics
|
||||
- function periodicCacheEviction
|
||||
- function evictStaleGames
|
||||
- `modules/coordinator/src/main/scala/de/nowchess/coordinator/service/FailoverService.scala`
|
||||
- class FailoverService
|
||||
@@ -233,6 +234,7 @@
|
||||
- function setRedisPrefix
|
||||
- function shouldRebalance
|
||||
- function rebalance
|
||||
- function periodicRebalanceCheck
|
||||
- `modules/core/src/main/scala/de/nowchess/chess/adapter/RuleSetRestAdapter.scala`
|
||||
- class RuleSetRestAdapter
|
||||
- function candidateMoves
|
||||
|
||||
@@ -438,3 +438,89 @@
|
||||
* streamline logging for evicted instances in InstanceRegistry ([10937e7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/10937e756a56e0e8fcf939decfdcaa4394506cc0))
|
||||
* update grpcServer variable to use Instance wrapper and add optional access method ([d5c8da2](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d5c8da20f8805199e920ea5afbd9cdb39a078e40))
|
||||
* update HealthMonitor to evict instances without associated pods ([0f41f13](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/0f41f13ce68b76846684bab67241a122250dfaf9))
|
||||
## (2026-05-13)
|
||||
|
||||
### Features
|
||||
|
||||
* add coordinator startup validation and K8s pod watch ([81b045d](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/81b045d01bb054a4bc9dc9e02fc30f814e756205))
|
||||
* add initialization metrics for various services ([d438e97](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d438e97f32bdde0bfc63c1b4a8cc810cdd093166))
|
||||
* add OpenTelemetry trace configuration with parentbased sampler ([3904d5a](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/3904d5ad8ad4930ddee65287a7bfab785a6148f5))
|
||||
* add periodic health check to evict dead instances ([380a2cc](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/380a2cceeb5873bf93ff17a1e87d62408ef8e178))
|
||||
* **config:** update application.yml for PostgreSQL and remove staging/production configurations ([2404e61](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/2404e6164c3b50ffccbea5238d636060d6abe4d6))
|
||||
* **config:** update application.yml for staging and production environments ([6113432](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/6113432a14c476a3a0dfc0d449e17d023697f2ba))
|
||||
* configure logging and add OpenTelemetry support ([#49](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/49)) ([d57c488](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d57c4886612d1d92da0e1b79209fc83e6ef537a1))
|
||||
* **docker:** add .dockerignore and .gitignore files for build exclusions ([c987d8e](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/c987d8e258c0e6c4cfbdaa8381c64c410d7a2b83))
|
||||
* **docker:** add Dockerfiles for building Quarkus application in native and JVM modes ([3f2d2bb](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/3f2d2bb4c97fa8cddba66e1da4427c54236dfeed))
|
||||
* **docker:** add Dockerfiles for Quarkus application in JVM and native modes ([34b9933](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/34b993304670cf2aa62cd2f6460cee7b9864b08e))
|
||||
* implement periodic scaling checks and enhance instance management in AutoScaler ([3f12f69](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/3f12f695f132b92f634d98df2c037292498b6e86))
|
||||
* **logging:** add DEBUG/INFO/WARN logging across services (NCS-72) ([#41](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/41)) ([804a4bf](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/804a4bf179e3dfb19e2be4390e7e543caf5237c6))
|
||||
* NCS-78 Add Traceability to the Applications ([#46](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/46)) ([649566e](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/649566eb3fcf38f91c8896a739f74ea318af312d))
|
||||
* NCS-78 Add Traceability to the Applications ([#47](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/47)) ([87dfc6c](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/87dfc6c2bcce7f7d58fc641bd8d468a2e584c108))
|
||||
* scale up on high CPU load, not just subscription count ([255e2da](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/255e2da33c37e186ed14f2862f2d2e1b4adc59bf))
|
||||
* true-microservices ([#40](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/40)) ([5909242](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/590924254e8a2754de661a57a03e43f89ceb6299))
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* add instance-dead-timeout configuration and update HealthMonitor to use it for stale instance eviction ([be0b710](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/be0b710543b542da5c301efef7d2d587d0ba758a))
|
||||
* clean up code formatting and improve error handling in gRPC server and failover service ([ad9495a](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/ad9495afa3e93593b57154a187346c9b01393911))
|
||||
* coordinator auto-scaling, cache eviction, rebalancing, and grpc timeouts ([d0c7169](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d0c71693bb6f55fafdce5bcea0d5f38b9bb505ef))
|
||||
* **coordinator:** refine type casting in rolloutSpec method ([#45](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/45)) ([d522f7f](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d522f7f6edf9c985f03dd16816439d4184f1a589))
|
||||
* **coordinator:** use genericKubernetesResources API for Argo Rollout scaling ([#43](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/43)) ([fa3c6b2](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/fa3c6b2886dc59c14c5dad834acc9b41e42023bb))
|
||||
* **coordinator:** use genericKubernetesResources API for Argo Rollout scaling ([#44](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/44)) ([82d0b75](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/82d0b754be1075084944b466858672d944f9f7d8))
|
||||
* **dependencies:** replace Fabric8 Kubernetes client with Quarkus Kubernetes client ([5f44570](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/5f44570b357277d09f33b7296860c421e2e70ce0))
|
||||
* don't block event loop during scale-down drain ([1d121c7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/1d121c727cbd4df477827cf64d065b7356a56e59))
|
||||
* don't trigger scale-down if already at min replicas ([4b3b5e7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/4b3b5e7c4ed9b3cd4fe2490e9f268f2e3a0d9e85))
|
||||
* enhance AutoScaler and InstanceRegistry for replica management and stale instance eviction ([b4920d3](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/b4920d3817e58bda94d7764e608b856ce9a909f7))
|
||||
* **middleware:** update paths for bot generation and stockfish configuration ([2dd0501](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/2dd0501687db08dcd242359f6837125baf8a2fdc))
|
||||
* **redis:** update Redis configuration with max pool size and waiting parameters ([5baf6a7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/5baf6a7cdbea484fc49c02e2b5a1c3919b7fa2c4))
|
||||
* remove corrupted instances immediately and evict dead instances ([43184d2](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/43184d296da5a6a7b760ac90c2b739220d86bce3))
|
||||
* replace null checks with Option in coordinator ([2b04d7f](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/2b04d7fa713e06662bff5afe3fb3f9d04541ce51))
|
||||
* scalafix violations in metrics check and health monitor ([b991878](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/b99187821489b296d66da5a5a13f5d545b6045c6))
|
||||
* scale up immediately when instance is lost ([43525d4](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/43525d41a3884c00f1db26bf3c8c4cd9a607c260))
|
||||
* streamline logging for evicted instances in InstanceRegistry ([10937e7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/10937e756a56e0e8fcf939decfdcaa4394506cc0))
|
||||
* update grpcServer variable to use Instance wrapper and add optional access method ([d5c8da2](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d5c8da20f8805199e920ea5afbd9cdb39a078e40))
|
||||
* update HealthMonitor to evict instances without associated pods ([0f41f13](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/0f41f13ce68b76846684bab67241a122250dfaf9))
|
||||
## (2026-05-14)
|
||||
|
||||
### Features
|
||||
|
||||
* add coordinator startup validation and K8s pod watch ([81b045d](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/81b045d01bb054a4bc9dc9e02fc30f814e756205))
|
||||
* add initialization metrics for various services ([d438e97](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d438e97f32bdde0bfc63c1b4a8cc810cdd093166))
|
||||
* add OpenTelemetry trace configuration with parentbased sampler ([3904d5a](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/3904d5ad8ad4930ddee65287a7bfab785a6148f5))
|
||||
* add periodic health check to evict dead instances ([380a2cc](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/380a2cceeb5873bf93ff17a1e87d62408ef8e178))
|
||||
* **config:** update application.yml for PostgreSQL and remove staging/production configurations ([2404e61](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/2404e6164c3b50ffccbea5238d636060d6abe4d6))
|
||||
* **config:** update application.yml for staging and production environments ([6113432](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/6113432a14c476a3a0dfc0d449e17d023697f2ba))
|
||||
* configure logging and add OpenTelemetry support ([#49](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/49)) ([d57c488](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d57c4886612d1d92da0e1b79209fc83e6ef537a1))
|
||||
* **docker:** add .dockerignore and .gitignore files for build exclusions ([c987d8e](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/c987d8e258c0e6c4cfbdaa8381c64c410d7a2b83))
|
||||
* **docker:** add Dockerfiles for building Quarkus application in native and JVM modes ([3f2d2bb](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/3f2d2bb4c97fa8cddba66e1da4427c54236dfeed))
|
||||
* **docker:** add Dockerfiles for Quarkus application in JVM and native modes ([34b9933](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/34b993304670cf2aa62cd2f6460cee7b9864b08e))
|
||||
* implement periodic scaling checks and enhance instance management in AutoScaler ([3f12f69](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/3f12f695f132b92f634d98df2c037292498b6e86))
|
||||
* **logging:** add DEBUG/INFO/WARN logging across services (NCS-72) ([#41](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/41)) ([804a4bf](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/804a4bf179e3dfb19e2be4390e7e543caf5237c6))
|
||||
* NCS-78 Add Traceability to the Applications ([#46](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/46)) ([649566e](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/649566eb3fcf38f91c8896a739f74ea318af312d))
|
||||
* NCS-78 Add Traceability to the Applications ([#47](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/47)) ([87dfc6c](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/87dfc6c2bcce7f7d58fc641bd8d468a2e584c108))
|
||||
* scale up on high CPU load, not just subscription count ([255e2da](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/255e2da33c37e186ed14f2862f2d2e1b4adc59bf))
|
||||
* true-microservices ([#40](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/40)) ([5909242](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/590924254e8a2754de661a57a03e43f89ceb6299))
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* add instance-dead-timeout configuration and update HealthMonitor to use it for stale instance eviction ([be0b710](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/be0b710543b542da5c301efef7d2d587d0ba758a))
|
||||
* clean up code formatting and improve error handling in gRPC server and failover service ([ad9495a](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/ad9495afa3e93593b57154a187346c9b01393911))
|
||||
* coordinator auto-scaling, cache eviction, rebalancing, and grpc timeouts ([d0c7169](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d0c71693bb6f55fafdce5bcea0d5f38b9bb505ef))
|
||||
* **coordinator:** refine type casting in rolloutSpec method ([#45](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/45)) ([d522f7f](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d522f7f6edf9c985f03dd16816439d4184f1a589))
|
||||
* **coordinator:** use genericKubernetesResources API for Argo Rollout scaling ([#43](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/43)) ([fa3c6b2](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/fa3c6b2886dc59c14c5dad834acc9b41e42023bb))
|
||||
* **coordinator:** use genericKubernetesResources API for Argo Rollout scaling ([#44](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/44)) ([82d0b75](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/82d0b754be1075084944b466858672d944f9f7d8))
|
||||
* **dependencies:** replace Fabric8 Kubernetes client with Quarkus Kubernetes client ([5f44570](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/5f44570b357277d09f33b7296860c421e2e70ce0))
|
||||
* don't block event loop during scale-down drain ([1d121c7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/1d121c727cbd4df477827cf64d065b7356a56e59))
|
||||
* don't trigger scale-down if already at min replicas ([4b3b5e7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/4b3b5e7c4ed9b3cd4fe2490e9f268f2e3a0d9e85))
|
||||
* enhance AutoScaler and InstanceRegistry for replica management and stale instance eviction ([b4920d3](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/b4920d3817e58bda94d7764e608b856ce9a909f7))
|
||||
* force-delete hanging pods and remove failed instances from registry ([960a419](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/960a419792e1161fb7241e465b7349efe4a10137))
|
||||
* linter formatting and improve code readability ([4a36096](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/4a36096a5586e3f321d9c34c53e60d02bcc02c55))
|
||||
* **middleware:** update paths for bot generation and stockfish configuration ([2dd0501](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/2dd0501687db08dcd242359f6837125baf8a2fdc))
|
||||
* **redis:** update Redis configuration with max pool size and waiting parameters ([5baf6a7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/5baf6a7cdbea484fc49c02e2b5a1c3919b7fa2c4))
|
||||
* remove corrupted instances immediately and evict dead instances ([43184d2](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/43184d296da5a6a7b760ac90c2b739220d86bce3))
|
||||
* replace null checks with Option in coordinator ([2b04d7f](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/2b04d7fa713e06662bff5afe3fb3f9d04541ce51))
|
||||
* scalafix violations in metrics check and health monitor ([b991878](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/b99187821489b296d66da5a5a13f5d545b6045c6))
|
||||
* scale up immediately when instance is lost ([43525d4](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/43525d41a3884c00f1db26bf3c8c4cd9a607c260))
|
||||
* streamline logging for evicted instances in InstanceRegistry ([10937e7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/10937e756a56e0e8fcf939decfdcaa4394506cc0))
|
||||
* update grpcServer variable to use Instance wrapper and add optional access method ([d5c8da2](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d5c8da20f8805199e920ea5afbd9cdb39a078e40))
|
||||
* update HealthMonitor to evict instances without associated pods ([0f41f13](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/0f41f13ce68b76846684bab67241a122250dfaf9))
|
||||
|
||||
@@ -28,7 +28,7 @@ nowchess:
|
||||
internal:
|
||||
secret: ${INTERNAL_SECRET:123abc}
|
||||
coordinator:
|
||||
max-games-per-core: 500
|
||||
max-games-per-core: 100
|
||||
max-deviation-percent: 20
|
||||
rebalance-interval: 30s
|
||||
rebalance-min-interval: 60s
|
||||
|
||||
+89
-50
@@ -10,6 +10,7 @@ import io.fabric8.kubernetes.client.KubernetesClient
|
||||
import io.micrometer.core.instrument.{Gauge, MeterRegistry}
|
||||
import io.quarkus.scheduler.Scheduled
|
||||
import org.jboss.logging.Logger
|
||||
import scala.jdk.CollectionConverters.*
|
||||
|
||||
import java.util.concurrent.atomic.AtomicReference
|
||||
import scala.compiletime.uninitialized
|
||||
@@ -44,8 +45,9 @@ class AutoScaler:
|
||||
if kubeClientInstance.isUnsatisfied then None
|
||||
else Some(kubeClientInstance.get())
|
||||
|
||||
private val argoApiVersion = "argoproj.io/v1alpha1"
|
||||
private val argoKind = "Rollout"
|
||||
private val argoApiVersion = "argoproj.io/v1alpha1"
|
||||
private val argoKind = "Rollout"
|
||||
private val metricsApiVersion = "metrics.k8s.io/v1beta1"
|
||||
|
||||
@PostConstruct
|
||||
def initMetrics(): Unit =
|
||||
@@ -70,6 +72,43 @@ class AutoScaler:
|
||||
}
|
||||
// scalafix:on DisableSyntax.asInstanceOf
|
||||
|
||||
// scalafix:off DisableSyntax.asInstanceOf
|
||||
private def isResourceConstrained(instanceId: String): Boolean =
|
||||
kubeClientOpt.fold(false) { kube =>
|
||||
try
|
||||
val pods =
|
||||
kube.pods().inNamespace(config.k8sNamespace).withLabel(config.k8sRolloutLabelSelector).list().getItems.asScala
|
||||
pods.find(_.getMetadata.getName.contains(instanceId)).exists { pod =>
|
||||
try
|
||||
val metricsRes = kube
|
||||
.genericKubernetesResources(metricsApiVersion, "PodMetrics")
|
||||
.inNamespace(config.k8sNamespace)
|
||||
.withName(pod.getMetadata.getName)
|
||||
.get()
|
||||
val metricsMap = metricsRes.asInstanceOf[java.util.Map[String, AnyRef]]
|
||||
Option(metricsMap.get("metrics"))
|
||||
.map(_.asInstanceOf[java.util.Map[String, AnyRef]])
|
||||
.flatMap(m => Option(m.get("containers")).map(_.asInstanceOf[java.util.List[AnyRef]]))
|
||||
.filter(!_.isEmpty)
|
||||
.map(_.get(0).asInstanceOf[java.util.Map[String, AnyRef]])
|
||||
.flatMap(c => Option(c.get("usage")).map(_.asInstanceOf[java.util.Map[String, AnyRef]]))
|
||||
.flatMap(u => Option(u.get("cpu")))
|
||||
.map(_.toString)
|
||||
.exists { cpuStr =>
|
||||
val cpuMillis =
|
||||
if cpuStr.endsWith("m") then cpuStr.dropRight(1).toLongOption.getOrElse(0L)
|
||||
else cpuStr.toLongOption.map(_ * 1000).getOrElse(0L)
|
||||
cpuMillis > 800
|
||||
}
|
||||
catch case _: Exception => false
|
||||
}
|
||||
catch
|
||||
case ex: Exception =>
|
||||
log.debugf(ex, "Failed to check resource metrics for %s", instanceId)
|
||||
false
|
||||
}
|
||||
// scalafix:on DisableSyntax.asInstanceOf
|
||||
|
||||
def checkAndScale: Unit =
|
||||
if config.autoScaleEnabled then
|
||||
val now = System.currentTimeMillis()
|
||||
@@ -80,8 +119,11 @@ class AutoScaler:
|
||||
val avgLoad = instances.map(_.subscriptionCount).sum.toDouble / instances.size
|
||||
avgLoadRef.set(avgLoad)
|
||||
|
||||
if avgLoad > config.scaleUpThreshold * config.maxGamesPerCore then scaleUp()
|
||||
else if avgLoad < config.scaleDownThreshold * config.maxGamesPerCore then scaleDown()
|
||||
val hasHighCpuOrMemory = instances.exists(inst => isResourceConstrained(inst.instanceId))
|
||||
|
||||
if avgLoad > config.scaleUpThreshold * config.maxGamesPerCore || hasHighCpuOrMemory then scaleUp()
|
||||
else if avgLoad < config.scaleDownThreshold * config.maxGamesPerCore && instances.size > config.scaleMinReplicas
|
||||
then scaleDown()
|
||||
|
||||
def scaleUp(): Unit =
|
||||
log.info("Scaling up Argo Rollout")
|
||||
@@ -134,56 +176,53 @@ class AutoScaler:
|
||||
.minByOption(_.subscriptionCount)
|
||||
|
||||
underloadedInstance.foreach { inst =>
|
||||
log.infof("Draining instance %s before scale-down", inst.instanceId)
|
||||
log.infof("Marking instance %s for drain before scale-down", inst.instanceId)
|
||||
failoverService
|
||||
.onInstanceStreamDropped(inst.instanceId)
|
||||
.subscribe()
|
||||
.`with`(
|
||||
_ =>
|
||||
kubeClientOpt match
|
||||
case None =>
|
||||
log.warn("Kubernetes client not available, cannot scale")
|
||||
case Some(kube) =>
|
||||
try
|
||||
Option(
|
||||
kube
|
||||
.genericKubernetesResources(argoApiVersion, argoKind)
|
||||
.inNamespace(config.k8sNamespace)
|
||||
.withName(config.k8sRolloutName)
|
||||
.get(),
|
||||
).foreach { rollout =>
|
||||
rolloutSpec(rollout).foreach { spec =>
|
||||
spec.get("replicas") match
|
||||
case replicas: Integer =>
|
||||
val currentReplicas = replicas.intValue()
|
||||
val minReplicas = config.scaleMinReplicas
|
||||
|
||||
if currentReplicas > minReplicas then
|
||||
spec.put("replicas", Integer.valueOf(currentReplicas - 1))
|
||||
kube
|
||||
.genericKubernetesResources(argoApiVersion, argoKind)
|
||||
.inNamespace(config.k8sNamespace)
|
||||
.resource(rollout)
|
||||
.update()
|
||||
meterRegistry.counter("nowchess.coordinator.scale.events", "direction", "down").increment()
|
||||
log.infof(
|
||||
"Scaled down %s from %d to %d replicas",
|
||||
config.k8sRolloutName,
|
||||
currentReplicas,
|
||||
currentReplicas - 1,
|
||||
)
|
||||
else log.infof("Already at min replicas %d for %s", minReplicas, config.k8sRolloutName)
|
||||
case _ => ()
|
||||
}
|
||||
}
|
||||
catch
|
||||
case ex: Exception =>
|
||||
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
|
||||
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName),
|
||||
ex =>
|
||||
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
|
||||
log.warnf(ex, "Failed to drain instance %s before scale-down", inst.instanceId),
|
||||
_ => log.debugf("Instance %s drained for scale-down", inst.instanceId),
|
||||
ex => log.warnf(ex, "Drain failed for %s, proceeding with scale-down", inst.instanceId),
|
||||
)
|
||||
}
|
||||
|
||||
if underloadedInstance.isEmpty then log.warn("No healthy instances found for scale-down")
|
||||
kubeClientOpt match
|
||||
case None =>
|
||||
log.warn("Kubernetes client not available, cannot scale")
|
||||
case Some(kube) =>
|
||||
try
|
||||
Option(
|
||||
kube
|
||||
.genericKubernetesResources(argoApiVersion, argoKind)
|
||||
.inNamespace(config.k8sNamespace)
|
||||
.withName(config.k8sRolloutName)
|
||||
.get(),
|
||||
).foreach { rollout =>
|
||||
rolloutSpec(rollout).foreach { spec =>
|
||||
spec.get("replicas") match
|
||||
case replicas: Integer =>
|
||||
val currentReplicas = replicas.intValue()
|
||||
val minReplicas = config.scaleMinReplicas
|
||||
|
||||
if currentReplicas > minReplicas then
|
||||
spec.put("replicas", Integer.valueOf(currentReplicas - 1))
|
||||
kube
|
||||
.genericKubernetesResources(argoApiVersion, argoKind)
|
||||
.inNamespace(config.k8sNamespace)
|
||||
.resource(rollout)
|
||||
.update()
|
||||
meterRegistry.counter("nowchess.coordinator.scale.events", "direction", "down").increment()
|
||||
log.infof(
|
||||
"Scaled down %s from %d to %d replicas",
|
||||
config.k8sRolloutName,
|
||||
currentReplicas,
|
||||
currentReplicas - 1,
|
||||
)
|
||||
else log.infof("Already at min replicas %d for %s", minReplicas, config.k8sRolloutName)
|
||||
case _ => ()
|
||||
}
|
||||
}
|
||||
catch
|
||||
case ex: Exception =>
|
||||
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
|
||||
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName)
|
||||
|
||||
+17
-5
@@ -45,6 +45,9 @@ class HealthMonitor:
|
||||
@Inject
|
||||
private var failoverService: FailoverService = uninitialized
|
||||
|
||||
@Inject
|
||||
private var autoScaler: AutoScaler = uninitialized
|
||||
|
||||
private val log = Logger.getLogger(classOf[HealthMonitor])
|
||||
private var redisPrefix = "nowchess"
|
||||
// scalafix:on DisableSyntax.var
|
||||
@@ -85,14 +88,18 @@ class HealthMonitor:
|
||||
if evicted.nonEmpty then
|
||||
log.warnf("Evicted %d stale instances: %s", evicted.size, evicted.mkString(", "))
|
||||
evicted.foreach(deleteK8sPod)
|
||||
autoScaler.scaleUp()
|
||||
val instances = instanceRegistry.getAllInstances
|
||||
instances.foreach { inst =>
|
||||
val failed = instances.collect { inst =>
|
||||
val isHealthy = checkHealth(inst.instanceId)
|
||||
if !isHealthy && inst.state == "HEALTHY" then
|
||||
log.warnf("Instance %s marked unhealthy", inst.instanceId)
|
||||
instanceRegistry.markInstanceDead(inst.instanceId)
|
||||
deleteK8sPod(inst.instanceId)
|
||||
}
|
||||
Some(inst.instanceId)
|
||||
else None
|
||||
}.flatten
|
||||
if failed.nonEmpty then autoScaler.scaleUp()
|
||||
|
||||
private def checkHealth(instanceId: String): Boolean =
|
||||
val redisHealthy = checkRedisHeartbeat(instanceId)
|
||||
@@ -178,14 +185,19 @@ class HealthMonitor:
|
||||
pods.find(pod => pod.getMetadata.getName.contains(instanceId)) match
|
||||
case Some(pod) =>
|
||||
val podName = pod.getMetadata.getName
|
||||
kube.pods().inNamespace(config.k8sNamespace).withName(podName).delete()
|
||||
kube.pods().inNamespace(config.k8sNamespace).withName(podName).withGracePeriod(0L).delete()
|
||||
meterRegistry.counter("nowchess.coordinator.pods.deleted").increment()
|
||||
log.infof("Deleted pod %s for dead instance %s", podName, instanceId)
|
||||
log.infof("Force-deleted pod %s for dead instance %s", podName, instanceId)
|
||||
case None =>
|
||||
log.debugf("No pod found for instance %s, skipping deletion", instanceId)
|
||||
catch
|
||||
case ex: Exception =>
|
||||
log.warnf(ex, "Failed to delete pod for instance %s", instanceId)
|
||||
log.warnf(
|
||||
ex,
|
||||
"Failed to delete pod for instance %s — removing from registry to prevent blocking scale-down",
|
||||
instanceId,
|
||||
)
|
||||
instanceRegistry.removeInstance(instanceId)
|
||||
|
||||
private def validateStartupInstances(timeoutMs: Long): Unit =
|
||||
Thread.sleep(timeoutMs)
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
MAJOR=0
|
||||
MINOR=21
|
||||
MINOR=23
|
||||
PATCH=0
|
||||
|
||||
Reference in New Issue
Block a user