refactor: clean up code formatting and improve readability across multiple files
@@ -15,4 +15,3 @@ class JacksonConfig extends ObjectMapperCustomizer:
new Version(2, 21, 1, null, "com.fasterxml.jackson.module", "jackson-module-scala")
// scalafix:on DisableSyntax.null
})

-1
@@ -11,4 +11,3 @@ import io.quarkus.runtime.annotations.RegisterForReflection
),
)
class NativeReflectionConfig

+2
@@ -28,7 +28,9 @@ class CoordinatorGrpcServer extends CoordinatorServiceGrpc.CoordinatorServiceImp
responseObserver: StreamObserver[CoordinatorCommand],
): StreamObserver[HeartbeatFrame] =
new StreamObserver[HeartbeatFrame]:
// scalafix:off DisableSyntax.var
private var lastInstanceId = ""
// scalafix:on DisableSyntax.var

override def onNext(frame: HeartbeatFrame): Unit =
lastInstanceId = frame.getInstanceId

+1
-3
@@ -16,9 +16,7 @@ class CoreGrpcClient:
private val channels = ConcurrentHashMap[String, ManagedChannel]()

private def getChannel(host: String, port: Int): ManagedChannel =
channels.computeIfAbsent(s"$host:$port", _ =>
ManagedChannelBuilder.forAddress(host, port).usePlaintext().build(),
)
channels.computeIfAbsent(s"$host:$port", _ => ManagedChannelBuilder.forAddress(host, port).usePlaintext().build())

private def evictStaleChannel(host: String, port: Int): Unit =
Option(channels.remove(s"$host:$port")).foreach(_.shutdownNow())

+1
-1
@@ -33,7 +33,7 @@ class CoordinatorResource:
@Path("/instances")
@Produces(Array(MediaType.APPLICATION_JSON))
def listInstances: java.util.List[InstanceMetadata] =
instanceRegistry.getAllInstances.asJava.asInstanceOf[java.util.List[InstanceMetadata]]
instanceRegistry.getAllInstances.asJava

@GET
@Path("/metrics")

+101
-74
@@ -1,6 +1,7 @@
package de.nowchess.coordinator.service

import jakarta.enterprise.context.ApplicationScoped
import jakarta.enterprise.inject.Instance
import jakarta.inject.Inject
import de.nowchess.coordinator.config.CoordinatorConfig
import io.fabric8.kubernetes.api.model.GenericKubernetesResource
@@ -11,98 +12,124 @@ import scala.compiletime.uninitialized

@ApplicationScoped
class AutoScaler:
// scalafix:off DisableSyntax.var
@Inject
private var kubeClient: KubernetesClient = null
private var kubeClientInstance: Instance[KubernetesClient] = uninitialized

@Inject
private var config: CoordinatorConfig = uninitialized

@Inject
private var instanceRegistry: InstanceRegistry = uninitialized
// scalafix:on DisableSyntax.var

private val log = Logger.getLogger(classOf[AutoScaler])
private val lastScaleTime = new java.util.concurrent.atomic.AtomicLong(0L)

private def kubeClientOpt: Option[KubernetesClient] =
if kubeClientInstance.isUnsatisfied then None
else Some(kubeClientInstance.get())

// scalafix:off DisableSyntax.asInstanceOf
// scalafix:off DisableSyntax.isInstanceOf
private def rolloutSpec(rollout: GenericKubernetesResource): Option[java.util.Map[String, AnyRef]] =
Option(rollout.get("spec")).collect {
case m if m.isInstanceOf[java.util.Map[?, ?]] => m.asInstanceOf[java.util.Map[String, AnyRef]]
}
// scalafix:on DisableSyntax.asInstanceOf
// scalafix:on DisableSyntax.isInstanceOf

def checkAndScale: Unit =
if !config.autoScaleEnabled then return
if config.autoScaleEnabled then
val now = System.currentTimeMillis()
val last = lastScaleTime.get()
if now - last >= 120000 && lastScaleTime.compareAndSet(last, now) then
val instances = instanceRegistry.getAllInstances.filter(_.state == "HEALTHY")
if instances.nonEmpty then
val avgLoad = instances.map(_.subscriptionCount).sum.toDouble / instances.size

val now = System.currentTimeMillis()
val last = lastScaleTime.get()
if now - last < 120000 then return
if !lastScaleTime.compareAndSet(last, now) then return

val instances = instanceRegistry.getAllInstances.filter(_.state == "HEALTHY")
if instances.isEmpty then return

val avgLoad = instances.map(_.subscriptionCount).sum.toDouble / instances.size

if avgLoad > config.scaleUpThreshold * config.maxGamesPerCore then scaleUp()
else if avgLoad < config.scaleDownThreshold * config.maxGamesPerCore && instances.size > config.scaleMinReplicas
then scaleDown()
if avgLoad > config.scaleUpThreshold * config.maxGamesPerCore then scaleUp()
else if avgLoad < config.scaleDownThreshold * config.maxGamesPerCore && instances.size > config.scaleMinReplicas
then scaleDown()

def scaleUp(): Unit =
log.info("Scaling up Argo Rollout")
if kubeClient == null then
log.warn("Kubernetes client not available, cannot scale")
return
kubeClientOpt match
case None =>
log.warn("Kubernetes client not available, cannot scale")
case Some(kube) =>
try
Option(
kube
.resources(classOf[GenericKubernetesResource])
.inNamespace(config.k8sNamespace)
.withName(config.k8sRolloutName)
.get(),
).foreach { rollout =>
rolloutSpec(rollout).foreach { spec =>
spec.get("replicas") match
case replicas: Integer =>
val currentReplicas = replicas.intValue()
val maxReplicas = config.scaleMaxReplicas

try
val rollout = kubeClient
.resources(classOf[GenericKubernetesResource])
.inNamespace(config.k8sNamespace)
.withName(config.k8sRolloutName)
.get()

if rollout != null then
val spec = rollout.get("spec").asInstanceOf[java.util.Map[String, Any]]
val currentReplicas = spec.get("replicas").asInstanceOf[Integer].intValue()
val maxReplicas = config.scaleMaxReplicas

if currentReplicas < maxReplicas then
spec.put("replicas", currentReplicas + 1)
kubeClient
.resources(classOf[GenericKubernetesResource])
.inNamespace(config.k8sNamespace)
.withName(config.k8sRolloutName)
.createOrReplace(rollout)
log.infof("Scaled up %s from %d to %d replicas", config.k8sRolloutName, currentReplicas, currentReplicas + 1)
else log.infof("Already at max replicas %d for %s", maxReplicas, config.k8sRolloutName)
catch
case ex: Exception =>
log.warnf(ex, "Failed to scale up %s", config.k8sRolloutName)
if currentReplicas < maxReplicas then
spec.put("replicas", String.valueOf(currentReplicas + 1))
kube
.resources(classOf[GenericKubernetesResource])
.inNamespace(config.k8sNamespace)
.withName(config.k8sRolloutName)
.update()
log.infof(
"Scaled up %s from %d to %d replicas",
config.k8sRolloutName,
currentReplicas,
currentReplicas + 1,
)
else log.infof("Already at max replicas %d for %s", maxReplicas, config.k8sRolloutName)
case _ => ()
}
}
catch
case ex: Exception =>
log.warnf(ex, "Failed to scale up %s", config.k8sRolloutName)

def scaleDown(): Unit =
log.info("Scaling down Argo Rollout")
if kubeClient == null then
log.warn("Kubernetes client not available, cannot scale")
return
kubeClientOpt match
case None =>
log.warn("Kubernetes client not available, cannot scale")
case Some(kube) =>
try
Option(
kube
.resources(classOf[GenericKubernetesResource])
.inNamespace(config.k8sNamespace)
.withName(config.k8sRolloutName)
.get(),
).foreach { rollout =>
rolloutSpec(rollout).foreach { spec =>
spec.get("replicas") match
case replicas: Integer =>
val currentReplicas = replicas.intValue()
val minReplicas = config.scaleMinReplicas

try
val rollout = kubeClient
.resources(classOf[GenericKubernetesResource])
.inNamespace(config.k8sNamespace)
.withName(config.k8sRolloutName)
.get()

if rollout != null then
val spec = rollout.get("spec").asInstanceOf[java.util.Map[String, Any]]
val currentReplicas = spec.get("replicas").asInstanceOf[Integer].intValue()
val minReplicas = config.scaleMinReplicas

if currentReplicas > minReplicas then
spec.put("replicas", currentReplicas - 1)
kubeClient
.resources(classOf[GenericKubernetesResource])
.inNamespace(config.k8sNamespace)
.withName(config.k8sRolloutName)
.createOrReplace(rollout)
log.infof(
"Scaled down %s from %d to %d replicas",
config.k8sRolloutName,
currentReplicas,
currentReplicas - 1,
)
else log.infof("Already at min replicas %d for %s", minReplicas, config.k8sRolloutName)
catch
case ex: Exception =>
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName)
if currentReplicas > minReplicas then
spec.put("replicas", String.valueOf(currentReplicas - 1))
kube
.resources(classOf[GenericKubernetesResource])
.inNamespace(config.k8sNamespace)
.withName(config.k8sRolloutName)
.update()
log.infof(
"Scaled down %s from %d to %d replicas",
config.k8sRolloutName,
currentReplicas,
currentReplicas - 1,
)
else log.infof("Already at min replicas %d for %s", minReplicas, config.k8sRolloutName)
case _ => ()
}
}
catch
case ex: Exception =>
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName)

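Note: the hunk above swaps the null-initialized @Inject field for an Instance[KubernetesClient] wrapper resolved through kubeClientOpt. A minimal standalone sketch of that pattern, assuming the same Jakarta CDI and fabric8 dependencies (the ExampleScaler class and describe method are illustrative, not from the commit):

import io.fabric8.kubernetes.client.KubernetesClient
import jakarta.enterprise.context.ApplicationScoped
import jakarta.enterprise.inject.Instance
import jakarta.inject.Inject
import scala.compiletime.uninitialized

@ApplicationScoped
class ExampleScaler:
  // Instance[T] lets this bean start even when no KubernetesClient bean is produced
  @Inject
  private var clientInstance: Instance[KubernetesClient] = uninitialized

  // Resolve lazily; None when the container has no satisfying bean
  private def clientOpt: Option[KubernetesClient] =
    if clientInstance.isUnsatisfied then None
    else Some(clientInstance.get())

  def describe(): String =
    clientOpt.fold("no cluster access")(c => s"connected to ${c.getMasterUrl}")
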
+16
-13
@@ -14,6 +14,7 @@ import de.nowchess.coordinator.grpc.CoreGrpcClient

@ApplicationScoped
class CacheEvictionManager:
// scalafix:off DisableSyntax.var
@Inject
private var redis: RedisDataSource = uninitialized

@@ -31,6 +32,7 @@ class CacheEvictionManager:

private val log = Logger.getLogger(classOf[CacheEvictionManager])
private var redisPrefix = "nowchess"
// scalafix:on DisableSyntax.var

def setRedisPrefix(prefix: String): Unit =
redisPrefix = prefix
@@ -38,44 +40,45 @@ class CacheEvictionManager:
def evictStaleGames: Unit =
log.info("Starting cache eviction scan")

val pattern = s"$redisPrefix:game:entry:*"
val keys = redis.key(classOf[String]).keys(pattern)

val pattern = s"$redisPrefix:game:entry:*"
val keys = redis.key(classOf[String]).keys(pattern)
val now = System.currentTimeMillis()
val idleThresholdMs = config.gameIdleThreshold.toMillis

var evictedCount = 0
keys.asScala.foreach { key =>
val evictedCount = keys.asScala.foldLeft(0) { (count, key) =>
try
val value = redis.value(classOf[String]).get(key)
if value != null then
Option(redis.value(classOf[String]).get(key)).fold(count) { value =>
val gameId = key.stripPrefix(s"$redisPrefix:game:entry:")
val lastUpdated = extractLastUpdatedTimestamp(value)

if lastUpdated > 0 && (now - lastUpdated) > idleThresholdMs then
findInstanceWithGame(gameId).foreach { instance =>
findInstanceWithGame(gameId).fold(count) { instance =>
try
coreGrpcClient.evictGames(instance.hostname, instance.grpcPort, List(gameId))
redis.key(classOf[String]).del(key)
evictedCount += 1
log.infof("Evicted idle game %s from %s", gameId, instance.instanceId)
count + 1
catch
case ex: Exception =>
log.warnf(ex, "Failed to evict game %s", gameId)
count
}
else count
}
catch
case ex: Exception =>
log.warnf(ex, "Error processing game key %s", key)
count
}

log.infof("Cache eviction scan completed, evicted %d games", evictedCount)

private def extractLastUpdatedTimestamp(json: String): Long =
Try {
val parsed = objectMapper.readTree(json)
val lastHeartbeat = parsed.get("lastHeartbeat")
if lastHeartbeat != null && lastHeartbeat.isTextual then Instant.parse(lastHeartbeat.asText()).toEpochMilli
else 0L
val parsed = objectMapper.readTree(json)
Option(parsed.get("lastHeartbeat"))
.filter(_.isTextual)
.fold(0L)(lh => Instant.parse(lh.asText()).toEpochMilli)
}.getOrElse(0L)

private def findInstanceWithGame(gameId: String): Option[de.nowchess.coordinator.dto.InstanceMetadata] =

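Note: the hunk above replaces the var counter mutated inside keys.asScala.foreach with keys.asScala.foldLeft(0), threading the eviction count through each iteration. A self-contained sketch of the same accumulation pattern (countEvicted and tryEvict are hypothetical names, not from the commit):

// Count how many keys were successfully evicted without a mutable counter.
def countEvicted(keys: List[String], tryEvict: String => Boolean): Int =
  keys.foldLeft(0) { (count, key) =>
    if tryEvict(key) then count + 1 // success: carry an incremented total forward
    else count                      // failure: total unchanged
  }

@main def demoCountEvicted(): Unit =
  val evicted = countEvicted(List("a", "b", "c"), _ != "b")
  println(s"evicted $evicted of 3") // prints: evicted 2 of 3
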
+30
-18
@@ -11,6 +11,7 @@ import de.nowchess.coordinator.grpc.CoreGrpcClient

@ApplicationScoped
class FailoverService:
// scalafix:off DisableSyntax.var
@Inject
private var redis: RedisDataSource = uninitialized

@@ -22,6 +23,7 @@ class FailoverService:

private val log = Logger.getLogger(classOf[FailoverService])
private var redisPrefix = "nowchess"
// scalafix:on DisableSyntax.var

def setRedisPrefix(prefix: String): Unit =
redisPrefix = prefix
@@ -58,32 +60,42 @@ class FailoverService:
healthyInstances: List[InstanceMetadata],
deadInstanceId: String,
): Unit =
if gameIds.isEmpty || healthyInstances.isEmpty then return
if gameIds.nonEmpty && healthyInstances.nonEmpty then
val batchSize = math.max(1, gameIds.size / healthyInstances.size)
val batches = gameIds.grouped(batchSize).toList

val batchSize = math.max(1, gameIds.size / healthyInstances.size)
val batches = gameIds.grouped(batchSize).toList
batches.zipWithIndex.foreach { case (batch, idx) =>
if !tryMigrateBatch(batch, idx, healthyInstances, deadInstanceId) then
log.errorf(
"Failed to migrate batch of %d games from %s to any healthy instance",
batch.size,
deadInstanceId,
)
}

batches.zipWithIndex.foreach { case (batch, idx) =>
var migrated = false
var attempt = 0
while !migrated && attempt < healthyInstances.size do
val target = healthyInstances((idx + attempt) % healthyInstances.size)
attempt += 1
@scala.annotation.tailrec
private def tryMigrateBatch(
batch: List[String],
batchIdx: Int,
instances: List[InstanceMetadata],
deadId: String,
attempt: Int = 0,
): Boolean =
if attempt >= instances.size then false
else
val target = instances((batchIdx + attempt) % instances.size)
val success =
try
val subscribed = coreGrpcClient.batchResubscribeGames(target.hostname, target.grpcPort, batch)
if subscribed > 0 then
log.infof("Migrated %d games from %s to %s", subscribed, deadInstanceId, target.instanceId)
migrated = true
log.infof("Migrated %d games from %s to %s", subscribed, deadId, target.instanceId)
true
else false
catch
case ex: Exception =>
log.warnf(ex, "Failed to migrate batch to %s, trying next", target.instanceId)
if !migrated then
log.errorf(
"Failed to migrate batch of %d games from %s to any healthy instance",
batch.size,
deadInstanceId,
)
}
false
if success then true else tryMigrateBatch(batch, batchIdx, instances, deadId, attempt + 1)

private def cleanupDeadInstance(instanceId: String): Unit =
val setKey = s"$redisPrefix:instance:$instanceId:games"

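Note: the hunk above lifts the while/var retry loop into a @tailrec helper (tryMigrateBatch) that recurses with attempt + 1 until some instance accepts the batch or every candidate has been tried. A small sketch of that retry shape (tryEach and migrate are illustrative stand-ins for the gRPC call in the commit):

import scala.annotation.tailrec

// Try each target in round-robin order starting at startIdx; true on first success.
@tailrec
def tryEach[A](targets: Vector[A], startIdx: Int, migrate: A => Boolean, attempt: Int = 0): Boolean =
  if attempt >= targets.size then false
  else
    val target = targets((startIdx + attempt) % targets.size)
    if migrate(target) then true
    else tryEach(targets, startIdx, migrate, attempt + 1)
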
+45
-44
@@ -1,6 +1,7 @@
package de.nowchess.coordinator.service

import jakarta.enterprise.context.ApplicationScoped
import jakarta.enterprise.inject.Instance
import jakarta.inject.Inject
import de.nowchess.coordinator.config.CoordinatorConfig
import io.fabric8.kubernetes.client.KubernetesClient
@@ -13,8 +14,9 @@ import java.time.Instant

@ApplicationScoped
class HealthMonitor:
// scalafix:off DisableSyntax.var
@Inject
private var kubeClient: KubernetesClient = null
private var kubeClientInstance: Instance[KubernetesClient] = uninitialized

@Inject
private var config: CoordinatorConfig = uninitialized
@@ -27,6 +29,11 @@ class HealthMonitor:

private val log = Logger.getLogger(classOf[HealthMonitor])
private var redisPrefix = "nowchess"
// scalafix:on DisableSyntax.var

private def kubeClientOpt: Option[KubernetesClient] =
if kubeClientInstance.isUnsatisfied then None
else Some(kubeClientInstance.get())

def setRedisPrefix(prefix: String): Unit =
redisPrefix = prefix
@@ -55,10 +62,9 @@ class HealthMonitor:
false

private def checkK8sPodStatus(instanceId: String): Boolean =
if kubeClient == null then true
else
kubeClientOpt.fold(true) { kube =>
try
val pods = kubeClient
val pods = kube
.pods()
.inNamespace(config.k8sNamespace)
.withLabel(config.k8sRolloutLabelSelector)
@@ -74,49 +80,44 @@ class HealthMonitor:
case ex: Exception =>
log.debugf(ex, "K8s pod status check failed for %s", instanceId)
true
}

def watchK8sPods: Unit =
if kubeClient == null then
log.debug("Kubernetes client not available for pod watch")
return
kubeClientOpt match
case None =>
log.debug("Kubernetes client not available for pod watch")
case Some(kube) =>
try
val pods = kube
.pods()
.inNamespace(config.k8sNamespace)
.withLabel(config.k8sRolloutLabelSelector)
.list()
.getItems
.asScala

try
val pods = kubeClient
.pods()
.inNamespace(config.k8sNamespace)
.withLabel(config.k8sRolloutLabelSelector)
.list()
.getItems
.asScala
val instances = instanceRegistry.getAllInstances
instances.foreach { inst =>
val matchingPod = pods.find { pod =>
pod.getMetadata.getName.contains(inst.instanceId)
}

val instances = instanceRegistry.getAllInstances
instances.foreach { inst =>
val matchingPod = pods.find { pod =>
pod.getMetadata.getName.contains(inst.instanceId)
}

matchingPod match
case Some(pod) =>
val isReady = isPodReady(pod)
if !isReady && inst.state == "HEALTHY" then
log.warnf("Pod %s not ready, marking instance %s dead", pod.getMetadata.getName, inst.instanceId)
instanceRegistry.markInstanceDead(inst.instanceId)
case None =>
if inst.state == "HEALTHY" then
log.warnf("No pod found for instance %s, marking dead", inst.instanceId)
instanceRegistry.markInstanceDead(inst.instanceId)
}
catch
case ex: Exception =>
log.warnf(ex, "Failed to watch k8s pods")
matchingPod match
case Some(pod) =>
val isReady = isPodReady(pod)
if !isReady && inst.state == "HEALTHY" then
log.warnf("Pod %s not ready, marking instance %s dead", pod.getMetadata.getName, inst.instanceId)
instanceRegistry.markInstanceDead(inst.instanceId)
case None =>
if inst.state == "HEALTHY" then
log.warnf("No pod found for instance %s, marking dead", inst.instanceId)
instanceRegistry.markInstanceDead(inst.instanceId)
}
catch
case ex: Exception =>
log.warnf(ex, "Failed to watch k8s pods")

private def isPodReady(pod: Pod): Boolean =
val status = pod.getStatus
if status == null then false
else
val conditions = status.getConditions
if conditions == null then false
else
conditions.asScala.exists { cond =>
cond.getType == "Ready" && cond.getStatus == "True"
}
Option(pod.getStatus)
.flatMap(s => Option(s.getConditions))
.exists(_.asScala.exists(cond => cond.getType == "Ready" && cond.getStatus == "True"))

+8
-6
@@ -11,12 +11,14 @@ import java.util.concurrent.ConcurrentHashMap

@ApplicationScoped
class InstanceRegistry:
// scalafix:off DisableSyntax.var
@Inject
private var redis: RedisDataSource = uninitialized
private var redisPrefix = "nowchess"
// scalafix:on DisableSyntax.var

private val mapper = ObjectMapper()
private val instances = ConcurrentHashMap[String, InstanceMetadata]()
private var redisPrefix = "nowchess"
private val mapper = ObjectMapper()
private val instances = ConcurrentHashMap[String, InstanceMetadata]()

def setRedisPrefix(prefix: String): Unit =
redisPrefix = prefix
@@ -28,13 +30,13 @@ class InstanceRegistry:
instances.values.asScala.toList

def updateInstanceFromRedis(instanceId: String): Unit =
val key = s"$redisPrefix:instances:$instanceId"
val value = redis.value(classOf[String]).get(key)
if value != null then
val key = s"$redisPrefix:instances:$instanceId"
Option(redis.value(classOf[String]).get(key)).foreach { value =>
try
val metadata = mapper.readValue(value, classOf[InstanceMetadata])
instances.put(instanceId, metadata)
catch case _: Exception => ()
}

def markInstanceDead(instanceId: String): Unit =
instances.computeIfPresent(instanceId, (_, inst) => inst.copy(state = "DEAD"))

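Note: the hunk above replaces the `if value != null then` guard on the Redis read with Option(...).foreach, so an absent key simply skips the block and a bad payload is swallowed as before. The same idiom in isolation (storeIfPresent and its parameters are hypothetical, standing in for the Redis get and Jackson readValue calls):

import scala.util.Try

// Option(...) turns a possibly-null Java result into None/Some, replacing the
// explicit null check; Try swallows a parse failure like the original catch clause.
def storeIfPresent(lookup: String => String, parse: String => Int, sink: Int => Unit)(key: String): Unit =
  Option(lookup(key)).foreach { raw =>
    Try(parse(raw)).foreach(sink)
  }
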
+41
-42
@@ -12,6 +12,7 @@ import de.nowchess.coordinator.grpc.CoreGrpcClient

@ApplicationScoped
class LoadBalancer:
// scalafix:off DisableSyntax.var
@Inject
private var config: CoordinatorConfig = uninitialized

@@ -27,6 +28,7 @@ class LoadBalancer:
private val log = Logger.getLogger(classOf[LoadBalancer])
private val lastRebalanceTime = new java.util.concurrent.atomic.AtomicLong(0L)
private var redisPrefix = "nowchess"
// scalafix:on DisableSyntax.var

def setRedisPrefix(prefix: String): Unit =
redisPrefix = prefix
@@ -34,22 +36,22 @@ class LoadBalancer:
def shouldRebalance: Boolean =
val now = System.currentTimeMillis()
val minInterval = config.rebalanceMinInterval.toMillis
if now - lastRebalanceTime.get() < minInterval then return false
if now - lastRebalanceTime.get() < minInterval then false
else
val instances = instanceRegistry.getAllInstances
if instances.isEmpty then false
else
val loads = instances.map(_.subscriptionCount)
val maxLoad = loads.max
val minLoad = loads.min
val avgLoad = loads.sum.toDouble / loads.size

val instances = instanceRegistry.getAllInstances
if instances.isEmpty then return false
val exceededMax = maxLoad > config.maxGamesPerCore
val deviationPercent = 100.0 * (maxLoad - avgLoad) / avgLoad
val exceededDeviation =
maxLoad > avgLoad && deviationPercent > config.maxDeviationPercent && (maxLoad - minLoad) > 50

val loads = instances.map(_.subscriptionCount)
val maxLoad = loads.max
val minLoad = loads.min
val avgLoad = loads.sum.toDouble / loads.size

val exceededMax = maxLoad > config.maxGamesPerCore
val deviationPercent = 100.0 * (maxLoad - avgLoad) / avgLoad
val exceededDeviation =
maxLoad > avgLoad && deviationPercent > config.maxDeviationPercent && (maxLoad - minLoad) > 50

exceededMax || exceededDeviation
exceededMax || exceededDeviation

def rebalance: Unit =
log.info("Starting rebalance")
@@ -59,34 +61,32 @@ class LoadBalancer:
try
val instances = instanceRegistry.getAllInstances.filter(_.state == "HEALTHY")

if instances.size < 2 then
log.info("Not enough healthy instances for rebalance")
return
if instances.size < 2 then log.info("Not enough healthy instances for rebalance")
else
val loads = instances.map(_.subscriptionCount)
val avgLoad = loads.sum.toDouble / loads.size

val loads = instances.map(_.subscriptionCount)
val avgLoad = loads.sum.toDouble / loads.size
val overloaded = instances
.filter(_.subscriptionCount > config.maxGamesPerCore)
.sortBy[Int](_.subscriptionCount)
.reverse
val underloaded = instances
.filter(_.subscriptionCount < avgLoad * 0.8)
.sortBy(_.subscriptionCount)

val overloaded = instances
.filter(_.subscriptionCount > config.maxGamesPerCore)
.sortBy[Int](_.subscriptionCount)
.reverse
val underloaded = instances
.filter(_.subscriptionCount < avgLoad * 0.8)
.sortBy(_.subscriptionCount)
if underloaded.isEmpty then log.info("No underloaded instances available for rebalance")
else
val allBatches = overloaded.flatMap { over =>
val excess = math.max(0, over.subscriptionCount - avgLoad.toInt)
val gamesToMove = getGamesToMove(over.instanceId, excess)
if gamesToMove.isEmpty then List.empty
else
val batchSize = math.max(1, (gamesToMove.size + underloaded.size - 1) / underloaded.size)
gamesToMove.grouped(batchSize).toList.map((over, _))
}

if underloaded.isEmpty then
log.info("No underloaded instances available for rebalance")
return

var targetIdx = 0
overloaded.foreach { over =>
val excess = math.max(0, over.subscriptionCount - avgLoad.toInt)
val gamesToMove = getGamesToMove(over.instanceId, excess)
if gamesToMove.nonEmpty then
val batchSize = math.max(1, (gamesToMove.size + underloaded.size - 1) / underloaded.size)
gamesToMove.grouped(batchSize).foreach { batch =>
val target = underloaded(targetIdx % underloaded.size)
targetIdx += 1
allBatches.zipWithIndex.foreach { case ((over, batch), idx) =>
val target = underloaded(idx % underloaded.size)
try
coreGrpcClient.unsubscribeGames(over.hostname, over.grpcPort, batch)
val subscribed = coreGrpcClient.batchResubscribeGames(target.hostname, target.grpcPort, batch)
@@ -97,10 +97,9 @@ class LoadBalancer:
case ex: Exception =>
log.warnf(ex, "Failed to move games from %s to %s", over.instanceId, target.instanceId)
}
}

val elapsed = System.currentTimeMillis() - startTime
log.infof("Rebalance completed in %dms", elapsed)
val elapsed = System.currentTimeMillis() - startTime
log.infof("Rebalance completed in %dms", elapsed)
catch
case ex: Exception =>
log.warnf(ex, "Rebalance failed")

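Note: the hunk above precomputes allBatches with flatMap and then pairs each batch with a target via zipWithIndex, instead of bumping a mutable targetIdx inside nested loops. A compact sketch of that assignment scheme (assignBatches and its parameter names are illustrative, not from the commit):

// Assign each (source, batch) pair to a target chosen round-robin by position,
// replacing the mutable index counter; assumes targets is non-empty.
def assignBatches[S, T](
    batchesPerSource: List[(S, List[List[String]])],
    targets: Vector[T],
): List[(S, List[String], T)] =
  val allBatches = batchesPerSource.flatMap { case (src, batches) => batches.map(b => (src, b)) }
  allBatches.zipWithIndex.map { case ((src, batch), idx) =>
    (src, batch, targets(idx % targets.size))
  }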