fix: don't block event loop during scale-down drain
Scale-down was calling failoverService.onInstanceStreamDropped synchronously and waiting for it to complete. Failover retries for up to 30s while waiting for healthy instances, which blocks the Quarkus event loop thread.

This caused:
- Event loop blocked for 15+ seconds
- Redis health checks timing out (they also run on the event loop)
- Scale-down operations failing

Fix: Trigger the drain asynchronously without waiting for it. Scale-down proceeds immediately while the drain happens in the background.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
+7
-10
@@ -134,12 +134,16 @@ class AutoScaler:
|
|||||||
.minByOption(_.subscriptionCount)
|
.minByOption(_.subscriptionCount)
|
||||||
|
|
||||||
underloadedInstance.foreach { inst =>
|
underloadedInstance.foreach { inst =>
|
||||||
log.infof("Draining instance %s before scale-down", inst.instanceId)
|
log.infof("Marking instance %s for drain before scale-down", inst.instanceId)
|
||||||
failoverService
|
failoverService
|
||||||
.onInstanceStreamDropped(inst.instanceId)
|
.onInstanceStreamDropped(inst.instanceId)
|
||||||
.subscribe()
|
.subscribe()
|
||||||
.`with`(
|
.`with`(
|
||||||
_ =>
|
_ => log.debugf("Instance %s drained for scale-down", inst.instanceId),
|
||||||
|
ex => log.warnf(ex, "Drain failed for %s, proceeding with scale-down", inst.instanceId),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
kubeClientOpt match
|
kubeClientOpt match
|
||||||
case None =>
|
case None =>
|
||||||
log.warn("Kubernetes client not available, cannot scale")
|
log.warn("Kubernetes client not available, cannot scale")
|
||||||
@@ -179,11 +183,4 @@ class AutoScaler:
|
|||||||
catch
|
catch
|
||||||
case ex: Exception =>
|
case ex: Exception =>
|
||||||
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
|
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
|
||||||
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName),
|
log.warnf(ex, "Failed to scale down %s", config.k8sRolloutName)
|
||||||
ex =>
|
|
||||||
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment()
|
|
||||||
log.warnf(ex, "Failed to drain instance %s before scale-down", inst.instanceId),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
if underloadedInstance.isEmpty then log.warn("No healthy instances found for scale-down")
|
|
||||||
|
|||||||
Reference in New Issue
Block a user