feat: add initialization metrics for various services

This commit is contained in:
2026-05-11 22:37:22 +02:00
parent fcc251f777
commit d438e97f32
11 changed files with 108 additions and 2 deletions
@@ -7,6 +7,7 @@ import de.nowchess.account.repository.{BotAccountRepository, OfficialBotAccountR
import io.micrometer.core.instrument.MeterRegistry
import io.quarkus.elytron.security.common.BcryptUtil
import io.smallrye.jwt.build.Jwt
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject
import jakarta.transaction.Transactional
@@ -35,6 +36,13 @@ class AccountService:
var meterRegistry: MeterRegistry = uninitialized
// scalafix:on
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.counter("nowchess.users.registrations", "result", "success").increment(0)
meterRegistry.counter("nowchess.users.registrations", "result", "failure").increment(0)
meterRegistry.counter("nowchess.auth.logins", "result", "success").increment(0)
meterRegistry.counter("nowchess.auth.logins", "result", "failure").increment(0)
@Transactional
def register(req: RegisterRequest): Either[AccountError, UserAccount] =
log.infof("Registering user %s", req.username)
@@ -19,6 +19,7 @@ import de.nowchess.account.dto.{
import de.nowchess.account.error.ChallengeError
import de.nowchess.account.repository.{ChallengeRepository, UserAccountRepository}
import io.micrometer.core.instrument.MeterRegistry
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject
import jakarta.transaction.Transactional
@@ -54,6 +55,12 @@ class ChallengeService:
var meterRegistry: MeterRegistry = uninitialized
// scalafix:on
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.counter("nowchess.challenges.created").increment(0)
meterRegistry.counter("nowchess.challenges.accepted").increment(0)
meterRegistry.counter("nowchess.challenges.declined").increment(0)
@Transactional
def create(challengerId: UUID, destUsername: String, req: ChallengeRequest): Either[ChallengeError, Challenge] =
val result = createChallenge(challengerId, destUsername, req)
@@ -45,6 +45,10 @@ class AutoScaler:
Gauge
.builder("nowchess.coordinator.load.average", avgLoadRef, _.get())
.register(meterRegistry)
meterRegistry.counter("nowchess.coordinator.scale.events", "direction", "up").increment(0)
meterRegistry.counter("nowchess.coordinator.scale.events", "direction", "down").increment(0)
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "up").increment(0)
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment(0)
()
// scalafix:off DisableSyntax.asInstanceOf
@@ -1,5 +1,6 @@
package de.nowchess.coordinator.service
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject
import io.quarkus.redis.datasource.RedisDataSource
@@ -11,6 +12,7 @@ import org.jboss.logging.Logger
import scala.compiletime.uninitialized
import scala.util.Try
import java.time.Instant
import java.util.concurrent.TimeUnit
import de.nowchess.coordinator.grpc.CoreGrpcClient
@ApplicationScoped
@@ -41,6 +43,11 @@ class CacheEvictionManager:
def setRedisPrefix(prefix: String): Unit =
redisPrefix = prefix
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.timer("nowchess.coordinator.cache.eviction.duration").record(0L, TimeUnit.MILLISECONDS)
meterRegistry.counter("nowchess.coordinator.cache.evictions").increment(0)
def evictStaleGames: Unit =
meterRegistry.timer("nowchess.coordinator.cache.eviction.duration").record((() => runEviction()): Runnable)
@@ -1,5 +1,6 @@
package de.nowchess.coordinator.service
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped
import jakarta.enterprise.inject.Instance
import jakarta.inject.Inject
@@ -42,16 +43,24 @@ class HealthMonitor:
def setRedisPrefix(prefix: String): Unit =
redisPrefix = prefix
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.counter("nowchess.coordinator.health.checks").increment(0)
meterRegistry.counter("nowchess.coordinator.pods.unhealthy").increment(0)
def checkInstanceHealth: Unit =
meterRegistry.counter("nowchess.coordinator.health.checks").increment()
val evicted = instanceRegistry.evictStaleInstances(config.instanceDeadTimeout)
if evicted.nonEmpty then log.warnf("Evicted %d stale instances: %s", evicted.size, evicted.mkString(", "))
if evicted.nonEmpty then
log.warnf("Evicted %d stale instances: %s", evicted.size, evicted.mkString(", "))
evicted.foreach(deleteK8sPod)
val instances = instanceRegistry.getAllInstances
instances.foreach { inst =>
val isHealthy = checkHealth(inst.instanceId)
if !isHealthy && inst.state == "HEALTHY" then
log.warnf("Instance %s marked unhealthy", inst.instanceId)
instanceRegistry.markInstanceDead(inst.instanceId)
deleteK8sPod(inst.instanceId)
}
private def checkHealth(instanceId: String): Boolean =
@@ -116,6 +125,7 @@ class HealthMonitor:
meterRegistry.counter("nowchess.coordinator.pods.unhealthy").increment()
log.warnf("Pod %s not ready, marking instance %s dead", pod.getMetadata.getName, inst.instanceId)
instanceRegistry.markInstanceDead(inst.instanceId)
deleteK8sPod(inst.instanceId)
case None =>
log.warnf("No pod found for instance %s, evicting from registry", inst.instanceId)
instanceRegistry.removeInstance(inst.instanceId)
@@ -128,3 +138,29 @@ class HealthMonitor:
Option(pod.getStatus)
.flatMap(s => Option(s.getConditions))
.exists(_.asScala.exists(cond => cond.getType == "Ready" && cond.getStatus == "True"))
private def deleteK8sPod(instanceId: String): Unit =
kubeClientOpt match
case None =>
log.debugf("Kubernetes client not available, skipping pod deletion for %s", instanceId)
case Some(kube) =>
try
val pods = kube
.pods()
.inNamespace(config.k8sNamespace)
.withLabel(config.k8sRolloutLabelSelector)
.list()
.getItems
.asScala
pods.find(pod => pod.getMetadata.getName.contains(instanceId)) match
case Some(pod) =>
val podName = pod.getMetadata.getName
kube.pods().inNamespace(config.k8sNamespace).withName(podName).delete()
meterRegistry.counter("nowchess.coordinator.pods.deleted").increment()
log.infof("Deleted pod %s for dead instance %s", podName, instanceId)
case None =>
log.debugf("No pod found for instance %s, skipping deletion", instanceId)
catch
case ex: Exception =>
log.warnf(ex, "Failed to delete pod for instance %s", instanceId)
@@ -34,6 +34,9 @@ class InstanceRegistry:
Gauge
.builder("nowchess.coordinator.instances.active", instances, m => m.size().toDouble)
.register(meterRegistry)
meterRegistry.counter("nowchess.coordinator.instances.joined").increment(0)
meterRegistry.counter("nowchess.coordinator.instances.removed").increment(0)
meterRegistry.counter("nowchess.coordinator.instances.evicted").increment(0)
()
def setRedisPrefix(prefix: String): Unit =
@@ -48,6 +48,21 @@ class RedisGameRegistry extends GameRegistry:
Gauge
.builder("nowchess.games.active", GameEngine.activeGamesCount, _.get().toDouble)
.register(meterRegistry)
meterRegistry.counter("nowchess.games.cache.hits", "source", "local").increment(0)
meterRegistry.counter("nowchess.games.cache.hits", "source", "redis").increment(0)
meterRegistry.counter("nowchess.games.cache.hits", "source", "db").increment(0)
meterRegistry.counter("nowchess.games.cache.misses").increment(0)
meterRegistry.counter("nowchess.games.started").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "checkmate").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.agreement").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.fifty_move").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.threefold").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.stalemate").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.insufficient").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "resignation").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "timeout").increment(0)
meterRegistry.counter("nowchess.moves.processed").increment(0)
meterRegistry.timer("nowchess.moves.duration").record(0L, java.util.concurrent.TimeUnit.MILLISECONDS)
()
private def cacheKey(gameId: String) = s"${redisConfig.prefix}:game:entry:$gameId"
@@ -65,6 +65,7 @@ class InstanceHeartbeatService:
def onStart(@Observes event: StartupEvent): Unit =
Gauge.builder("nowchess.instance.subscriptions", subscriptionCount, _.get().toDouble).register(meterRegistry)
meterRegistry.counter("nowchess.heartbeat.failures").increment(0)
if coordinatorEnabled then
try
shuttingDown = false
@@ -9,11 +9,13 @@ import de.nowchess.io.fen.FenParser
import io.micrometer.core.instrument.MeterRegistry
import io.quarkus.redis.datasource.RedisDataSource
import io.quarkus.runtime.StartupEvent
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped
import jakarta.enterprise.event.Observes
import jakarta.inject.Inject
import scala.compiletime.uninitialized
import java.util.function.Consumer
import java.util.concurrent.TimeUnit
@ApplicationScoped
class OfficialBotService:
@@ -29,6 +31,13 @@ class OfficialBotService:
private val terminalStatuses =
Set("checkmate", "resign", "timeout", "stalemate", "insufficientMaterial", "draw")
@PostConstruct
def initializeMetrics(): Unit =
BotController.listBots.foreach { bot =>
meterRegistry.timer("nowchess.bot.move.duration", "bot", bot).record(0L, TimeUnit.MILLISECONDS)
meterRegistry.counter("nowchess.bot.moves.computed", "bot", bot).increment(0)
}
def onStart(@Observes event: StartupEvent): Unit =
BotController.listBots.foreach(subscribeToEventChannel)
@@ -4,6 +4,7 @@ import de.nowchess.api.dto.GameWritebackEventDto
import de.nowchess.store.domain.GameRecord
import de.nowchess.store.repository.GameRecordRepository
import io.micrometer.core.instrument.{Counter, MeterRegistry, Timer}
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject
import jakarta.transaction.Transactional
@@ -20,7 +21,14 @@ class GameWritebackService:
@Inject
var meterRegistry: MeterRegistry = uninitialized
// scalafix:on DisableSyntax.var
// scalafix:on
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.timer("nowchess.store.writeback.duration").record(0L, java.util.concurrent.TimeUnit.MILLISECONDS)
meterRegistry.counter("nowchess.store.writeback.skipped").increment(0)
meterRegistry.counter("nowchess.store.games.written", "operation", "create").increment(0)
meterRegistry.counter("nowchess.store.games.written", "operation", "update").increment(0)
private lazy val writebackTimer: Timer =
meterRegistry.timer("nowchess.store.writeback.duration")
@@ -6,6 +6,7 @@ import io.quarkus.redis.datasource.RedisDataSource
import io.quarkus.redis.datasource.pubsub.PubSubCommands
import io.quarkus.websockets.next.*
import io.smallrye.jwt.auth.principal.JWTParser
import jakarta.annotation.PostConstruct
import jakarta.inject.Inject
import org.jboss.logging.Logger
import scala.compiletime.uninitialized
@@ -34,6 +35,13 @@ class GameWebSocketResource:
private val connections = new ConcurrentHashMap[String, ConnectionMeta]()
@PostConstruct
def initializeMetrics(): Unit =
_ = connectionsOpened
_ = connectionsClosed
_ = messagesReceived
_ = activeGauge
private lazy val connectionsOpened: Counter =
meterRegistry.counter("nowchess.ws.connections.opened")