feat: add initialization metrics for various services

This commit is contained in:
2026-05-11 22:37:22 +02:00
parent fcc251f777
commit d438e97f32
11 changed files with 108 additions and 2 deletions
@@ -7,6 +7,7 @@ import de.nowchess.account.repository.{BotAccountRepository, OfficialBotAccountR
import io.micrometer.core.instrument.MeterRegistry import io.micrometer.core.instrument.MeterRegistry
import io.quarkus.elytron.security.common.BcryptUtil import io.quarkus.elytron.security.common.BcryptUtil
import io.smallrye.jwt.build.Jwt import io.smallrye.jwt.build.Jwt
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject import jakarta.inject.Inject
import jakarta.transaction.Transactional import jakarta.transaction.Transactional
@@ -35,6 +36,13 @@ class AccountService:
var meterRegistry: MeterRegistry = uninitialized var meterRegistry: MeterRegistry = uninitialized
// scalafix:on // scalafix:on
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.counter("nowchess.users.registrations", "result", "success").increment(0)
meterRegistry.counter("nowchess.users.registrations", "result", "failure").increment(0)
meterRegistry.counter("nowchess.auth.logins", "result", "success").increment(0)
meterRegistry.counter("nowchess.auth.logins", "result", "failure").increment(0)
@Transactional @Transactional
def register(req: RegisterRequest): Either[AccountError, UserAccount] = def register(req: RegisterRequest): Either[AccountError, UserAccount] =
log.infof("Registering user %s", req.username) log.infof("Registering user %s", req.username)
@@ -19,6 +19,7 @@ import de.nowchess.account.dto.{
import de.nowchess.account.error.ChallengeError import de.nowchess.account.error.ChallengeError
import de.nowchess.account.repository.{ChallengeRepository, UserAccountRepository} import de.nowchess.account.repository.{ChallengeRepository, UserAccountRepository}
import io.micrometer.core.instrument.MeterRegistry import io.micrometer.core.instrument.MeterRegistry
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject import jakarta.inject.Inject
import jakarta.transaction.Transactional import jakarta.transaction.Transactional
@@ -54,6 +55,12 @@ class ChallengeService:
var meterRegistry: MeterRegistry = uninitialized var meterRegistry: MeterRegistry = uninitialized
// scalafix:on // scalafix:on
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.counter("nowchess.challenges.created").increment(0)
meterRegistry.counter("nowchess.challenges.accepted").increment(0)
meterRegistry.counter("nowchess.challenges.declined").increment(0)
@Transactional @Transactional
def create(challengerId: UUID, destUsername: String, req: ChallengeRequest): Either[ChallengeError, Challenge] = def create(challengerId: UUID, destUsername: String, req: ChallengeRequest): Either[ChallengeError, Challenge] =
val result = createChallenge(challengerId, destUsername, req) val result = createChallenge(challengerId, destUsername, req)
@@ -45,6 +45,10 @@ class AutoScaler:
Gauge Gauge
.builder("nowchess.coordinator.load.average", avgLoadRef, _.get()) .builder("nowchess.coordinator.load.average", avgLoadRef, _.get())
.register(meterRegistry) .register(meterRegistry)
meterRegistry.counter("nowchess.coordinator.scale.events", "direction", "up").increment(0)
meterRegistry.counter("nowchess.coordinator.scale.events", "direction", "down").increment(0)
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "up").increment(0)
meterRegistry.counter("nowchess.coordinator.scale.failures", "direction", "down").increment(0)
() ()
// scalafix:off DisableSyntax.asInstanceOf // scalafix:off DisableSyntax.asInstanceOf
@@ -1,5 +1,6 @@
package de.nowchess.coordinator.service package de.nowchess.coordinator.service
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject import jakarta.inject.Inject
import io.quarkus.redis.datasource.RedisDataSource import io.quarkus.redis.datasource.RedisDataSource
@@ -11,6 +12,7 @@ import org.jboss.logging.Logger
import scala.compiletime.uninitialized import scala.compiletime.uninitialized
import scala.util.Try import scala.util.Try
import java.time.Instant import java.time.Instant
import java.util.concurrent.TimeUnit
import de.nowchess.coordinator.grpc.CoreGrpcClient import de.nowchess.coordinator.grpc.CoreGrpcClient
@ApplicationScoped @ApplicationScoped
@@ -41,6 +43,11 @@ class CacheEvictionManager:
def setRedisPrefix(prefix: String): Unit = def setRedisPrefix(prefix: String): Unit =
redisPrefix = prefix redisPrefix = prefix
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.timer("nowchess.coordinator.cache.eviction.duration").record(0L, TimeUnit.MILLISECONDS)
meterRegistry.counter("nowchess.coordinator.cache.evictions").increment(0)
def evictStaleGames: Unit = def evictStaleGames: Unit =
meterRegistry.timer("nowchess.coordinator.cache.eviction.duration").record((() => runEviction()): Runnable) meterRegistry.timer("nowchess.coordinator.cache.eviction.duration").record((() => runEviction()): Runnable)
@@ -1,5 +1,6 @@
package de.nowchess.coordinator.service package de.nowchess.coordinator.service
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import jakarta.enterprise.inject.Instance import jakarta.enterprise.inject.Instance
import jakarta.inject.Inject import jakarta.inject.Inject
@@ -42,16 +43,24 @@ class HealthMonitor:
def setRedisPrefix(prefix: String): Unit = def setRedisPrefix(prefix: String): Unit =
redisPrefix = prefix redisPrefix = prefix
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.counter("nowchess.coordinator.health.checks").increment(0)
meterRegistry.counter("nowchess.coordinator.pods.unhealthy").increment(0)
def checkInstanceHealth: Unit = def checkInstanceHealth: Unit =
meterRegistry.counter("nowchess.coordinator.health.checks").increment() meterRegistry.counter("nowchess.coordinator.health.checks").increment()
val evicted = instanceRegistry.evictStaleInstances(config.instanceDeadTimeout) val evicted = instanceRegistry.evictStaleInstances(config.instanceDeadTimeout)
if evicted.nonEmpty then log.warnf("Evicted %d stale instances: %s", evicted.size, evicted.mkString(", ")) if evicted.nonEmpty then
log.warnf("Evicted %d stale instances: %s", evicted.size, evicted.mkString(", "))
evicted.foreach(deleteK8sPod)
val instances = instanceRegistry.getAllInstances val instances = instanceRegistry.getAllInstances
instances.foreach { inst => instances.foreach { inst =>
val isHealthy = checkHealth(inst.instanceId) val isHealthy = checkHealth(inst.instanceId)
if !isHealthy && inst.state == "HEALTHY" then if !isHealthy && inst.state == "HEALTHY" then
log.warnf("Instance %s marked unhealthy", inst.instanceId) log.warnf("Instance %s marked unhealthy", inst.instanceId)
instanceRegistry.markInstanceDead(inst.instanceId) instanceRegistry.markInstanceDead(inst.instanceId)
deleteK8sPod(inst.instanceId)
} }
private def checkHealth(instanceId: String): Boolean = private def checkHealth(instanceId: String): Boolean =
@@ -116,6 +125,7 @@ class HealthMonitor:
meterRegistry.counter("nowchess.coordinator.pods.unhealthy").increment() meterRegistry.counter("nowchess.coordinator.pods.unhealthy").increment()
log.warnf("Pod %s not ready, marking instance %s dead", pod.getMetadata.getName, inst.instanceId) log.warnf("Pod %s not ready, marking instance %s dead", pod.getMetadata.getName, inst.instanceId)
instanceRegistry.markInstanceDead(inst.instanceId) instanceRegistry.markInstanceDead(inst.instanceId)
deleteK8sPod(inst.instanceId)
case None => case None =>
log.warnf("No pod found for instance %s, evicting from registry", inst.instanceId) log.warnf("No pod found for instance %s, evicting from registry", inst.instanceId)
instanceRegistry.removeInstance(inst.instanceId) instanceRegistry.removeInstance(inst.instanceId)
@@ -128,3 +138,29 @@ class HealthMonitor:
Option(pod.getStatus) Option(pod.getStatus)
.flatMap(s => Option(s.getConditions)) .flatMap(s => Option(s.getConditions))
.exists(_.asScala.exists(cond => cond.getType == "Ready" && cond.getStatus == "True")) .exists(_.asScala.exists(cond => cond.getType == "Ready" && cond.getStatus == "True"))
private def deleteK8sPod(instanceId: String): Unit =
kubeClientOpt match
case None =>
log.debugf("Kubernetes client not available, skipping pod deletion for %s", instanceId)
case Some(kube) =>
try
val pods = kube
.pods()
.inNamespace(config.k8sNamespace)
.withLabel(config.k8sRolloutLabelSelector)
.list()
.getItems
.asScala
pods.find(pod => pod.getMetadata.getName.contains(instanceId)) match
case Some(pod) =>
val podName = pod.getMetadata.getName
kube.pods().inNamespace(config.k8sNamespace).withName(podName).delete()
meterRegistry.counter("nowchess.coordinator.pods.deleted").increment()
log.infof("Deleted pod %s for dead instance %s", podName, instanceId)
case None =>
log.debugf("No pod found for instance %s, skipping deletion", instanceId)
catch
case ex: Exception =>
log.warnf(ex, "Failed to delete pod for instance %s", instanceId)
@@ -34,6 +34,9 @@ class InstanceRegistry:
Gauge Gauge
.builder("nowchess.coordinator.instances.active", instances, m => m.size().toDouble) .builder("nowchess.coordinator.instances.active", instances, m => m.size().toDouble)
.register(meterRegistry) .register(meterRegistry)
meterRegistry.counter("nowchess.coordinator.instances.joined").increment(0)
meterRegistry.counter("nowchess.coordinator.instances.removed").increment(0)
meterRegistry.counter("nowchess.coordinator.instances.evicted").increment(0)
() ()
def setRedisPrefix(prefix: String): Unit = def setRedisPrefix(prefix: String): Unit =
@@ -48,6 +48,21 @@ class RedisGameRegistry extends GameRegistry:
Gauge Gauge
.builder("nowchess.games.active", GameEngine.activeGamesCount, _.get().toDouble) .builder("nowchess.games.active", GameEngine.activeGamesCount, _.get().toDouble)
.register(meterRegistry) .register(meterRegistry)
meterRegistry.counter("nowchess.games.cache.hits", "source", "local").increment(0)
meterRegistry.counter("nowchess.games.cache.hits", "source", "redis").increment(0)
meterRegistry.counter("nowchess.games.cache.hits", "source", "db").increment(0)
meterRegistry.counter("nowchess.games.cache.misses").increment(0)
meterRegistry.counter("nowchess.games.started").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "checkmate").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.agreement").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.fifty_move").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.threefold").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.stalemate").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "draw.insufficient").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "resignation").increment(0)
meterRegistry.counter("nowchess.games.completed", "result", "timeout").increment(0)
meterRegistry.counter("nowchess.moves.processed").increment(0)
meterRegistry.timer("nowchess.moves.duration").record(0L, java.util.concurrent.TimeUnit.MILLISECONDS)
() ()
private def cacheKey(gameId: String) = s"${redisConfig.prefix}:game:entry:$gameId" private def cacheKey(gameId: String) = s"${redisConfig.prefix}:game:entry:$gameId"
@@ -65,6 +65,7 @@ class InstanceHeartbeatService:
def onStart(@Observes event: StartupEvent): Unit = def onStart(@Observes event: StartupEvent): Unit =
Gauge.builder("nowchess.instance.subscriptions", subscriptionCount, _.get().toDouble).register(meterRegistry) Gauge.builder("nowchess.instance.subscriptions", subscriptionCount, _.get().toDouble).register(meterRegistry)
meterRegistry.counter("nowchess.heartbeat.failures").increment(0)
if coordinatorEnabled then if coordinatorEnabled then
try try
shuttingDown = false shuttingDown = false
@@ -9,11 +9,13 @@ import de.nowchess.io.fen.FenParser
import io.micrometer.core.instrument.MeterRegistry import io.micrometer.core.instrument.MeterRegistry
import io.quarkus.redis.datasource.RedisDataSource import io.quarkus.redis.datasource.RedisDataSource
import io.quarkus.runtime.StartupEvent import io.quarkus.runtime.StartupEvent
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import jakarta.enterprise.event.Observes import jakarta.enterprise.event.Observes
import jakarta.inject.Inject import jakarta.inject.Inject
import scala.compiletime.uninitialized import scala.compiletime.uninitialized
import java.util.function.Consumer import java.util.function.Consumer
import java.util.concurrent.TimeUnit
@ApplicationScoped @ApplicationScoped
class OfficialBotService: class OfficialBotService:
@@ -29,6 +31,13 @@ class OfficialBotService:
private val terminalStatuses = private val terminalStatuses =
Set("checkmate", "resign", "timeout", "stalemate", "insufficientMaterial", "draw") Set("checkmate", "resign", "timeout", "stalemate", "insufficientMaterial", "draw")
@PostConstruct
def initializeMetrics(): Unit =
BotController.listBots.foreach { bot =>
meterRegistry.timer("nowchess.bot.move.duration", "bot", bot).record(0L, TimeUnit.MILLISECONDS)
meterRegistry.counter("nowchess.bot.moves.computed", "bot", bot).increment(0)
}
def onStart(@Observes event: StartupEvent): Unit = def onStart(@Observes event: StartupEvent): Unit =
BotController.listBots.foreach(subscribeToEventChannel) BotController.listBots.foreach(subscribeToEventChannel)
@@ -4,6 +4,7 @@ import de.nowchess.api.dto.GameWritebackEventDto
import de.nowchess.store.domain.GameRecord import de.nowchess.store.domain.GameRecord
import de.nowchess.store.repository.GameRecordRepository import de.nowchess.store.repository.GameRecordRepository
import io.micrometer.core.instrument.{Counter, MeterRegistry, Timer} import io.micrometer.core.instrument.{Counter, MeterRegistry, Timer}
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import jakarta.inject.Inject import jakarta.inject.Inject
import jakarta.transaction.Transactional import jakarta.transaction.Transactional
@@ -20,7 +21,14 @@ class GameWritebackService:
@Inject @Inject
var meterRegistry: MeterRegistry = uninitialized var meterRegistry: MeterRegistry = uninitialized
// scalafix:on DisableSyntax.var // scalafix:on
@PostConstruct
def initializeMetrics(): Unit =
meterRegistry.timer("nowchess.store.writeback.duration").record(0L, java.util.concurrent.TimeUnit.MILLISECONDS)
meterRegistry.counter("nowchess.store.writeback.skipped").increment(0)
meterRegistry.counter("nowchess.store.games.written", "operation", "create").increment(0)
meterRegistry.counter("nowchess.store.games.written", "operation", "update").increment(0)
private lazy val writebackTimer: Timer = private lazy val writebackTimer: Timer =
meterRegistry.timer("nowchess.store.writeback.duration") meterRegistry.timer("nowchess.store.writeback.duration")
@@ -6,6 +6,7 @@ import io.quarkus.redis.datasource.RedisDataSource
import io.quarkus.redis.datasource.pubsub.PubSubCommands import io.quarkus.redis.datasource.pubsub.PubSubCommands
import io.quarkus.websockets.next.* import io.quarkus.websockets.next.*
import io.smallrye.jwt.auth.principal.JWTParser import io.smallrye.jwt.auth.principal.JWTParser
import jakarta.annotation.PostConstruct
import jakarta.inject.Inject import jakarta.inject.Inject
import org.jboss.logging.Logger import org.jboss.logging.Logger
import scala.compiletime.uninitialized import scala.compiletime.uninitialized
@@ -34,6 +35,13 @@ class GameWebSocketResource:
private val connections = new ConcurrentHashMap[String, ConnectionMeta]() private val connections = new ConcurrentHashMap[String, ConnectionMeta]()
@PostConstruct
def initializeMetrics(): Unit =
_ = connectionsOpened
_ = connectionsClosed
_ = messagesReceived
_ = activeGauge
private lazy val connectionsOpened: Counter = private lazy val connectionsOpened: Counter =
meterRegistry.counter("nowchess.ws.connections.opened") meterRegistry.counter("nowchess.ws.connections.opened")