feat(analytics): add Spark batch analytics module

New standalone modules:analytics submodule with two Spark jobs: - OpeningBookJob: reads game_records.pgn, extracts first N plies using pure Catalyst SQL expressions (no UDFs), aggregates win/draw/loss rates per opening sequence, writes Parquet + CSV top-1000 summary. - PlayerStatsJob: unions each game into a player-centric view, aggregates total_games/wins/losses/draws/avg_move_count/win_rate per player_id, writes Parquet. Module uses Scala 3 calling spark-sql_2.13 via JVM binary compatibility (DataFrame API only; no spark.implicits._ / typed Datasets). Spark is compileOnly; the fat jar bundles only scala3-library + postgresql driver. Submit via spark-submit; see build.gradle.kts header for invocation. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-15 21:58:05 +02:00
parent 0a5a216032
commit 259b3bbb24
5 changed files with 271 additions and 0 deletions
@@ -53,6 +53,8 @@ val coverageExclusions = listOf(
    "**/core/src/main/scala/de/nowchess/chess/resource/GameWebSocketResource.scala",
    // Coordinator infrastructure — gRPC, microservice orchestration
    "**/coordinator/src/main/scala/**",
+    // Analytics module — standalone Spark batch jobs; coverage not applicable (no Quarkus, no scoverage plugin)
+    "modules/analytics/**",
 )

 // Converts a Sonar-style glob to a scoverage regex (matched against full source path).
@@ -0,0 +1,86 @@
+// Standalone Spark batch-analytics module.
+//
+// Spark 3.5.x ships only Scala 2.12/2.13 artifacts; Scala 3 code can consume
+// them via JVM binary compatibility so long as we avoid macro-expanded APIs
+// (spark.implicits._, typed Dataset[T]).  We use the untyped DataFrame API
+// exclusively, which is safe to call from Scala 3.
+//
+// Spark is declared compileOnly — the cluster provides it at runtime via
+// spark-submit.  Only the PostgreSQL driver and the Scala 3 runtime are
+// bundled into the fat jar produced by the "jar" task.
+//
+// Build the submission jar:
+//   ./gradlew :modules:analytics:jar
+//
+// Run a job:
+//   spark-submit \
+//     --class de.nowchess.analytics.OpeningBookJob \
+//     modules/analytics/build/libs/analytics-<version>.jar \
+//     [outputDir] [maxPlies]
+//
+// Environment variables consumed:
+//   NOWCHESS_JDBC_URL  (default: jdbc:postgresql://localhost:5432/nowchess)
+//   NOWCHESS_DB_USER   (default: nowchess)
+//   NOWCHESS_DB_PASS   (default: nowchess)
+
+plugins {
+    id("scala")
+    application
+}
+
+group = "de.nowchess"
+version = "1.0-SNAPSHOT"
+
+@Suppress("UNCHECKED_CAST")
+val versions = rootProject.extra["VERSIONS"] as Map<String, String>
+
+repositories {
+    mavenCentral()
+}
+
+scala {
+    scalaVersion = versions["SCALA3"]!!
+}
+
+val sparkVersion = "3.5.4"
+
+dependencies {
+    compileOnly("org.scala-lang:scala3-compiler_3") {
+        version { strictly(versions["SCALA3"]!!) }
+    }
+    implementation("org.scala-lang:scala3-library_3") {
+        version { strictly(versions["SCALA3"]!!) }
+    }
+    implementation("org.scala-lang:scala-library") {
+        version { strictly(versions["SCALA_LIBRARY"]!!) }
+    }
+
+    // Spark is provided by the cluster — compile-only, not bundled.
+    compileOnly("org.apache.spark:spark-sql_2.13:$sparkVersion") {
+        exclude(group = "org.slf4j", module = "slf4j-log4j12")
+    }
+    compileOnly("org.apache.spark:spark-core_2.13:$sparkVersion") {
+        exclude(group = "org.slf4j", module = "slf4j-log4j12")
+    }
+
+    // PostgreSQL JDBC driver bundled so it is available on executor classpath.
+    implementation("org.postgresql:postgresql:42.7.4")
+}
+
+application {
+    mainClass.set("de.nowchess.analytics.OpeningBookJob")
+}
+
+// Fat jar: includes runtimeClasspath (our code + pg driver + scala3-library)
+// but NOT compileOnly Spark jars.
+tasks.jar {
+    manifest {
+        attributes["Main-Class"] = "de.nowchess.analytics.OpeningBookJob"
+    }
+    from(configurations.runtimeClasspath.get().map { if (it.isDirectory) it else zipTree(it) })
+    duplicatesStrategy = DuplicatesStrategy.EXCLUDE
+}
+
+tasks.withType<ScalaCompile> {
+    scalaCompileOptions.additionalParameters = listOf("-encoding", "UTF-8")
+}
@@ -0,0 +1,97 @@
+package de.nowchess.analytics
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions as F
+
+/** Reads completed games from the game_records table and produces an opening-book statistics table: for each unique
+  * opening (first N plies), it reports total games played and win/draw/loss rates from each side.
+  *
+  * Output is written as Parquet to `outputDir/opening_book` and a human-readable CSV summary (top-1000 openings by
+  * popularity) to `outputDir/opening_book_top1000`.
+  *
+  * PGN parsing is done entirely with Spark SQL string functions — no UDFs — so the Catalyst optimizer can push
+  * predicates and the job scales to any cluster size.
+  */
+object OpeningBookJob:
+
+  def main(args: Array[String]): Unit =
+    val jdbcUrl   = sys.env.getOrElse("NOWCHESS_JDBC_URL", "jdbc:postgresql://localhost:5432/nowchess")
+    val dbUser    = sys.env.getOrElse("NOWCHESS_DB_USER", "nowchess")
+    val dbPass    = sys.env.getOrElse("NOWCHESS_DB_PASS", "nowchess")
+    val outputDir = if args.length > 0 then args(0) else "/tmp/nowchess-opening-book"
+    val maxPlies  = if args.length > 1 then args(1).toInt else 10
+
+    val spark = SparkSession
+      .builder()
+      .appName("NowChess Opening Book Generator")
+      .getOrCreate()
+
+    run(spark, jdbcUrl, dbUser, dbPass, outputDir, maxPlies)
+    spark.stop()
+
+  def run(
+      spark: SparkSession,
+      jdbcUrl: String,
+      dbUser: String,
+      dbPass: String,
+      outputDir: String,
+      maxPlies: Int,
+  ): Unit =
+    val games = spark.read
+      .format("jdbc")
+      .option("url", jdbcUrl)
+      .option("dbtable", "game_records")
+      .option("user", dbUser)
+      .option("password", dbPass)
+      .option("driver", "org.postgresql.Driver")
+      .option("fetchsize", "10000")
+      .load()
+      .select("pgn", "result")
+      .filter(F.col("result").isNotNull.and(F.col("pgn").isNotNull))
+
+    val openingCol = extractOpening(F.col("pgn"), maxPlies)
+
+    val withOpening = games
+      .withColumn("opening", openingCol)
+      .filter(F.col("opening").isNotNull.and(F.length(F.col("opening")) > 0))
+
+    val stats = withOpening
+      .groupBy("opening")
+      .agg(
+        F.count("*").as("total"),
+        F.sum(F.when(F.col("result") === "white", 1).otherwise(0)).as("white_wins"),
+        F.sum(F.when(F.col("result") === "black", 1).otherwise(0)).as("black_wins"),
+        F.sum(F.when(F.col("result") === "draw", 1).otherwise(0)).as("draws"),
+      )
+      .withColumn("white_win_rate", F.round(F.col("white_wins") / F.col("total").cast("double"), 3))
+      .withColumn("black_win_rate", F.round(F.col("black_wins") / F.col("total").cast("double"), 3))
+      .withColumn("draw_rate", F.round(F.col("draws") / F.col("total").cast("double"), 3))
+      .orderBy(F.desc("total"))
+
+    stats.write
+      .mode("overwrite")
+      .parquet(s"$outputDir/opening_book")
+
+    stats
+      .limit(1000)
+      .write
+      .mode("overwrite")
+      .option("header", "true")
+      .csv(s"$outputDir/opening_book_top1000")
+
+  /** Extracts the first `maxPlies` moves from a PGN column as a space-separated string.
+    *
+    * PGN format produced by PgnExporter: [Event "?"]\n[White "?"]\n...\n\n1. e4 e5 2. Nf3 Nc6 *
+    *
+    * Steps:
+    *   1. Split on double-newline; take the moves section (index 1). 2. Strip the terminal result token (*, 1-0, 0-1,
+    *      1/2-1/2). 3. Strip move numbers (e.g., "1. ", "12. "). 4. Strip check/checkmate suffixes (+ #) for
+    *      position-independent lookup. 5. Tokenize on whitespace, take first maxPlies tokens, rejoin with spaces.
+    */
+  private def extractOpening(pgnCol: org.apache.spark.sql.Column, maxPlies: Int): org.apache.spark.sql.Column =
+    val moveSection   = F.coalesce(F.split(pgnCol, "\n\n").getItem(1), pgnCol)
+    val noResult      = F.regexp_replace(moveSection, "(1-0|0-1|1/2-1/2|\\*)\\s*$", "")
+    val noMoveNumbers = F.regexp_replace(noResult, "\\d+\\.+\\s*", " ")
+    val noAnnotations = F.regexp_replace(noMoveNumbers, "[+#]", "")
+    val moveArray     = F.split(F.trim(noAnnotations), "\\s+")
+    F.array_join(F.slice(moveArray, 1, maxPlies), " ")
@@ -0,0 +1,85 @@
+package de.nowchess.analytics
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions as F
+
+/** Aggregates per-player statistics from completed games.
+  *
+  * Each game contributes one row per player (as white and as black), so the dataset is first unioned into a
+  * player-centric view before grouping. Output columns: player_id, total_games, wins, losses, draws, games_as_white,
+  * games_as_black, avg_move_count, win_rate
+  *
+  * Output is written as Parquet to `outputDir/player_stats`.
+  */
+object PlayerStatsJob:
+
+  def main(args: Array[String]): Unit =
+    val jdbcUrl   = sys.env.getOrElse("NOWCHESS_JDBC_URL", "jdbc:postgresql://localhost:5432/nowchess")
+    val dbUser    = sys.env.getOrElse("NOWCHESS_DB_USER", "nowchess")
+    val dbPass    = sys.env.getOrElse("NOWCHESS_DB_PASS", "nowchess")
+    val outputDir = if args.length > 0 then args(0) else "/tmp/nowchess-player-stats"
+
+    val spark = SparkSession
+      .builder()
+      .appName("NowChess Player Stats")
+      .getOrCreate()
+
+    run(spark, jdbcUrl, dbUser, dbPass, outputDir)
+    spark.stop()
+
+  def run(
+      spark: SparkSession,
+      jdbcUrl: String,
+      dbUser: String,
+      dbPass: String,
+      outputDir: String,
+  ): Unit =
+    val games = spark.read
+      .format("jdbc")
+      .option("url", jdbcUrl)
+      .option("dbtable", "game_records")
+      .option("user", dbUser)
+      .option("password", dbPass)
+      .option("driver", "org.postgresql.Driver")
+      .option("fetchsize", "10000")
+      .load()
+      .select("white_id", "black_id", "result", "move_count")
+      .filter(F.col("result").isNotNull)
+
+    // Flatten each game into two rows: one per player, tagged with their side.
+    val asWhite = games.select(
+      F.col("white_id").as("player_id"),
+      F.col("result"),
+      F.col("move_count"),
+      F.lit("white").as("color"),
+    )
+    val asBlack = games.select(
+      F.col("black_id").as("player_id"),
+      F.col("result"),
+      F.col("move_count"),
+      F.lit("black").as("color"),
+    )
+
+    val playerGames = asWhite.union(asBlack)
+
+    val wonGame = F.col("color") === F.col("result")
+    val lostGame = (F.col("color") === "white" && F.col("result") === "black")
+      .or(F.col("color") === "black" && F.col("result") === "white")
+
+    val stats = playerGames
+      .groupBy("player_id")
+      .agg(
+        F.count("*").as("total_games"),
+        F.sum(F.when(wonGame, 1).otherwise(0)).as("wins"),
+        F.sum(F.when(lostGame, 1).otherwise(0)).as("losses"),
+        F.sum(F.when(F.col("result") === "draw", 1).otherwise(0)).as("draws"),
+        F.sum(F.when(F.col("color") === "white", 1).otherwise(0)).as("games_as_white"),
+        F.sum(F.when(F.col("color") === "black", 1).otherwise(0)).as("games_as_black"),
+        F.round(F.avg(F.col("move_count")), 1).as("avg_move_count"),
+      )
+      .withColumn("win_rate", F.round(F.col("wins") / F.col("total_games").cast("double"), 3))
+      .orderBy(F.desc("total_games"))
+
+    stats.write
+      .mode("overwrite")
+      .parquet(s"$outputDir/player_stats")
@@ -27,4 +27,5 @@ include(
    "modules:store",
    "modules:coordinator",
    "modules:tournament",
+    "modules:analytics",
 )