e1d80b9331
Three new Spark jobs demonstrating complementary Spark pillars: LiveDashboardJob (Structured Streaming): - Simulates NowChess game-over event stream via rate source - Watermarking (45 s late-data tolerance) - Tumbling 1-min windows → append-mode Parquet output - Sliding 5-min/1-min windows → update-mode console output - Checkpointing for exactly-once fault tolerance - Production wiring comments show Kafka / spark-redis swap-in PlayerClusteringJob (MLlib): - Derives 4 player features from game_records via JDBC - VectorAssembler + StandardScaler + KMeans inside a Pipeline - ClusteringEvaluator (silhouette score) to measure quality - Per-cluster archetype averages show what each tier represents PlayerGraphJob (GraphX): - Builds directed player graph (vertices=players, edges=games) - PageRank — identifies most influential/active players - ConnectedComponents — finds isolated player communities - Bridges GraphX RDD results back to DataFrames via explicit schema (avoids spark.implicits._ which breaks Scala 3 → Spark 2.13 interop) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
93 lines
2.9 KiB
Kotlin
93 lines
2.9 KiB
Kotlin
// Standalone Spark batch-analytics module.
|
|
//
|
|
// Spark 3.5.x ships only Scala 2.12/2.13 artifacts; Scala 3 code can consume
|
|
// them via JVM binary compatibility so long as we avoid macro-expanded APIs
|
|
// (spark.implicits._, typed Dataset[T]). We use the untyped DataFrame API
|
|
// exclusively, which is safe to call from Scala 3.
|
|
//
|
|
// Spark is declared compileOnly — the cluster provides it at runtime via
|
|
// spark-submit. Only the PostgreSQL driver and the Scala 3 runtime are
|
|
// bundled into the fat jar produced by the "jar" task.
|
|
//
|
|
// Build the submission jar:
|
|
// ./gradlew :modules:analytics:jar
|
|
//
|
|
// Run a job:
|
|
// spark-submit \
|
|
// --class de.nowchess.analytics.OpeningBookJob \
|
|
// modules/analytics/build/libs/analytics-<version>.jar \
|
|
// [outputDir] [maxPlies]
|
|
//
|
|
// Environment variables consumed:
|
|
// NOWCHESS_JDBC_URL (default: jdbc:postgresql://localhost:5432/nowchess)
|
|
// NOWCHESS_DB_USER (default: nowchess)
|
|
// NOWCHESS_DB_PASS (default: nowchess)
|
|
|
|
// Build plugins: Scala compilation plus the `application` plugin (main-class wiring).
plugins {
    id("scala")
    id("application")
}
|
|
|
|
// Maven coordinates of this module.
project.group = "de.nowchess"
project.version = "1.0-SNAPSHOT"
|
|
|
|
// Version table read from the root project's "VERSIONS" extra property.
// The cast is unchecked because extra properties are stored untyped.
@Suppress("UNCHECKED_CAST")
val versions: Map<String, String> =
    rootProject.extra["VERSIONS"] as Map<String, String>
|
|
|
|
// All dependencies are resolved from Maven Central.
repositories.mavenCentral()
|
|
|
|
// Scala version comes from the shared version table; `!!` fails the build
// immediately if the "SCALA3" key is missing.
scala {
    scalaVersion.set(versions["SCALA3"]!!)
}
|
|
|
|
// Spark release used for every org.apache.spark artifact declared below.
val sparkVersion: String = "3.5.4"
|
|
|
|
dependencies {
    // Scala artifacts are pinned with strict versions taken from the shared version table.
    val scala3Version = versions["SCALA3"]!!
    val scalaLibraryVersion = versions["SCALA_LIBRARY"]!!

    compileOnly("org.scala-lang:scala3-compiler_3") {
        version { strictly(scala3Version) }
    }
    implementation("org.scala-lang:scala3-library_3") {
        version { strictly(scala3Version) }
    }
    implementation("org.scala-lang:scala-library") {
        version { strictly(scalaLibraryVersion) }
    }

    // Spark is provided by the cluster — compile-only, not bundled.
    // Each module uses the Scala 2.13 artifact and drops the slf4j-log4j12 binding.
    for (sparkModule in listOf("sql", "core", "mllib", "graphx")) {
        compileOnly("org.apache.spark:spark-${sparkModule}_2.13:$sparkVersion") {
            exclude(group = "org.slf4j", module = "slf4j-log4j12")
        }
    }

    // PostgreSQL JDBC driver bundled so it is available on executor classpath.
    implementation("org.postgresql:postgresql:42.7.4")
}
|
|
|
|
// Default entry point used by the `application` plugin.
application.mainClass.set("de.nowchess.analytics.OpeningBookJob")
|
|
|
|
// Fat jar: includes runtimeClasspath (our code + pg driver + scala3-library)
// but NOT compileOnly Spark jars.
tasks.jar {
    manifest {
        attributes["Main-Class"] = "de.nowchess.analytics.OpeningBookJob"
    }

    // Declare the dependency explicitly so everything on the runtime classpath
    // is built/resolved before this task runs.
    dependsOn(configurations.runtimeClasspath)

    // The lambda defers dependency resolution until the task executes; calling
    // `.get()` directly here would resolve the configuration on every build
    // invocation during the configuration phase.
    from({
        configurations.runtimeClasspath.get().map { if (it.isDirectory) it else zipTree(it) }
    })

    // Unpacked jars can contain the same path more than once (e.g. META-INF
    // entries); keep the first occurrence instead of failing the build.
    duplicatesStrategy = DuplicatesStrategy.EXCLUDE
}
|
|
|
|
// Compile all Scala sources as UTF-8. `configureEach` applies the settings
// lazily, only to ScalaCompile tasks that are actually realized, instead of
// eagerly creating every matching task at configuration time.
tasks.withType<ScalaCompile>().configureEach {
    scalaCompileOptions.additionalParameters = listOf("-encoding", "UTF-8")
}
|