feat(analytics): always write results to PostgreSQL regardless of input source
Build & Test (NowChessSystems) TeamCity build failed

Remove isPgnMode JDBC guard from all 4 original jobs so staging (Lichess PGN mode)
and production (game_records JDBC mode) both persist analytics results to the DB.

Add JDBC write-back to all 7 new jobs:
- GameLengthJob → analytics_game_length_distribution + analytics_game_length_by_result
- ColorAdvantageJob → analytics_color_advantage
- EloDistributionJob → analytics_elo_distribution
- TimeControlJob → analytics_time_control_stats
- DailyActivityJob → analytics_hourly_activity + analytics_weekly_activity
- RatingMismatchJob → analytics_rating_mismatch
- TerminationStatsJob → analytics_termination_stats

Add analytics_component_sizes JDBC write to PlayerGraphJob.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Janis Eccarius
2026-06-21 15:35:34 +02:00
parent a6c600d6ce
commit da0e6d1ee2
11 changed files with 145 additions and 49 deletions
@@ -60,3 +60,13 @@ object ColorAdvantageJob:
.mode("overwrite")
.option("header", "true")
.csv(s"$outputDir/color_advantage")
stats.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_color_advantage")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
@@ -49,6 +49,16 @@ object DailyActivityJob:
.option("header", "true")
.csv(s"$outputDir/hourly_activity")
hourly.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_hourly_activity")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
val dayName = F
.when(F.col("dow") === 1, "Sunday")
.when(F.col("dow") === 2, "Monday")
@@ -77,3 +87,13 @@ object DailyActivityJob:
.mode("overwrite")
.option("header", "true")
.csv(s"$outputDir/weekly_activity")
weekly.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_weekly_activity")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
@@ -46,3 +46,13 @@ object EloDistributionJob:
.mode("overwrite")
.option("header", "true")
.csv(s"$outputDir/elo_distribution")
distribution.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_elo_distribution")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
@@ -76,6 +76,16 @@ object GameLengthJob:
.option("header", "true")
.csv(s"$outputDir/game_length_distribution")
distribution.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_game_length_distribution")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
val byResult = games
.groupBy("result")
.agg(
@@ -89,3 +99,13 @@ object GameLengthJob:
.mode("overwrite")
.option("header", "true")
.csv(s"$outputDir/game_length_by_result")
byResult.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_game_length_by_result")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
@@ -72,16 +72,15 @@ object OpeningBookJob:
.option("header", "true")
.csv(s"$outputDir/opening_book_top1000")
if !GameSource.isPgnMode then
top1000.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_opening_stats")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
top1000.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_opening_stats")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
/** Extracts the first `maxPlies` moves from a PGN column as a space-separated string.
*
@@ -119,26 +119,25 @@ object PlayerClusteringJob:
.option("header", "true")
.csv(s"$outputDir/cluster_archetypes")
if !GameSource.isPgnMode then
clustersDf.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_player_clusters")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
clustersDf.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_player_clusters")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
archetypes.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_cluster_archetypes")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
archetypes.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_cluster_archetypes")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
private def buildPlayerStats(games: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame =
val asWhite = games.select(
@@ -109,16 +109,15 @@ object PlayerGraphJob:
.mode("overwrite")
.parquet(s"$outputDir/player_graph")
if !GameSource.isPgnMode then
result.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_player_graph")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
result.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_player_graph")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
// How many players belong to each connected component?
// A large dominant component + many singletons is the expected shape.
@@ -135,6 +134,16 @@ object PlayerGraphJob:
.option("header", "true")
.csv(s"$outputDir/component_sizes")
componentSizes.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_component_sizes")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
// Build a two-column DataFrame (vertex_id: Long, valueCol: valueType) from an RDD.
// Used to bridge GraphX RDD results into the DataFrame API without implicits.
private def rddToFrame[T](
@@ -82,13 +82,12 @@ object PlayerStatsJob:
.option("header", "true")
.csv(s"$outputDir/player_stats_csv")
if !GameSource.isPgnMode then
stats.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_player_stats")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
stats.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_player_stats")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
@@ -63,3 +63,13 @@ object RatingMismatchJob:
.mode("overwrite")
.option("header", "true")
.csv(s"$outputDir/rating_mismatch")
stats.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_rating_mismatch")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
@@ -42,3 +42,13 @@ object TerminationStatsJob:
.mode("overwrite")
.option("header", "true")
.csv(s"$outputDir/termination_stats")
stats.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_termination_stats")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()
@@ -56,3 +56,13 @@ object TimeControlJob:
.mode("overwrite")
.option("header", "true")
.csv(s"$outputDir/time_control_stats")
stats.write
.mode("overwrite")
.format("jdbc")
.option("url", jdbcUrl)
.option("dbtable", "analytics_time_control_stats")
.option("user", dbUser)
.option("password", dbPass)
.option("driver", "org.postgresql.Driver")
.save()