feat(analytics): add PostgreSQL JDBC write-back to all four batch jobs

Each batch job now writes its results to a Postgres table in addition to the existing Parquet/CSV output:

- OpeningBookJob → analytics_opening_stats
- PlayerStatsJob → analytics_player_stats
- PlayerClusteringJob → analytics_player_clusters + analytics_cluster_archetypes
- PlayerGraphJob → analytics_player_graph

In PlayerClusteringJob, MLlib Vector columns are excluded from the JDBC write by reusing the already-selected scalar DataFrame (clustersDf) for both the Parquet output and the JDBC output.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-15 22:35:30 +02:00
parent 95215b6a42
commit 0e0ea4c989
4 changed files with 56 additions and 6 deletions
@@ -115,9 +115,9 @@ object PlayerClusteringJob:

    archetypes.show(20, false)

-    predictions
-      .select("player_id", "total_games", "win_rate", "avg_move_count", "cluster")
-      .write
+    val clustersDf = predictions.select("player_id", "total_games", "win_rate", "avg_move_count", "cluster")
+
+    clustersDf.write
      .mode("overwrite")
      .parquet(s"$outputDir/player_clusters")

@@ -126,6 +126,26 @@ object PlayerClusteringJob:
      .option("header", "true")
      .csv(s"$outputDir/cluster_archetypes")

+    clustersDf.write
+      .mode("overwrite")
+      .format("jdbc")
+      .option("url", jdbcUrl)
+      .option("dbtable", "analytics_player_clusters")
+      .option("user", dbUser)
+      .option("password", dbPass)
+      .option("driver", "org.postgresql.Driver")
+      .save()
+
+    archetypes.write
+      .mode("overwrite")
+      .format("jdbc")
+      .option("url", jdbcUrl)
+      .option("dbtable", "analytics_cluster_archetypes")
+      .option("user", dbUser)
+      .option("password", dbPass)
+      .option("driver", "org.postgresql.Driver")
+      .save()
+
  private def buildPlayerStats(games: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame =
    val asWhite = games.select(
      F.col("white_id").as("player_id"),