From 3437dab49b2cc3f7b7e726febb07ad759e878079 Mon Sep 17 00:00:00 2001
From: Janis <janis-e@gmx.de>
Date: Tue, 30 Jun 2026 12:18:25 +0200
Subject: [PATCH] feat(bot): add Lazy SMP parallel search for the NNUE bot

Adds optional multithreaded search behind a thread count that defaults to
1, so the live bot's play is unchanged until explicitly configured.

- ParallelSearch runs N AlphaBetaSearch workers over one shared,
  already-lock-protected TranspositionTable. Each worker has its own NNUE
  evaluator (independent accumulator) and ordering state; helpers only
  deepen the shared TT, and only the main worker's move is returned.
- AlphaBetaSearch gains bestMoveWithTimeSharedTt: the coordinator clears
  the shared TT once before launching workers, so helpers must not clear.
- EvaluationNNUE.freshEvaluator builds independent evaluators sharing the
  immutable weights (one per thread); the singleton still backs the
  default single-instance path.
- NNUEBot uses ParallelSearch with NNUE_SEARCH_THREADS (default 1).

numThreads <= 1 takes the single-worker clearing path, identical to the
previous sequential search. Strength can be validated by self-play
(threads N vs 1) before promoting the default.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../scala/de/nowchess/bot/bots/NNUEBot.scala  |  8 ++-
 .../bot/bots/nnue/EvaluationNNUE.scala        | 33 ++++++++++-
 .../nowchess/bot/logic/AlphaBetaSearch.scala  | 18 +++++-
 .../nowchess/bot/logic/ParallelSearch.scala   | 56 +++++++++++++++++++
 .../de/nowchess/bot/ParallelSearchTest.scala  | 28 ++++++++++
 5 files changed, 135 insertions(+), 8 deletions(-)
 create mode 100644 modules/official-bots/src/main/scala/de/nowchess/bot/logic/ParallelSearch.scala
 create mode 100644 modules/official-bots/src/test/scala/de/nowchess/bot/ParallelSearchTest.scala

diff --git a/modules/official-bots/src/main/scala/de/nowchess/bot/bots/NNUEBot.scala b/modules/official-bots/src/main/scala/de/nowchess/bot/bots/NNUEBot.scala
index 1faa1bf..75747c8 100644
--- a/modules/official-bots/src/main/scala/de/nowchess/bot/bots/NNUEBot.scala
+++ b/modules/official-bots/src/main/scala/de/nowchess/bot/bots/NNUEBot.scala
@@ -5,19 +5,23 @@ import de.nowchess.api.game.GameContext
 import de.nowchess.api.move.Move
 import de.nowchess.api.rules.RuleSet
 import de.nowchess.bot.bots.nnue.EvaluationNNUE
-import de.nowchess.bot.logic.AlphaBetaSearch
+import de.nowchess.bot.logic.{ParallelSearch, TranspositionTable}
 import de.nowchess.bot.util.{PolyglotBook, ZobristHash}
 import de.nowchess.bot.{BotDifficulty, BotMoveRepetition}
 import de.nowchess.rules.sets.DefaultRules
 
 object NNUEBot:
+  // Worker count from NNUE_SEARCH_THREADS; an unset, non-numeric or < 1 value falls back to a single thread.
+  private def defaultThreads: Int = sys.env.get("NNUE_SEARCH_THREADS").flatMap(_.toIntOption).fold(1)(math.max(1, _))
+
   def apply(
       difficulty: BotDifficulty,
       rules: RuleSet = DefaultRules,
       book: Option[PolyglotBook] = None,
       fixedMoveTimeMs: Option[Long] = None,
+      searchThreads: Int = defaultThreads,
   ): Bot =
-    val search = AlphaBetaSearch(rules, weights = EvaluationNNUE)
+    val search = ParallelSearch(rules, TranspositionTable(), () => EvaluationNNUE.freshEvaluator(), searchThreads)
     context =>
       val blockedMoves = BotMoveRepetition.blockedMoves(context)
       book
diff --git a/modules/official-bots/src/main/scala/de/nowchess/bot/bots/nnue/EvaluationNNUE.scala b/modules/official-bots/src/main/scala/de/nowchess/bot/bots/nnue/EvaluationNNUE.scala
index e45ae3a..ad9212d 100644
--- a/modules/official-bots/src/main/scala/de/nowchess/bot/bots/nnue/EvaluationNNUE.scala
+++ b/modules/official-bots/src/main/scala/de/nowchess/bot/bots/nnue/EvaluationNNUE.scala
@@ -4,9 +4,11 @@ import de.nowchess.api.game.GameContext
 import de.nowchess.api.move.Move
 import de.nowchess.bot.ai.Evaluation
 
-object EvaluationNNUE extends Evaluation:
-
-  private val nnue = NNUE(NbaiLoader.loadDefault())
+/** One independent NNUE evaluator: wraps its own [[NNUE]] (own accumulator stack, scratch buffers and eval cache) plus
+  * the endgame mop-up correction. Independent instances may run concurrently as long as they share only the read-only
+  * [[NNUEWeights]].
+  */
+final class NNUEEvaluator(nnue: NNUE) extends Evaluation:
 
   val CHECKMATE_SCORE: Int = 10_000_000
   val DRAW_SCORE: Int      = 0
@@ -29,3 +31,28 @@ object EvaluationNNUE extends Evaluation:
 
   override def evaluateAccumulator(ply: Int, context: GameContext, hash: Long): Int =
     nnue.evaluateAtPlyWithValidation(ply, context.turn, hash, context.board) + MopUp.score(context)
+
+/** Singleton [[Evaluation]] forwarding to one shared [[NNUEEvaluator]], plus a factory for per-thread evaluators. */
+object EvaluationNNUE extends Evaluation:
+  // The default weights are loaded once here; the singleton and every fresh evaluator are built from this same value.
+  private val weights = NNUEWeights(NbaiLoader.loadDefault())
+  private val default = NNUEEvaluator(NNUE(weights))
+
+  /** Returns a new [[NNUEEvaluator]] wrapping its own [[NNUE]] built from the shared `weights` — meant to be created
+    * once per search thread, so that threads do not go through the single `default` instance used below. */
+  def freshEvaluator(): Evaluation = NNUEEvaluator(NNUE(weights))
+  // Everything from here on forwards unchanged to the `default` evaluator.
+  val CHECKMATE_SCORE: Int = default.CHECKMATE_SCORE
+  val DRAW_SCORE: Int      = default.DRAW_SCORE
+
+  def evaluate(context: GameContext): Int = default.evaluate(context)
+
+  override def initAccumulator(context: GameContext): Unit = default.initAccumulator(context)
+
+  override def copyAccumulator(parentPly: Int, childPly: Int): Unit = default.copyAccumulator(parentPly, childPly)
+
+  override def pushAccumulator(childPly: Int, move: Move, parent: GameContext, child: GameContext): Unit =
+    default.pushAccumulator(childPly, move, parent, child)
+
+  override def evaluateAccumulator(ply: Int, context: GameContext, hash: Long): Int =
+    default.evaluateAccumulator(ply, context, hash)
diff --git a/modules/official-bots/src/main/scala/de/nowchess/bot/logic/AlphaBetaSearch.scala b/modules/official-bots/src/main/scala/de/nowchess/bot/logic/AlphaBetaSearch.scala
index 3f7fefa..bc6b208 100644
--- a/modules/official-bots/src/main/scala/de/nowchess/bot/logic/AlphaBetaSearch.scala
+++ b/modules/official-bots/src/main/scala/de/nowchess/bot/logic/AlphaBetaSearch.scala
@@ -95,7 +95,7 @@ final class AlphaBetaSearch(
     bestMoveWithTime(context, timeBudgetMs, Set.empty)
 
   def bestMoveWithTime(context: GameContext, timeBudgetMs: Long, excludedRootMoves: Set[Move]): Option[Move] =
-    doTimedSearch(context, timeBudgetMs, excludedRootMoves, Map.empty)
+    doTimedSearch(context, timeBudgetMs, excludedRootMoves, Map.empty, clearTt = true)
 
   def bestMoveWithTime(
       context: GameContext,
@@ -103,15 +103,27 @@ final class AlphaBetaSearch(
       excludedRootMoves: Set[Move],
       hints: Map[Move, Int],
   ): Option[Move] =
-    doTimedSearch(context, timeBudgetMs, excludedRootMoves, hints)
+    doTimedSearch(context, timeBudgetMs, excludedRootMoves, hints, clearTt = true)
+
+  /** Timed search for Lazy SMP workers: same as `bestMoveWithTime` with hints, except that `tt` is not cleared here.
+    * The caller must clear the shared table once before starting the workers; `doTimedSearch` still clears this
+    * instance's `ordering` and re-initialises the evaluator accumulator from `context`. */
+  def bestMoveWithTimeSharedTt(
+      context: GameContext,
+      timeBudgetMs: Long,
+      excludedRootMoves: Set[Move],
+      hints: Map[Move, Int],
+  ): Option[Move] =
+    doTimedSearch(context, timeBudgetMs, excludedRootMoves, hints, clearTt = false)
 
   private def doTimedSearch(
       context: GameContext,
       timeBudgetMs: Long,
       excludedRootMoves: Set[Move],
       hints: Map[Move, Int],
+      clearTt: Boolean,
   ): Option[Move] =
-    tt.clear()
+    if clearTt then tt.clear()
     ordering.clear()
     weights.initAccumulator(context)
     timeStartMs.set(System.currentTimeMillis)
diff --git a/modules/official-bots/src/main/scala/de/nowchess/bot/logic/ParallelSearch.scala b/modules/official-bots/src/main/scala/de/nowchess/bot/logic/ParallelSearch.scala
new file mode 100644
index 0000000..e446bb5
--- /dev/null
+++ b/modules/official-bots/src/main/scala/de/nowchess/bot/logic/ParallelSearch.scala
@@ -0,0 +1,56 @@
+package de.nowchess.bot.logic
+
+import de.nowchess.api.game.GameContext
+import de.nowchess.api.move.Move
+import de.nowchess.api.rules.RuleSet
+import de.nowchess.bot.ai.Evaluation
+import de.nowchess.rules.sets.DefaultRules
+
+import java.util.concurrent.{Callable, Executors}
+import scala.jdk.CollectionConverters.*
+
+/** Lazy SMP search coordinator.
+  *
+  * Runs `numThreads` (at least one) [[AlphaBetaSearch]] workers over one shared transposition table for the same
+  * time budget. Every worker gets its own evaluator from `evalFactory`, but all of them use the same table, which
+  * therefore has to be safe for concurrent access. Only the first worker's move is returned; the remaining workers
+  * merely enrich the table. A single worker goes through the ordinary, table-clearing [[AlphaBetaSearch]] entry point.
+  */
+final class ParallelSearch(
+    rules: RuleSet = DefaultRules,
+    tt: TranspositionTable = TranspositionTable(),
+    evalFactory: () => Evaluation,
+    numThreads: Int = 1,
+):
+
+  private val threadCount = math.max(1, numThreads)
+  private val workers     = Vector.fill(threadCount)(AlphaBetaSearch(rules, tt, evalFactory()))
+
+  /** Searches `context` with a time budget of `timeBudgetMs` ms and returns the first worker's result. */
+  def bestMoveWithTime(
+      context: GameContext,
+      timeBudgetMs: Long,
+      excludedRootMoves: Set[Move] = Set.empty,
+      hints: Map[Move, Int] = Map.empty,
+  ): Option[Move] =
+    if threadCount == 1 then workers.head.bestMoveWithTime(context, timeBudgetMs, excludedRootMoves, hints)
+    else runParallel(context, timeBudgetMs, excludedRootMoves, hints)
+
+  /** Clears the shared table once, runs all workers on a temporary pool and waits for every one of them. If the first
+    * worker failed, its original exception is rethrown instead of the pool's `ExecutionException` wrapper, matching
+    * the single-worker path; failures of the other workers are ignored. */
+  private def runParallel(
+      context: GameContext,
+      timeBudgetMs: Long,
+      excludedRootMoves: Set[Move],
+      hints: Map[Move, Int],
+  ): Option[Move] =
+    tt.clear()
+    val pool = Executors.newFixedThreadPool(threadCount)
+    try
+      val tasks = workers.map[Callable[Option[Move]]] { worker => () =>
+        worker.bestMoveWithTimeSharedTt(context, timeBudgetMs, excludedRootMoves, hints)
+      }
+      pool.invokeAll(tasks.asJava).get(0).get()
+    catch case e: java.util.concurrent.ExecutionException if e.getCause != null => throw e.getCause
+    finally pool.shutdownNow()
diff --git a/modules/official-bots/src/test/scala/de/nowchess/bot/ParallelSearchTest.scala b/modules/official-bots/src/test/scala/de/nowchess/bot/ParallelSearchTest.scala
new file mode 100644
index 0000000..063606a
--- /dev/null
+++ b/modules/official-bots/src/test/scala/de/nowchess/bot/ParallelSearchTest.scala
@@ -0,0 +1,28 @@
+package de.nowchess.bot
+
+import de.nowchess.api.game.GameContext
+import de.nowchess.bot.bots.classic.EvaluationClassic
+import de.nowchess.bot.logic.{ParallelSearch, TranspositionTable}
+import de.nowchess.rules.sets.DefaultRules
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.matchers.should.Matchers
+
+class ParallelSearchTest extends AnyFunSuite with Matchers:
+  // Smoke tests: legality only. NOTE(review): all workers share one EvaluationClassic — confirm it keeps no state.
+  private def search(threads: Int): ParallelSearch =
+    ParallelSearch(DefaultRules, TranspositionTable(), () => EvaluationClassic, threads)
+  // A single worker takes the coordinator's direct delegation branch instead of the thread pool.
+  test("single-threaded coordinator returns a legal move on the initial position"):
+    val move = search(1).bestMoveWithTime(GameContext.initial, 200L)
+    move should not be None
+    DefaultRules.allLegalMoves(GameContext.initial) should contain(move.get)
+  // Four workers, five consecutive searches on one coordinator, so the same worker instances are reused each time.
+  test("multi-threaded Lazy SMP returns a legal move and does not crash under concurrency"):
+    val parallel = search(4)
+    for _ <- 1 to 5 do
+      val move = parallel.bestMoveWithTime(GameContext.initial, 200L)
+      move should not be None
+      DefaultRules.allLegalMoves(GameContext.initial) should contain(move.get)
+  // The coordinator raises a thread count of 0 to 1, so a move must still be produced.
+  test("numThreads below one is clamped to a single worker"):
+    search(0).bestMoveWithTime(GameContext.initial, 100L) should not be None