feat(bot): add Lazy SMP parallel search for the NNUE bot

Adds optional multithreaded search behind a thread count that defaults to 1, so the live bot's play is unchanged until explicitly configured.

- ParallelSearch runs N AlphaBetaSearch workers over one shared, already-lock-protected TranspositionTable. Each worker has its own NNUE evaluator (independent accumulator) and ordering state; helpers only deepen the shared TT, and only the main worker's move is returned.
- AlphaBetaSearch gains bestMoveWithTimeSharedTt: the coordinator clears the shared TT once before launching the workers, so no worker may clear it itself.
- EvaluationNNUE.freshEvaluator builds independent evaluators sharing the immutable weights (one per thread); the singleton still backs the default single-instance path.
- NNUEBot uses ParallelSearch with NNUE_SEARCH_THREADS (default 1).

numThreads <= 1 takes the single-worker clearing path, identical to the previous sequential search. Strength can be validated by self-play (threads N vs 1) before promoting the default.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 12:18:25 +02:00
parent b72e8ec017
commit 3437dab49b
5 changed files with 135 additions and 8 deletions
@@ -5,19 +5,23 @@ import de.nowchess.api.game.GameContext
 import de.nowchess.api.move.Move
 import de.nowchess.api.rules.RuleSet
 import de.nowchess.bot.bots.nnue.EvaluationNNUE
-import de.nowchess.bot.logic.AlphaBetaSearch
+import de.nowchess.bot.logic.{ParallelSearch, TranspositionTable}
 import de.nowchess.bot.util.{PolyglotBook, ZobristHash}
 import de.nowchess.bot.{BotDifficulty, BotMoveRepetition}
 import de.nowchess.rules.sets.DefaultRules

 object NNUEBot:
+  private def defaultThreads: Int =
+    sys.env.get("NNUE_SEARCH_THREADS").flatMap(_.toIntOption).filter(_ >= 1).getOrElse(1)
+
  def apply(
      difficulty: BotDifficulty,
      rules: RuleSet = DefaultRules,
      book: Option[PolyglotBook] = None,
      fixedMoveTimeMs: Option[Long] = None,
+      searchThreads: Int = defaultThreads,
  ): Bot =
-    val search = AlphaBetaSearch(rules, weights = EvaluationNNUE)
+    val search = ParallelSearch(rules, TranspositionTable(), () => EvaluationNNUE.freshEvaluator(), searchThreads)
    context =>
      val blockedMoves = BotMoveRepetition.blockedMoves(context)
      book
@@ -4,9 +4,11 @@ import de.nowchess.api.game.GameContext
 import de.nowchess.api.move.Move
 import de.nowchess.bot.ai.Evaluation

-object EvaluationNNUE extends Evaluation:
-
-  private val nnue = NNUE(NbaiLoader.loadDefault())
+/** One independent NNUE evaluator: wraps its own [[NNUE]] (own accumulator stack, scratch buffers and eval cache) plus
+  * the endgame mop-up correction. Independent instances may run concurrently as long as they share only the read-only
+  * [[NNUEWeights]].
+  */
+final class NNUEEvaluator(nnue: NNUE) extends Evaluation:

  val CHECKMATE_SCORE: Int = 10_000_000
  val DRAW_SCORE: Int      = 0
@@ -29,3 +31,28 @@ object EvaluationNNUE extends Evaluation:

  override def evaluateAccumulator(ply: Int, context: GameContext, hash: Long): Int =
    nnue.evaluateAtPlyWithValidation(ply, context.turn, hash, context.board) + MopUp.score(context)
+
+/** Default singleton evaluator plus a factory for independent per-thread evaluators that share the loaded weights. */
+object EvaluationNNUE extends Evaluation:
+
+  private val weights = NNUEWeights(NbaiLoader.loadDefault())
+  private val default = NNUEEvaluator(NNUE(weights))
+
+  /** Build a fresh evaluator backed by its own [[NNUE]] but sharing the immutable [[weights]] — one per search thread.
+    */
+  def freshEvaluator(): Evaluation = NNUEEvaluator(NNUE(weights))
+
+  val CHECKMATE_SCORE: Int = default.CHECKMATE_SCORE
+  val DRAW_SCORE: Int      = default.DRAW_SCORE
+
+  def evaluate(context: GameContext): Int = default.evaluate(context)
+
+  override def initAccumulator(context: GameContext): Unit = default.initAccumulator(context)
+
+  override def copyAccumulator(parentPly: Int, childPly: Int): Unit = default.copyAccumulator(parentPly, childPly)
+
+  override def pushAccumulator(childPly: Int, move: Move, parent: GameContext, child: GameContext): Unit =
+    default.pushAccumulator(childPly, move, parent, child)
+
+  override def evaluateAccumulator(ply: Int, context: GameContext, hash: Long): Int =
+    default.evaluateAccumulator(ply, context, hash)
@@ -95,7 +95,7 @@ final class AlphaBetaSearch(
    bestMoveWithTime(context, timeBudgetMs, Set.empty)

  def bestMoveWithTime(context: GameContext, timeBudgetMs: Long, excludedRootMoves: Set[Move]): Option[Move] =
-    doTimedSearch(context, timeBudgetMs, excludedRootMoves, Map.empty)
+    doTimedSearch(context, timeBudgetMs, excludedRootMoves, Map.empty, clearTt = true)

  def bestMoveWithTime(
      context: GameContext,
@@ -103,15 +103,27 @@ final class AlphaBetaSearch(
      excludedRootMoves: Set[Move],
      hints: Map[Move, Int],
  ): Option[Move] =
-    doTimedSearch(context, timeBudgetMs, excludedRootMoves, hints)
+    doTimedSearch(context, timeBudgetMs, excludedRootMoves, hints, clearTt = true)
+
+  /** Timed search over a transposition table that is shared with other workers (Lazy SMP): the caller is responsible
+    * for clearing it once before launching all workers, so this worker must not clear it.
+    */
+  def bestMoveWithTimeSharedTt(
+      context: GameContext,
+      timeBudgetMs: Long,
+      excludedRootMoves: Set[Move],
+      hints: Map[Move, Int],
+  ): Option[Move] =
+    doTimedSearch(context, timeBudgetMs, excludedRootMoves, hints, clearTt = false)

  private def doTimedSearch(
      context: GameContext,
      timeBudgetMs: Long,
      excludedRootMoves: Set[Move],
      hints: Map[Move, Int],
+      clearTt: Boolean,
  ): Option[Move] =
-    tt.clear()
+    if clearTt then tt.clear()
    ordering.clear()
    weights.initAccumulator(context)
    timeStartMs.set(System.currentTimeMillis)
@@ -0,0 +1,56 @@
+package de.nowchess.bot.logic
+
+import de.nowchess.api.game.GameContext
+import de.nowchess.api.move.Move
+import de.nowchess.api.rules.RuleSet
+import de.nowchess.bot.ai.Evaluation
+import de.nowchess.rules.sets.DefaultRules
+
+import java.util.concurrent.{Callable, Executors}
+import scala.jdk.CollectionConverters.*
+
+/** Lazy SMP search coordinator.
+  *
+  * Runs `numThreads` independent [[AlphaBetaSearch]] workers over one shared transposition table for the same time
+  * budget. Every worker has its own evaluator (independent NNUE accumulator) and move-ordering state, but they share
+  * the thread-safe TT, so faster-progressing threads deepen entries the others reuse. Only the main worker's move is
+  * returned; helpers exist purely to enrich the shared TT.
+  *
+  * `numThreads <= 1` runs a single worker via the ordinary clearing entry point, equivalent to sequential
+  * [[AlphaBetaSearch]].
+  */
+final class ParallelSearch(
+    rules: RuleSet = DefaultRules,
+    tt: TranspositionTable = TranspositionTable(),
+    evalFactory: () => Evaluation,
+    numThreads: Int = 1,
+):
+
+  private val threadCount = math.max(1, numThreads)
+  private val workers     = Vector.fill(threadCount)(AlphaBetaSearch(rules, tt, evalFactory()))
+
+  def bestMoveWithTime(
+      context: GameContext,
+      timeBudgetMs: Long,
+      excludedRootMoves: Set[Move] = Set.empty,
+      hints: Map[Move, Int] = Map.empty,
+  ): Option[Move] =
+    if threadCount == 1 then workers.head.bestMoveWithTime(context, timeBudgetMs, excludedRootMoves, hints)
+    else runParallel(context, timeBudgetMs, excludedRootMoves, hints)
+
+  private def runParallel(
+      context: GameContext,
+      timeBudgetMs: Long,
+      excludedRootMoves: Set[Move],
+      hints: Map[Move, Int],
+  ): Option[Move] =
+    tt.clear()
+    val pool = Executors.newFixedThreadPool(threadCount)
+    try
+      val tasks = workers.map { worker =>
+        new Callable[Option[Move]]:
+          def call(): Option[Move] =
+            worker.bestMoveWithTimeSharedTt(context, timeBudgetMs, excludedRootMoves, hints)
+      }
+      pool.invokeAll(tasks.asJava).get(0).get()
+    finally pool.shutdownNow()
@@ -0,0 +1,28 @@
+package de.nowchess.bot
+
+import de.nowchess.api.game.GameContext
+import de.nowchess.bot.bots.classic.EvaluationClassic
+import de.nowchess.bot.logic.{ParallelSearch, TranspositionTable}
+import de.nowchess.rules.sets.DefaultRules
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.matchers.should.Matchers
+
+class ParallelSearchTest extends AnyFunSuite with Matchers:
+
+  private def search(threads: Int): ParallelSearch =
+    ParallelSearch(DefaultRules, TranspositionTable(), () => EvaluationClassic, threads)
+
+  test("single-threaded coordinator returns a legal move on the initial position"):
+    val move = search(1).bestMoveWithTime(GameContext.initial, 200L)
+    move should not be None
+    DefaultRules.allLegalMoves(GameContext.initial) should contain(move.get)
+
+  test("multi-threaded Lazy SMP returns a legal move and does not crash under concurrency"):
+    val parallel = search(4)
+    for _ <- 1 to 5 do
+      val move = parallel.bestMoveWithTime(GameContext.initial, 200L)
+      move should not be None
+      DefaultRules.allLegalMoves(GameContext.initial) should contain(move.get)
+
+  test("numThreads below one is clamped to a single worker"):
+    search(0).bestMoveWithTime(GameContext.initial, 100L) should not be None