feat(bot): add Lazy SMP parallel search for the NNUE bot

Adds optional multithreaded search behind a thread count that defaults to
1, so the live bot's play is unchanged until explicitly configured.

- ParallelSearch runs N AlphaBetaSearch workers over one shared,
  already-lock-protected TranspositionTable. Each worker has its own NNUE
  evaluator (independent accumulator) and ordering state; helpers only
  deepen the shared TT, and only the main worker's move is returned.
- AlphaBetaSearch gains bestMoveWithTimeSharedTt: the coordinator clears
  the shared TT once before launching workers, so helpers must not clear.
- EvaluationNNUE.freshEvaluator builds independent evaluators sharing the
  immutable weights (one per thread); the singleton still backs the
  default single-instance path.
- NNUEBot uses ParallelSearch with NNUE_SEARCH_THREADS (default 1).

numThreads <= 1 takes the single-worker clearing path, identical to the
previous sequential search. Strength can be validated by self-play
(threads N vs 1) before promoting the default.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-30 12:18:25 +02:00
parent b72e8ec017
commit 3437dab49b
5 changed files with 135 additions and 8 deletions
@@ -5,19 +5,23 @@ import de.nowchess.api.game.GameContext
import de.nowchess.api.move.Move
import de.nowchess.api.rules.RuleSet
import de.nowchess.bot.bots.nnue.EvaluationNNUE
import de.nowchess.bot.logic.AlphaBetaSearch
import de.nowchess.bot.logic.{ParallelSearch, TranspositionTable}
import de.nowchess.bot.util.{PolyglotBook, ZobristHash}
import de.nowchess.bot.{BotDifficulty, BotMoveRepetition}
import de.nowchess.rules.sets.DefaultRules
object NNUEBot:
/** Default worker count for the search, read from the `NNUE_SEARCH_THREADS` environment variable.
 *
 * Falls back to 1 when the variable is unset, is not a base-10 integer, or is below 1. The value is trimmed before
 * parsing because `toIntOption` rejects surrounding whitespace, which would otherwise silently discard a setting
 * such as `" 4"`.
 */
private def defaultThreads: Int =
  sys.env
    .get("NNUE_SEARCH_THREADS")
    .map(_.trim)
    .flatMap(_.toIntOption)
    .filter(_ >= 1)
    .getOrElse(1)
def apply(
difficulty: BotDifficulty,
rules: RuleSet = DefaultRules,
book: Option[PolyglotBook] = None,
fixedMoveTimeMs: Option[Long] = None,
searchThreads: Int = defaultThreads,
): Bot =
val search = AlphaBetaSearch(rules, weights = EvaluationNNUE)
val search = ParallelSearch(rules, TranspositionTable(), () => EvaluationNNUE.freshEvaluator(), searchThreads)
context =>
val blockedMoves = BotMoveRepetition.blockedMoves(context)
book
@@ -4,9 +4,11 @@ import de.nowchess.api.game.GameContext
import de.nowchess.api.move.Move
import de.nowchess.bot.ai.Evaluation
object EvaluationNNUE extends Evaluation:
private val nnue = NNUE(NbaiLoader.loadDefault())
/** One independent NNUE evaluator: wraps its own [[NNUE]] (own accumulator stack, scratch buffers and eval cache) plus
* the endgame mop-up correction. Independent instances may run concurrently as long as they share only the read-only
* [[NNUEWeights]].
*/
final class NNUEEvaluator(nnue: NNUE) extends Evaluation:
val CHECKMATE_SCORE: Int = 10_000_000
val DRAW_SCORE: Int = 0
@@ -29,3 +31,28 @@ object EvaluationNNUE extends Evaluation:
/** Scores the position at `ply` as the network's result plus the mop-up term for `context`.
 *
 * NOTE(review): what the "validation" in `evaluateAtPlyWithValidation` checks against `hash` is not visible here —
 * confirm in [[NNUE]].
 */
override def evaluateAccumulator(ply: Int, context: GameContext, hash: Long): Int =
  val networkScore = nnue.evaluateAtPlyWithValidation(ply, context.turn, hash, context.board)
  val mopUpScore   = MopUp.score(context)
  networkScore + mopUpScore
/** Singleton NNUE evaluation entry point.
 *
 * Implements [[Evaluation]] by forwarding every call to one internal [[NNUEEvaluator]], and also acts as a factory
 * for further evaluators built on the same loaded weights.
 */
object EvaluationNNUE extends Evaluation:

  // Loaded once when the object initialises; every evaluator created here is built on this same instance.
  private val sharedWeights = NNUEWeights(NbaiLoader.loadDefault())

  // Backs the forwarding members below.
  private val singleton = NNUEEvaluator(NNUE(sharedWeights))

  /** Creates a new evaluator with its own [[NNUE]] over the shared weights — intended as one per search thread. */
  def freshEvaluator(): Evaluation =
    NNUEEvaluator(NNUE(sharedWeights))

  val CHECKMATE_SCORE: Int = singleton.CHECKMATE_SCORE
  val DRAW_SCORE: Int      = singleton.DRAW_SCORE

  def evaluate(context: GameContext): Int =
    singleton.evaluate(context)

  override def initAccumulator(context: GameContext): Unit =
    singleton.initAccumulator(context)

  override def copyAccumulator(parentPly: Int, childPly: Int): Unit =
    singleton.copyAccumulator(parentPly, childPly)

  override def pushAccumulator(childPly: Int, move: Move, parent: GameContext, child: GameContext): Unit =
    singleton.pushAccumulator(childPly, move, parent, child)

  override def evaluateAccumulator(ply: Int, context: GameContext, hash: Long): Int =
    singleton.evaluateAccumulator(ply, context, hash)
@@ -95,7 +95,7 @@ final class AlphaBetaSearch(
bestMoveWithTime(context, timeBudgetMs, Set.empty)
def bestMoveWithTime(context: GameContext, timeBudgetMs: Long, excludedRootMoves: Set[Move]): Option[Move] =
doTimedSearch(context, timeBudgetMs, excludedRootMoves, Map.empty)
doTimedSearch(context, timeBudgetMs, excludedRootMoves, Map.empty, clearTt = true)
def bestMoveWithTime(
context: GameContext,
@@ -103,15 +103,27 @@ final class AlphaBetaSearch(
excludedRootMoves: Set[Move],
hints: Map[Move, Int],
): Option[Move] =
doTimedSearch(context, timeBudgetMs, excludedRootMoves, hints)
doTimedSearch(context, timeBudgetMs, excludedRootMoves, hints, clearTt = true)
/** Lazy SMP worker entry point: runs the timed search without clearing the transposition table first.
 *
 * The table is shared with other workers, so the caller must clear it once before launching them; this method
 * therefore passes `clearTt = false`.
 */
def bestMoveWithTimeSharedTt(
    context: GameContext,
    timeBudgetMs: Long,
    excludedRootMoves: Set[Move],
    hints: Map[Move, Int],
): Option[Move] =
  doTimedSearch(
    context = context,
    timeBudgetMs = timeBudgetMs,
    excludedRootMoves = excludedRootMoves,
    hints = hints,
    clearTt = false,
  )
private def doTimedSearch(
context: GameContext,
timeBudgetMs: Long,
excludedRootMoves: Set[Move],
hints: Map[Move, Int],
clearTt: Boolean,
): Option[Move] =
tt.clear()
if clearTt then tt.clear()
ordering.clear()
weights.initAccumulator(context)
timeStartMs.set(System.currentTimeMillis)
@@ -0,0 +1,56 @@
package de.nowchess.bot.logic
import de.nowchess.api.game.GameContext
import de.nowchess.api.move.Move
import de.nowchess.api.rules.RuleSet
import de.nowchess.bot.ai.Evaluation
import de.nowchess.rules.sets.DefaultRules
import java.util.concurrent.{Callable, Executors}
import scala.jdk.CollectionConverters.*
/** Lazy SMP search coordinator.
 *
 * Owns `max(1, numThreads)` [[AlphaBetaSearch]] workers that all use the same transposition table, each with its own
 * evaluator obtained from `evalFactory`. In the parallel case every worker searches the same position with the same
 * time budget; only the first (main) worker's move is returned and the other workers' results are discarded — they
 * exist to populate the shared table.
 *
 * With a single worker the call is forwarded to the ordinary `bestMoveWithTime` entry point of [[AlphaBetaSearch]],
 * which clears the table itself, so behaviour matches a plain sequential search.
 *
 * @param rules
 *   rule set handed to every worker
 * @param tt
 *   transposition table shared by all workers; it is accessed from several threads at once, so it must be safe for
 *   concurrent use
 * @param evalFactory
 *   invoked once per worker at construction time to obtain that worker's evaluator
 * @param numThreads
 *   requested worker count; values below 1 are treated as 1
 */
final class ParallelSearch(
    rules: RuleSet = DefaultRules,
    tt: TranspositionTable = TranspositionTable(),
    evalFactory: () => Evaluation,
    numThreads: Int = 1,
):
  private val threadCount = math.max(1, numThreads)

  // `Vector.fill` re-evaluates its element expression, so `evalFactory()` runs once per worker.
  private val workers = Vector.fill(threadCount)(AlphaBetaSearch(rules, tt, evalFactory()))

  /** Searches `context` for up to `timeBudgetMs` and returns the main worker's chosen move, if any.
   *
   * In the parallel case the call returns only after every worker has finished. An exception thrown by the main
   * worker is rethrown as-is in both the single-worker and the parallel case.
   */
  def bestMoveWithTime(
      context: GameContext,
      timeBudgetMs: Long,
      excludedRootMoves: Set[Move] = Set.empty,
      hints: Map[Move, Int] = Map.empty,
  ): Option[Move] =
    if threadCount == 1 then workers.head.bestMoveWithTime(context, timeBudgetMs, excludedRootMoves, hints)
    else runParallel(context, timeBudgetMs, excludedRootMoves, hints)

  /** Clears the shared table once, then runs every worker on a short-lived fixed-size pool. */
  private def runParallel(
      context: GameContext,
      timeBudgetMs: Long,
      excludedRootMoves: Set[Move],
      hints: Map[Move, Int],
  ): Option[Move] =
    tt.clear()
    val pool = Executors.newFixedThreadPool(threadCount)
    try
      val tasks = workers.map { worker =>
        new Callable[Option[Move]]:
          def call(): Option[Move] =
            worker.bestMoveWithTimeSharedTt(context, timeBudgetMs, excludedRootMoves, hints)
      }
      // invokeAll blocks until all tasks are done and returns futures in task order, so index 0 is the main worker.
      mainResult(pool.invokeAll(tasks.asJava).get(0))
    finally pool.shutdownNow()

  /** Returns the main worker's result, rethrowing the worker's own exception instead of the `ExecutionException`
   * wrapper so that failures look the same as in the single-worker path.
   */
  private def mainResult(future: java.util.concurrent.Future[Option[Move]]): Option[Move] =
    try future.get()
    catch case e: java.util.concurrent.ExecutionException => throw Option(e.getCause).getOrElse(e)
@@ -0,0 +1,28 @@
package de.nowchess.bot
import de.nowchess.api.game.GameContext
import de.nowchess.bot.bots.classic.EvaluationClassic
import de.nowchess.bot.logic.{ParallelSearch, TranspositionTable}
import de.nowchess.rules.sets.DefaultRules
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers
/** Smoke tests for [[ParallelSearch]]: each one searches the initial position with the classic evaluator. */
class ParallelSearchTest extends AnyFunSuite with Matchers:

  /** Builds a coordinator with a fresh transposition table and the requested number of threads. */
  private def search(threads: Int): ParallelSearch =
    ParallelSearch(DefaultRules, TranspositionTable(), () => EvaluationClassic, threads)

  /** Runs one timed search from the initial position and checks that a legal move came back. */
  private def expectLegalMove(engine: ParallelSearch, budgetMs: Long) =
    val chosen = engine.bestMoveWithTime(GameContext.initial, budgetMs)
    chosen should not be None
    DefaultRules.allLegalMoves(GameContext.initial) should contain(chosen.get)

  test("single-threaded coordinator returns a legal move on the initial position"):
    expectLegalMove(search(1), 200L)

  test("multi-threaded Lazy SMP returns a legal move and does not crash under concurrency"):
    // The same coordinator is reused so that repeated searches over one shared table are exercised.
    val parallel = search(4)
    (1 to 5).foreach { _ =>
      expectLegalMove(parallel, 200L)
    }

  test("numThreads below one is clamped to a single worker"):
    val clamped = search(0).bestMoveWithTime(GameContext.initial, 100L)
    clamped should not be None