feat(official-bots): standalone self-play + one-shot dataset builder for NNUE training
Build & Test (NowChessSystems) TeamCity build finished

Add an easy local data pipeline feeding GPU training on Colab.

- SelfPlayMain: standalone NNUEBot self-play (no microservices) writing FENs
  for labeling; randomised openings for game diversity, sequential due to the
  shared EvaluationNNUE accumulator. Exposed via the `selfPlay` Gradle task and
  selfplay.sh.
- NNUEBot: optional fixedMoveTimeMs so self-play runs fast (default unchanged).
- NbaiLoader: honor `-Dnnue.weights=<path>` to load weights from a file before
  falling back to the bundled resource.
- build_dataset.py / dataset.sh: a single command builds the entire dataset
  (Lichess eval-DB backbone + self-play + tactical + random filler),
  deduplicates it, balances the eval histogram, writes append-only zstd shards
  plus a manifest, and pushes them to Drive with rclone.
- train.py: NNUEDataset reads a directory of .jsonl.zst shards (streaming) in
  addition to a single file.
- NNUETraining.ipynb: clone to ephemeral /content, sync shards from Drive
  (cache-aware), train on the shards dir; removed Colab generation/upload steps.
- Concept + implementation plan docs.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Janis Eccarius
2026-06-24 22:04:22 +02:00
parent c8cbcdca3b
commit 1c80abdb8a
11 changed files with 909 additions and 198 deletions
+45 -19
View File
@@ -14,6 +14,33 @@ from datetime import datetime, timedelta
import re
import numpy as np
def _shard_files(data_file):
    """Resolve a data path to the list of shard files to read.

    Args:
        data_file: Path (``str`` or ``Path``) to either a single ``.jsonl`` /
            ``.jsonl.zst`` file, or a directory of shards (e.g. a synced
            datasets/ dir).

    Returns:
        A list of ``Path`` objects. For a directory this is the sorted result
        of a recursive search: every ``*.jsonl.zst`` file if at least one
        exists, otherwise every ``*.jsonl`` file. For any other path it is a
        one-element list holding that path (existence is not checked here).

    Raises:
        FileNotFoundError: If ``data_file`` is a directory that contains no
            shard files.
    """
    p = Path(data_file)
    if not p.is_dir():
        return [p]

    def _find(pattern):
        # rglob also yields directories whose name matches the pattern;
        # keep regular files only so a later open() cannot hit a directory.
        return sorted(f for f in p.rglob(pattern) if f.is_file())

    # Compressed shards take precedence; plain .jsonl is only a fallback.
    shards = _find("*.jsonl.zst") or _find("*.jsonl")
    if not shards:
        raise FileNotFoundError(f"No .jsonl/.jsonl.zst shards found under {p}")
    print(f"Loading {len(shards)} shard(s) from {p}")
    return shards
def _iter_dataset_lines(data_file):
    """Yield text lines from every shard, transparently decompressing .zst shards.

    Args:
        data_file: A single shard file or a directory of shards; resolved to
            concrete files by ``_shard_files``.

    Yields:
        Lines of text, one at a time, shard after shard, in the order
        ``_shard_files`` returns them. Both compressed and plain shards are
        decoded as UTF-8.
    """
    import io

    for shard in _shard_files(data_file):
        if str(shard).endswith(".zst"):
            # Imported lazily so zstandard is only required when a
            # compressed shard is actually encountered.
            import zstandard as zstd

            # NOTE(review): stream_reader is called with its defaults; if a
            # shard can hold several concatenated zstd frames, confirm the
            # default reads across all of them.
            with open(shard, "rb") as fh, zstd.ZstdDecompressor().stream_reader(fh) as reader:
                yield from io.TextIOWrapper(reader, encoding="utf-8")
        else:
            # Explicit UTF-8 keeps plain shards consistent with the .zst
            # branch instead of depending on the platform's locale encoding.
            with open(shard, "r", encoding="utf-8") as fh:
                yield from fh
class NNUEDataset(Dataset):
"""Dataset of chess positions with evaluations."""
@@ -23,27 +50,26 @@ class NNUEDataset(Dataset):
self.evals_raw = []
self.is_normalized = None
with open(data_file, 'r') as f:
for line in f:
try:
data = json.loads(line)
fen = data['fen']
eval_val = data['eval']
self.positions.append(fen)
self.evals.append(eval_val)
for line in _iter_dataset_lines(data_file):
try:
data = json.loads(line)
fen = data['fen']
eval_val = data['eval']
self.positions.append(fen)
self.evals.append(eval_val)
# Check if normalized or raw
if self.is_normalized is None:
# If eval is in range [-1, 1], assume normalized
self.is_normalized = abs(eval_val) <= 1.0
# Check if normalized or raw
if self.is_normalized is None:
# If eval is in range [-1, 1], assume normalized
self.is_normalized = abs(eval_val) <= 1.0
# Store raw if available
if 'eval_raw' in data:
self.evals_raw.append(data['eval_raw'])
else:
self.evals_raw.append(eval_val)
except (json.JSONDecodeError, KeyError):
pass
# Store raw if available
if 'eval_raw' in data:
self.evals_raw.append(data['eval_raw'])
else:
self.evals_raw.append(eval_val)
except (json.JSONDecodeError, KeyError):
pass
def __len__(self):
return len(self.positions)