Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4a397eed7f | |||
| 9d656624d8 | |||
| e2b4342f60 |
@@ -1146,3 +1146,59 @@
|
||||
### Reverts
|
||||
|
||||
* Revert "refactor: update metrics paths formatting in application.yml for clarity" ([3870566](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/38705663498d5f47c40dafe2f26198589ede8656))
|
||||
## (2026-06-24)
|
||||
|
||||
### Features
|
||||
|
||||
* add initialization metrics for various services ([d438e97](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d438e97f32bdde0bfc63c1b4a8cc810cdd093166))
|
||||
* add OpenTelemetry trace configuration with parentbased sampler ([3904d5a](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/3904d5ad8ad4930ddee65287a7bfab785a6148f5))
|
||||
* **analytics:** add Spark batch analytics module ([#70](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/70)) ([39f1657](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/39f1657e1db6e84889af338c43be8cb5c03c3ec3))
|
||||
* **config:** update application.yml for PostgreSQL and remove staging/production configurations ([2404e61](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/2404e6164c3b50ffccbea5238d636060d6abe4d6))
|
||||
* **config:** update application.yml for staging and production environments ([6113432](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/6113432a14c476a3a0dfc0d449e17d023697f2ba))
|
||||
* configure logging and add OpenTelemetry support ([#49](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/49)) ([d57c488](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/d57c4886612d1d92da0e1b79209fc83e6ef537a1))
|
||||
* **docker:** add .dockerignore and .gitignore files for build exclusions ([c987d8e](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/c987d8e258c0e6c4cfbdaa8381c64c410d7a2b83))
|
||||
* **docker:** add Dockerfiles for building Quarkus application in native and JVM modes ([3f2d2bb](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/3f2d2bb4c97fa8cddba66e1da4427c54236dfeed))
|
||||
* **docker:** add Dockerfiles for Quarkus application in JVM and native modes ([34b9933](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/34b993304670cf2aa62cd2f6460cee7b9864b08e))
|
||||
* **events:** migrate game-creation and bot flows to Redis Streams NCS-89 ([#62](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/62)) ([a24924c](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/a24924c23057db3d700a75dbc4333557789cd991))
|
||||
* **ncs-110:** feed NNUE root-move scores into search move ordering ([#83](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/83)) ([e4fee85](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/e4fee8513430093d46957970618935e99591519f))
|
||||
* NCS-78 Add Traceability to the Applications ([#46](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/46)) ([649566e](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/649566eb3fcf38f91c8896a739f74ea318af312d))
|
||||
* NCS-78 Add Traceability to the Applications ([#47](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/47)) ([87dfc6c](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/87dfc6c2bcce7f7d58fc641bd8d468a2e584c108))
|
||||
* NCS-82 add Swiss-system tournament module ([#55](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/55)) ([c5661de](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/c5661de4a0ebf4b33211f5a391840dcf744656b7))
|
||||
* **official-bots:** activate opening book in expert bot (native-safe) ([260db25](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/260db25803ec55ce99e55782791eabdc190dfed4))
|
||||
* **official-bots:** add Google Colab notebook for NNUE training (NCS-111) ([#81](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/81)) ([fa10852](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/fa10852bc98451d4068ec6fb9e7a486b5e53ef5c))
|
||||
* **official-bots:** consume GameOver stream for bot cleanup ([#67](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/67)) ([db9d153](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/db9d1533912f4b41c4d1ca80ccffdde5d23d6ff6))
|
||||
* **official-bots:** implement king-relative (HalfKP) encoding in NNUE (NCS-109) ([#80](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/80)) ([44f376f](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/44f376f03221f086b898741436e13c93fd314dd1))
|
||||
* **official-bots:** make HybridBot veto actionable and use it for expert ([1df29cf](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/1df29cf3a6e21af3f396b2b7a6da67d978f941ae))
|
||||
* **official-bots:** park expert bot on tournament server at startup ([#75](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/75)) ([30295a4](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/30295a4bb95855ee8261c92278bb9ebc80ee12ee))
|
||||
* **official-bots:** resolve tournament bot token from Redis and account service ([386ddc5](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/386ddc5c19f8f893b16c6422aa5393b54c872e45))
|
||||
* **official-bots:** standalone self-play + one-shot dataset builder for NNUE training ([1c80abd](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/1c80abdb8a45814d642d43c633cde81ce7374c4f))
|
||||
* **tournament:** auto-join external tournaments and publish created ones ([#77](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/77)) ([9978b7e](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/9978b7ea78eb658a225a461b9cd339386c0c14f3))
|
||||
* **tournament:** federate tournaments across clusters with DB replication ([5b000a6](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/5b000a6e5f04ea6770d1c7ab6bfdaded77a99172))
|
||||
* **tournament:** seed external server registry from env var on startup ([845dc9c](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/845dc9c2935c8bc1be42541dfaf31c9a861d3272))
|
||||
* true-microservices ([#40](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/40)) ([5909242](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/590924254e8a2754de661a57a03e43f89ceb6299))
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* enable official bots to connect to external tournament server ([#71](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/71)) ([688d30e](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/688d30e2b10026923372be5fca3c63eaaee2de2a))
|
||||
* modified training pipeline ([9f9140c](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/9f9140cb585345cd244a1dfee1a06e51a5f7f7a8))
|
||||
* **official-bots:** configure JWT verification ([#72](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/72)) ([98c64fc](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/98c64fc0d56dc542beb31c75f4b9056d91de03cd))
|
||||
* **official-bots:** correct parkOn path from /api/bots to /api/account/bots ([1be9949](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/1be9949c0b5c6a1db535696620d77735050d6c93))
|
||||
* **official-bots:** derive tournament game color from game endpoint ([#79](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/79)) ([bfc4672](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/bfc46723e615bb9b65f7f9bba5f53877c4f079a7))
|
||||
* **official-bots:** discover tournament games by polling, not just the stream ([10113fd](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/10113fd0579b614d15870798d933bc9c495d2049))
|
||||
* **official-bots:** make botToken optional, fall back to env, fix 502 status ([f43d193](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/f43d1930d80670d810c57b54eaa3789854fa082c))
|
||||
* **official-bots:** NCS-70-auto-register official bots with account service ([#59](https://git.janis-eccarius.de/NowChess/NowChessSystems/issues/59)) ([7117a93](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/7117a93376272094d0b1a6abf2121254ce396684))
|
||||
* **official-bots:** park on external tournament servers using correct endpoint and token ([3188241](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/31882417377468b41bbe3ff94506aa4928024450))
|
||||
* **official-bots:** play games by polling state instead of NDJSON stream ([bfb15c7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/bfb15c7299bd471d5e064a577ed10af98e2ea90a))
|
||||
* **official-bots:** play only own tournament games with correct color ([4651bb7](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/4651bb796f07a21bd013d9521b2dfe2e1078cebb))
|
||||
* **official-bots:** prevent Colab OOM in NNUE training ([e2b4342](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/e2b4342f602215b5e8de6fccafc4105525a1ddd1))
|
||||
* **official-bots:** prioritize Redis token over stale env var in joinTournament ([83dd2d4](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/83dd2d4335ca48eb3e5aa234a75367574276ba63))
|
||||
* **official-bots:** register with tournament server directly to get correct token ([64b5d55](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/64b5d5567f110c2fe152558c7de275a1e0b30e21))
|
||||
* **official-bots:** resolve per-difficulty bot token on tournament join ([fdf4c94](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/fdf4c94811d086996447bb4657fac1d9bd6e5a93))
|
||||
* **official-bots:** resume tournaments already joined after restart ([285b73e](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/285b73efbd6dd98cec410ade9eead9881d693a8f))
|
||||
* **official-bots:** stream NNUE features as sparse indices to stop host OOM ([9d65662](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/9d656624d85889f55746faa5704578e248f9b088))
|
||||
* **official-bots:** sync bots before token fetch on first startup after DB wipe ([b0ddb27](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/b0ddb274d23bca8b1b3f691ce0d643f33e0b54cd))
|
||||
* **official-bots:** use ThreadLocalRandom in PolyglotBook for native image ([1b30c3b](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/1b30c3be393d25712c8743d3d9057207f8bbb67c))
|
||||
|
||||
### Reverts
|
||||
|
||||
* Revert "refactor: update metrics paths formatting in application.yml for clarity" ([3870566](https://git.janis-eccarius.de/NowChess/NowChessSystems/commit/38705663498d5f47c40dafe2f26198589ede8656))
|
||||
|
||||
@@ -92,28 +92,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from train import train_nnue, burst_train, DEFAULT_HIDDEN_SIZES\n",
|
||||
"\n",
|
||||
"WEIGHTS_DIR = Path(DRIVE_ROOT) / 'weights'\n",
|
||||
"WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)\n",
|
||||
"OUTPUT_FILE = str(WEIGHTS_DIR / 'nnue_weights.pt')\n",
|
||||
"\n",
|
||||
"# ── Training hyperparameters ──────────────────────────────────────────────────\n",
|
||||
"HIDDEN_SIZES = DEFAULT_HIDDEN_SIZES # [1536, 1024, 512, 256]\n",
|
||||
"BATCH_SIZE = 16384\n",
|
||||
"EPOCHS = 100\n",
|
||||
"EARLY_STOPPING = 10 # None to disable\n",
|
||||
"SUBSAMPLE_RATIO = 1.0\n",
|
||||
"\n",
|
||||
"# Resume from latest checkpoint if one exists\n",
|
||||
"checkpoints = sorted(WEIGHTS_DIR.glob('nnue_weights_v*.pt'))\n",
|
||||
"CHECKPOINT = str(checkpoints[-1]) if checkpoints else None\n",
|
||||
"if CHECKPOINT:\n",
|
||||
" print(f'Resuming from checkpoint: {CHECKPOINT}')\n",
|
||||
"else:\n",
|
||||
" print('Starting training from scratch.')"
|
||||
],
|
||||
"source": "from train import train_nnue, burst_train, DEFAULT_HIDDEN_SIZES\n\nWEIGHTS_DIR = Path(DRIVE_ROOT) / 'weights'\nWEIGHTS_DIR.mkdir(parents=True, exist_ok=True)\nOUTPUT_FILE = str(WEIGHTS_DIR / 'nnue_weights.pt')\n\n# ── Training hyperparameters ──────────────────────────────────────────────────\nHIDDEN_SIZES = DEFAULT_HIDDEN_SIZES\n# Features are streamed as sparse indices and densified on the GPU per batch, so\n# host RAM is no longer the limit — GPU memory is. A dense batch is\n# batch_size * 98304 * 4 bytes on the GPU (~3.2 GB at 8192 on a 16 GB T4).\nBATCH_SIZE = 8192\nEPOCHS = 100\nEARLY_STOPPING = 10 # None to disable\nSUBSAMPLE_RATIO = 1.0\n\n# Resume from latest checkpoint if one exists\ncheckpoints = sorted(WEIGHTS_DIR.glob('nnue_weights_v*.pt'))\nCHECKPOINT = str(checkpoints[-1]) if checkpoints else None\nif CHECKPOINT:\n print(f'Resuming from checkpoint: {CHECKPOINT}')\nelse:\n print('Starting training from scratch.')",
|
||||
"id": "train-config"
|
||||
},
|
||||
{
|
||||
|
||||
@@ -13,6 +13,11 @@ import chess
|
||||
from datetime import datetime, timedelta
|
||||
import re
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
# DataLoader workers: cap to the machine's CPUs (Colab free tier = 2). Too many
|
||||
# workers each fork the dataset and OOM-kill the runtime.
|
||||
LOADER_WORKERS = int(os.environ.get("NNUE_LOADER_WORKERS", min(4, os.cpu_count() or 2)))
|
||||
|
||||
|
||||
def _shard_files(data_file):
|
||||
@@ -77,9 +82,12 @@ class NNUEDataset(Dataset):
|
||||
def __getitem__(self, idx):
|
||||
fen = self.positions[idx]
|
||||
eval_val = self.evals[idx]
|
||||
features = fen_to_features(fen)
|
||||
# Return only the active feature indices (~64), not a dense 98304-dim vector.
|
||||
# The training loop scatters these into a dense batch on the GPU, keeping host
|
||||
# RAM tiny. Densifying per-item here OOM-kills the runtime.
|
||||
indices = fen_to_indices(fen)
|
||||
|
||||
# Board is flipped for Black-to-move in fen_to_features; negate eval
|
||||
# Board is flipped for Black-to-move in fen_to_indices; negate eval
|
||||
# so the label still means "good for the side shown as White after flip"
|
||||
if ' b ' in fen:
|
||||
eval_val = -eval_val
|
||||
@@ -90,7 +98,7 @@ class NNUEDataset(Dataset):
|
||||
else:
|
||||
target = torch.sigmoid(torch.tensor(eval_val / 400.0, dtype=torch.float32))
|
||||
|
||||
return features, target
|
||||
return indices, target
|
||||
|
||||
# King-relative (HalfKP) encoding: two perspectives, one per side's king.
|
||||
# Each piece is encoded as: kingSq * 768 + pieceIdx * 64 + sq
|
||||
@@ -105,14 +113,14 @@ _PIECE_TO_IDX = {
|
||||
}
|
||||
|
||||
|
||||
def fen_to_features(fen):
|
||||
"""Convert FEN to 98304-dim king-relative (HalfKP) feature vector.
|
||||
def fen_to_indices(fen):
|
||||
"""Active king-relative (HalfKP) feature indices for a FEN (~64 entries).
|
||||
|
||||
For Black-to-move positions the board is mirrored (ranks flipped, colours
|
||||
swapped) so the network always sees the position from the side-to-move's
|
||||
perspective. The caller is responsible for negating the eval label to match.
|
||||
perspective. The caller is responsible for negating the eval label to match.
|
||||
"""
|
||||
features = torch.zeros(INPUT_SIZE, dtype=torch.float32)
|
||||
indices = []
|
||||
try:
|
||||
board = chess.Board(fen)
|
||||
# Perspective flip: present all positions as if White is to move
|
||||
@@ -121,20 +129,41 @@ def fen_to_features(fen):
|
||||
wk = board.king(chess.WHITE)
|
||||
bk = board.king(chess.BLACK)
|
||||
if wk is None or bk is None:
|
||||
return features
|
||||
return torch.zeros(0, dtype=torch.long)
|
||||
for sq in chess.SQUARES:
|
||||
piece = board.piece_at(sq)
|
||||
if piece is None:
|
||||
continue
|
||||
pidx = _PIECE_TO_IDX[piece.symbol()]
|
||||
# White-king perspective (indices 0 .. _HALF_SIZE-1)
|
||||
features[wk * 768 + pidx * 64 + sq] = 1.0
|
||||
indices.append(wk * 768 + pidx * 64 + sq)
|
||||
# Black-king perspective (indices _HALF_SIZE .. INPUT_SIZE-1)
|
||||
features[_HALF_SIZE + bk * 768 + pidx * 64 + sq] = 1.0
|
||||
indices.append(_HALF_SIZE + bk * 768 + pidx * 64 + sq)
|
||||
except Exception:
|
||||
pass
|
||||
return torch.zeros(0, dtype=torch.long)
|
||||
return torch.tensor(indices, dtype=torch.long)
|
||||
|
||||
|
||||
def fen_to_features(fen):
|
||||
"""Dense 98304-dim HalfKP vector. Kept for external callers; training uses the
|
||||
sparse indices + GPU scatter path instead (see _collate_sparse)."""
|
||||
features = torch.zeros(INPUT_SIZE, dtype=torch.float32)
|
||||
features[fen_to_indices(fen)] = 1.0
|
||||
return features
|
||||
|
||||
|
||||
def _collate_sparse(batch):
|
||||
"""Collate (indices, target) items into (row_idx, col_idx, batch_size), targets.
|
||||
|
||||
Row/col index pairs address the active features of a dense [B, INPUT_SIZE] tensor
|
||||
that the training loop allocates on the GPU — so the host only ever holds the
|
||||
sparse indices, never a dense batch."""
|
||||
idx_list, targets = zip(*batch)
|
||||
rows = torch.cat([torch.full((idx.numel(),), i, dtype=torch.long)
|
||||
for i, idx in enumerate(idx_list)])
|
||||
cols = torch.cat(idx_list)
|
||||
return (rows, cols, len(idx_list)), torch.stack(targets)
|
||||
|
||||
# Smaller hidden layers are appropriate: the L1 input is very sparse (~64 active
|
||||
# features out of 98304) so the L1 itself is cheap to update incrementally; the
|
||||
# larger capacity comes from the wider perspective encoding, not deeper layers.
|
||||
@@ -256,17 +285,19 @@ def _setup_training(data_file, batch_size, subsample_ratio):
|
||||
train_dataset,
|
||||
batch_size=batch_size,
|
||||
sampler=train_sampler,
|
||||
num_workers=8,
|
||||
num_workers=LOADER_WORKERS,
|
||||
pin_memory=True,
|
||||
persistent_workers=True
|
||||
persistent_workers=LOADER_WORKERS > 0,
|
||||
collate_fn=_collate_sparse,
|
||||
)
|
||||
val_loader = DataLoader(
|
||||
val_dataset,
|
||||
batch_size=batch_size,
|
||||
shuffle=False,
|
||||
num_workers=8,
|
||||
num_workers=LOADER_WORKERS,
|
||||
pin_memory=True,
|
||||
persistent_workers=True
|
||||
persistent_workers=LOADER_WORKERS > 0,
|
||||
collate_fn=_collate_sparse,
|
||||
)
|
||||
|
||||
return device, dataset, train_dataset, val_dataset, train_loader, val_loader, num_positions
|
||||
@@ -304,8 +335,9 @@ def _run_training_season(
|
||||
model.train()
|
||||
train_loss = 0.0
|
||||
with tqdm(total=len(train_loader), desc=f"Epoch {epoch_display}/{total_epochs} - Train") as pbar:
|
||||
for batch_features, batch_targets in train_loader:
|
||||
batch_features = batch_features.to(device)
|
||||
for (rows, cols, bsz), batch_targets in train_loader:
|
||||
batch_features = torch.zeros(bsz, INPUT_SIZE, device=device)
|
||||
batch_features[rows.to(device), cols.to(device)] = 1.0
|
||||
batch_targets = batch_targets.to(device).unsqueeze(1)
|
||||
|
||||
optimizer.zero_grad()
|
||||
@@ -318,7 +350,7 @@ def _run_training_season(
|
||||
scaler.step(optimizer)
|
||||
scaler.update()
|
||||
|
||||
train_loss += loss.item() * batch_features.size(0)
|
||||
train_loss += loss.item() * bsz
|
||||
pbar.update(1)
|
||||
|
||||
train_loss /= len(train_dataset)
|
||||
@@ -328,14 +360,15 @@ def _run_training_season(
|
||||
val_loss = 0.0
|
||||
with torch.no_grad():
|
||||
with tqdm(total=len(val_loader), desc=f"Epoch {epoch_display}/{total_epochs} - Val") as pbar:
|
||||
for batch_features, batch_targets in val_loader:
|
||||
batch_features = batch_features.to(device)
|
||||
for (rows, cols, bsz), batch_targets in val_loader:
|
||||
batch_features = torch.zeros(bsz, INPUT_SIZE, device=device)
|
||||
batch_features[rows.to(device), cols.to(device)] = 1.0
|
||||
batch_targets = batch_targets.to(device).unsqueeze(1)
|
||||
|
||||
with torch.amp.autocast('cuda' if torch.cuda.is_available() else 'cpu'):
|
||||
outputs = model(batch_features)
|
||||
loss = criterion(outputs, batch_targets)
|
||||
val_loss += loss.item() * batch_features.size(0)
|
||||
val_loss += loss.item() * bsz
|
||||
pbar.update(1)
|
||||
|
||||
val_loss /= len(val_dataset)
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
MAJOR=0
|
||||
MINOR=39
|
||||
MINOR=40
|
||||
PATCH=0
|
||||
|
||||
Reference in New Issue
Block a user