feat: integrate NNUE bot and add Python training pipeline with weight export functionality
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
# Data and weights are local artifacts, not committed
|
||||
data/
weights/
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
.venv
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
@@ -1,383 +0,0 @@
|
||||
# Debugging the NNUE Pipeline
|
||||
|
||||
## Common Issues & Solutions
|
||||
|
||||
### Issue 1: Empty training_data.jsonl
|
||||
|
||||
**Symptom:** After running the pipeline, `training_data.jsonl` is empty or doesn't exist.
|
||||
|
||||
**Diagnosis:** Run labeling with verbose output:
|
||||
|
||||
```bash
|
||||
python label_positions.py positions.txt training_data.jsonl /path/to/stockfish --verbose
|
||||
```
|
||||
|
||||
**Check these in order:**
|
||||
|
||||
#### 1. Is `positions.txt` empty?
|
||||
|
||||
```bash
|
||||
wc -l positions.txt
|
||||
```
|
||||
|
||||
If 0 lines: positions generator is failing. See Issue 2.
|
||||
|
||||
If >0 lines: positions exist. Check step 2.
|
||||
|
||||
#### 2. Is Stockfish installed and working?
|
||||
|
||||
```bash
|
||||
# Linux/macOS
|
||||
which stockfish
|
||||
stockfish --version
|
||||
|
||||
# Windows
|
||||
where stockfish
|
||||
C:\path\to\stockfish.exe --version
|
||||
```
|
||||
|
||||
If not found: Install from https://stockfishchess.org
|
||||
|
||||
#### 3. Is the Stockfish path correct?
|
||||
|
||||
```bash
|
||||
# Check what path the labeler is using
|
||||
export STOCKFISH_PATH=/your/path/to/stockfish
|
||||
echo $STOCKFISH_PATH
|
||||
|
||||
python label_positions.py positions.txt training_data.jsonl $STOCKFISH_PATH --verbose
|
||||
```
|
||||
|
||||
The script will print at the top: `Using Stockfish: /path/to/stockfish`
|
||||
|
||||
#### 4. Check the error summary
|
||||
|
||||
After running with verbose, look for the summary:
|
||||
|
||||
```
|
||||
============================================================
|
||||
LABELING SUMMARY
|
||||
============================================================
|
||||
Successfully evaluated: 0 ← This should be > 0
|
||||
Skipped (duplicates): 0
|
||||
Skipped (invalid): 0
|
||||
Errors: 0
|
||||
```
|
||||
|
||||
If "Successfully evaluated" is 0, positions aren't being saved.
|
||||
|
||||
---
|
||||
|
||||
### Issue 2: Empty positions.txt
|
||||
|
||||
**Symptom:** `positions.txt` is empty after running `generate_positions.py`
|
||||
|
||||
**Diagnosis:** Check the generation summary:
|
||||
|
||||
```bash
|
||||
python generate_positions.py positions.txt --games 10000
|
||||
```
|
||||
|
||||
Expected output:
|
||||
|
||||
```
|
||||
============================================================
|
||||
POSITION GENERATION SUMMARY
|
||||
============================================================
|
||||
Total games: 10000
|
||||
Saved positions: 1234 ← This should be > 0
|
||||
Filtered (check): 2345
|
||||
Filtered (captures): 4321
|
||||
Filtered (game over): 1100
|
||||
Total filtered: 7766
|
||||
Acceptance rate: 12.34%
|
||||
============================================================
|
||||
```
|
||||
|
||||
**If Saved positions = 0:**
|
||||
|
||||
The filters are too strict! Try with `--no-filter-captures`:
|
||||
|
||||
```bash
|
||||
python generate_positions.py positions.txt --games 10000 --no-filter-captures
|
||||
```
|
||||
|
||||
This allows positions with available captures, which should greatly increase the output.
|
||||
|
||||
---
|
||||
|
||||
### Issue 3: Stockfish Errors During Labeling
|
||||
|
||||
**Symptom:** Labeling runs but shows errors like:
|
||||
```
|
||||
Error evaluating position: rnbqkbnr/pppppppp...
|
||||
SomeError: [error details]
|
||||
```
|
||||
|
||||
**Solutions:**
|
||||
|
||||
1. **Check Stockfish is responsive:**
|
||||
```bash
|
||||
# Test Stockfish directly
|
||||
echo "position startpos" | stockfish
|
||||
echo "quit" | stockfish
|
||||
```
|
||||
|
||||
2. **Try with lower depth** (faster, fewer timeouts):
|
||||
```bash
|
||||
python label_positions.py positions.txt training_data.jsonl /path/to/stockfish --depth 8
|
||||
```
|
||||
|
||||
3. **Use explicit path** instead of relying on PATH:
|
||||
```bash
|
||||
python label_positions.py positions.txt training_data.jsonl /usr/games/stockfish
|
||||
```
|
||||
|
||||
4. **Check if FENs in positions.txt are valid:**
|
||||
```bash
|
||||
head -5 positions.txt
|
||||
```
|
||||
|
||||
Output should look like:
|
||||
```
|
||||
rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq e3 0 1
|
||||
rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq e3 0 1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Issue 4: Training Fails - No Valid Data
|
||||
|
||||
**Symptom:** `train_nnue.py` crashes with:
|
||||
```
|
||||
IndexError: list index out of range
|
||||
```
|
||||
|
||||
**Cause:** `training_data.jsonl` is empty or contains invalid JSON.
|
||||
|
||||
**Debug:**
|
||||
|
||||
```bash
|
||||
# Check file size
|
||||
ls -lh training_data.jsonl
|
||||
|
||||
# Count valid lines
|
||||
python -c "import json; lines = [1 for line in open('training_data.jsonl') if json.loads(line)]; print(f'Valid lines: {len(lines)}')"
|
||||
|
||||
# Look at first few lines
|
||||
head -3 training_data.jsonl
|
||||
```
|
||||
|
||||
Expected output:
|
||||
```
|
||||
{"fen": "rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq e3 0 1", "eval": 45}
|
||||
{"fen": "rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq e3 0 1", "eval": 48}
|
||||
```
|
||||
|
||||
If empty: go back to Issue 1.
|
||||
|
||||
---
|
||||
|
||||
## Step-by-Step Verification
|
||||
|
||||
Run this to verify each step works:
|
||||
|
||||
```bash
|
||||
cd modules/bot/python
|
||||
|
||||
# Step 1: Generate 1000 positions (quick test)
|
||||
echo "Testing position generation..."
|
||||
python generate_positions.py test_positions.txt --games 1000 --no-filter-captures
|
||||
|
||||
# Check output
|
||||
if [ ! -s test_positions.txt ]; then
|
||||
echo "ERROR: test_positions.txt is empty"
|
||||
exit 1
|
||||
fi
|
||||
POSITIONS=$(wc -l < test_positions.txt)
|
||||
echo "✓ Generated $POSITIONS positions"
|
||||
|
||||
# Step 2: Label positions (quick test with 100 positions)
|
||||
echo "Testing Stockfish labeling..."
|
||||
export STOCKFISH_PATH=$(which stockfish || which /usr/games/stockfish || echo "stockfish")
|
||||
if ! command -v $STOCKFISH_PATH &> /dev/null; then
|
||||
echo "ERROR: Stockfish not found"
|
||||
echo " Install: apt-get install stockfish (Linux) or brew install stockfish (Mac)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
head -100 test_positions.txt > test_positions_100.txt
|
||||
python label_positions.py test_positions_100.txt test_training_data.jsonl $STOCKFISH_PATH --depth 8
|
||||
|
||||
# Check output
|
||||
if [ ! -s test_training_data.jsonl ]; then
|
||||
echo "ERROR: test_training_data.jsonl is empty"
|
||||
echo " Run again with --verbose:"
|
||||
python label_positions.py test_positions_100.txt test_training_data.jsonl $STOCKFISH_PATH --depth 8 --verbose
|
||||
exit 1
|
||||
fi
|
||||
EVALS=$(wc -l < test_training_data.jsonl)
|
||||
echo "✓ Evaluated $EVALS positions"
|
||||
|
||||
# Step 3: Test training
|
||||
echo "Testing training..."
|
||||
python train_nnue.py test_training_data.jsonl test_weights.pt --epochs 1 --batch-size 32 --no-versioning
|
||||
|
||||
if [ ! -f test_weights.pt ]; then
|
||||
echo "ERROR: training failed"
|
||||
exit 1
|
||||
fi
|
||||
echo "✓ Training works"
|
||||
|
||||
echo ""
|
||||
echo "All tests passed! Pipeline is working correctly."
|
||||
echo "You can now run the full pipeline with:"
|
||||
echo " ./run_pipeline.sh"
|
||||
```
|
||||
|
||||
Save as `test_pipeline.sh` and run:
|
||||
|
||||
```bash
|
||||
chmod +x test_pipeline.sh
|
||||
./test_pipeline.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Common Error Messages
|
||||
|
||||
### "Stockfish not found at stockfish"
|
||||
|
||||
```bash
|
||||
# Set the full path
|
||||
export STOCKFISH_PATH=/usr/games/stockfish
|
||||
# Or on Windows:
|
||||
set STOCKFISH_PATH=C:\stockfish\stockfish.exe
|
||||
```
|
||||
|
||||
### "No such file or directory: positions.txt"
|
||||
|
||||
```bash
|
||||
# Make sure you're in the right directory
|
||||
cd modules/bot/python
|
||||
|
||||
# Or provide full path
|
||||
python label_positions.py /full/path/to/positions.txt training_data.jsonl stockfish
|
||||
```
|
||||
|
||||
### "JSONDecodeError" in training
|
||||
|
||||
```bash
|
||||
# training_data.jsonl has invalid JSON
|
||||
# Regenerate it:
|
||||
rm training_data.jsonl
|
||||
python label_positions.py positions.txt training_data.jsonl stockfish
|
||||
```
|
||||
|
||||
### "CUDA out of memory"
|
||||
|
||||
```bash
|
||||
# Reduce batch size
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --batch-size 1024
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Getting More Information
|
||||
|
||||
### Verbose Output
|
||||
|
||||
All scripts support `--verbose` for detailed debugging:
|
||||
|
||||
```bash
|
||||
python label_positions.py positions.txt training_data.jsonl stockfish --verbose
|
||||
```
|
||||
|
||||
This prints:
|
||||
- Which Stockfish is being used
|
||||
- Error details for each failed position
|
||||
- Summary of what passed/failed/skipped
|
||||
|
||||
### File Size Checks
|
||||
|
||||
```bash
|
||||
# Check all files
|
||||
ls -lh positions.txt training_data.jsonl nnue_weights.pt
|
||||
|
||||
# Count lines
|
||||
echo "Positions: $(wc -l < positions.txt)"
|
||||
echo "Training data: $(wc -l < training_data.jsonl)"
|
||||
```
|
||||
|
||||
### Quick Tests
|
||||
|
||||
```bash
|
||||
# Test position generation (100 games)
|
||||
python generate_positions.py test_pos.txt --games 100 --no-filter-captures
|
||||
|
||||
# Test Stockfish labeling (10 positions)
|
||||
head -10 test_pos.txt > test_pos_10.txt
|
||||
python label_positions.py test_pos_10.txt test_data_10.jsonl stockfish --depth 6
|
||||
|
||||
# Test training (on test data)
|
||||
python train_nnue.py test_data_10.jsonl test_model.pt --epochs 1 --batch-size 8
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pipeline Workflow with Debugging
|
||||
|
||||
```bash
|
||||
# 1. Generate positions
|
||||
python generate_positions.py positions.txt --games 100000 --no-filter-captures
|
||||
# Should output: Saved positions: ~20000-40000 (depends on filter)
|
||||
|
||||
# 2. Label with Stockfish
|
||||
export STOCKFISH_PATH=$(which stockfish)
|
||||
python label_positions.py positions.txt training_data.jsonl $STOCKFISH_PATH --depth 10
|
||||
# Should output: Successfully evaluated: > 0
|
||||
|
||||
# 3. Train model
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --epochs 5
|
||||
# Should output: Training summary with version info
|
||||
|
||||
# 4. Export to Scala
|
||||
python export_weights.py nnue_weights_v1.pt ../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights.scala
|
||||
# Should output: NNUEWeights.scala created
|
||||
|
||||
# 5. Compile Scala
|
||||
cd ../..
|
||||
./compile
|
||||
# Should output: BUILD SUCCESSFUL
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Monitoring
|
||||
|
||||
While labeling is running, monitor progress:
|
||||
|
||||
```bash
|
||||
# In another terminal
|
||||
watch -n 5 'wc -l modules/bot/python/training_data.jsonl'
|
||||
|
||||
# Or on macOS
|
||||
while true; do echo $(wc -l < modules/bot/python/training_data.jsonl) positions labeled; sleep 5; done
|
||||
```
|
||||
|
||||
This shows how many positions have been labeled so far; watching the count grow over time lets you estimate the evaluation rate (positions per second).
|
||||
|
||||
---
|
||||
|
||||
## Still Stuck?
|
||||
|
||||
1. **Read the full output** — Don't skip error messages
|
||||
2. **Check file sizes** — `ls -lh` shows if files are being created
|
||||
3. **Run with `--verbose`** — Shows exactly what's failing
|
||||
4. **Test individual steps** — Don't run full pipeline, test pieces
|
||||
5. **Check Stockfish** — `stockfish --version` confirms it works
|
||||
|
||||
For more help, see:
|
||||
- `README_NNUE.md` — Complete pipeline docs
|
||||
- `TRAINING_GUIDE.md` — Training workflows
|
||||
- `INCREMENTAL_TRAINING.md` — Versioning & checkpoints
|
||||
@@ -1,296 +0,0 @@
|
||||
# Incremental Training & Versioning: New Features
|
||||
|
||||
## Summary
|
||||
|
||||
`train_nnue.py` now supports:
|
||||
|
||||
✅ **Checkpoint Loading** — Resume from previous models
|
||||
✅ **Automatic Versioning** — v1, v2, v3... naming
|
||||
✅ **Metadata Tracking** — Date, positions, losses, depth
|
||||
✅ **CLI Arguments** — Full control via command line
|
||||
|
||||
---
|
||||
|
||||
## Feature 1: Automatic Checkpoint Detection
|
||||
|
||||
When you run training, the trainer automatically looks for and loads existing weights:
|
||||
|
||||
```bash
|
||||
# First run: nnue_weights.pt doesn't exist
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt
|
||||
# → Trains from scratch, saves as nnue_weights_v1.pt
|
||||
|
||||
# Second run: nnue_weights.pt exists (symlink to v1)
|
||||
python train_nnue.py training_data_bigger.jsonl nnue_weights.pt
|
||||
# → Auto-loads nnue_weights_v1.pt as checkpoint
|
||||
# → Continues training
|
||||
# → Saves as nnue_weights_v2.pt
|
||||
```
|
||||
|
||||
**No command-line flag needed** — automatic detection of existing weights!
|
||||
|
||||
---
|
||||
|
||||
## Feature 2: Explicit Checkpoint
|
||||
|
||||
Override auto-detection with `--checkpoint`:
|
||||
|
||||
```bash
|
||||
# Use v1 as starting point, ignore any other weights
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt \
|
||||
--checkpoint nnue_weights_v1.pt
|
||||
|
||||
# Or load from external checkpoint
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt \
|
||||
--checkpoint /path/to/backup_model.pt
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Feature 3: Automatic Versioning
|
||||
|
||||
Models are saved with version numbers:
|
||||
|
||||
**First run:**
|
||||
```
|
||||
nnue_weights_v1.pt ← Model weights
|
||||
nnue_weights_v1_metadata.json ← Training info
|
||||
```
|
||||
|
||||
**Second run:**
|
||||
```
|
||||
nnue_weights_v2.pt ← Model weights
|
||||
nnue_weights_v2_metadata.json ← Training info
|
||||
```
|
||||
|
||||
**Third run:**
|
||||
```
|
||||
nnue_weights_v3.pt
|
||||
nnue_weights_v3_metadata.json
|
||||
```
|
||||
|
||||
Disable with `--no-versioning`:
|
||||
```bash
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --no-versioning
|
||||
# → Saves directly to nnue_weights.pt (no version number)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Feature 4: Training Metadata
|
||||
|
||||
Each model save includes a JSON metadata file tracking:
|
||||
|
||||
```json
|
||||
{
|
||||
"version": 2,
|
||||
"date": "2026-04-07T15:30:45.123456",
|
||||
"num_positions": 1000000,
|
||||
"stockfish_depth": 12,
|
||||
"epochs": 20,
|
||||
"batch_size": 4096,
|
||||
"learning_rate": 0.001,
|
||||
"final_val_loss": 0.0234567,
|
||||
"device": "cuda",
|
||||
"checkpoint": "nnue_weights_v1.pt",
|
||||
"notes": "Win rate vs classical eval: TBD"
|
||||
}
|
||||
```
|
||||
|
||||
### Useful for:
|
||||
- **Tracking progress** — Compare val_loss across versions
|
||||
- **Reproducibility** — Know exactly how each model was trained
|
||||
- **Debugging** — Identify which positions/depth produced best results
|
||||
- **Benchmarking** — Record win rates (manually added to notes)
|
||||
|
||||
---
|
||||
|
||||
## Feature 5: CLI Arguments
|
||||
|
||||
Full control over training via command-line flags:
|
||||
|
||||
```bash
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt \
|
||||
--epochs 30 \
|
||||
--batch-size 2048 \
|
||||
--lr 5e-4 \
|
||||
--stockfish-depth 14 \
|
||||
--checkpoint nnue_weights_v1.pt
|
||||
```
|
||||
|
||||
**All flags:**
|
||||
- `--epochs` — Number of training passes (default: 20)
|
||||
- `--batch-size` — Samples per update (default: 4096)
|
||||
- `--lr` — Learning rate (default: 1e-3)
|
||||
- `--stockfish-depth` — Depth for metadata (default: 12)
|
||||
- `--checkpoint` — Resume from checkpoint (default: auto-detect)
|
||||
- `--no-versioning` — Disable versioning
|
||||
|
||||
---
|
||||
|
||||
## Workflow Examples
|
||||
|
||||
### Scenario 1: Continuous Improvement
|
||||
|
||||
```bash
|
||||
# Initial training: 500K positions
|
||||
./run_pipeline.sh
|
||||
# → nnue_weights_v1.pt created
|
||||
|
||||
# Add more positions (500K more)
|
||||
python label_positions.py positions_v2.txt training_data_v2.jsonl stockfish
|
||||
|
||||
# Combine and retrain
|
||||
cat training_data.jsonl training_data_v2.jsonl > all_data.jsonl
|
||||
python train_nnue.py all_data.jsonl nnue_weights.pt
|
||||
# → Loads v1, trains on all 1M positions
|
||||
# → nnue_weights_v2.pt created
|
||||
|
||||
# Export best version
|
||||
python export_weights.py nnue_weights_v2.pt ../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights.scala
|
||||
```
|
||||
|
||||
### Scenario 2: Hyperparameter Tuning
|
||||
|
||||
```bash
|
||||
# Baseline
|
||||
python train_nnue.py data.jsonl nnue_weights.pt
|
||||
# → v1 with default settings
|
||||
|
||||
# Try lower learning rate
|
||||
python train_nnue.py data.jsonl nnue_weights.pt --lr 5e-4
|
||||
# → v2 with lr=5e-4
|
||||
|
||||
# Try higher learning rate
|
||||
python train_nnue.py data.jsonl nnue_weights.pt --lr 2e-3
|
||||
# → v3 with lr=2e-3
|
||||
|
||||
# Compare metadata
|
||||
cat nnue_weights_v*_metadata.json | grep final_val_loss
|
||||
# → Pick the lowest loss
|
||||
```
|
||||
|
||||
### Scenario 3: Interrupted Training Resume
|
||||
|
||||
```bash
|
||||
# Start training
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --epochs 50
|
||||
# → Epoch 30 of 50, then crash/interrupt
|
||||
|
||||
# Resume: same command
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --epochs 50
|
||||
# → Auto-detects checkpoint, continues from epoch 30
|
||||
# → Completes to epoch 50
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Command-Line Help
|
||||
|
||||
View all options:
|
||||
|
||||
```bash
|
||||
python train_nnue.py --help
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
usage: train_nnue.py [-h] [--checkpoint CHECKPOINT] [--epochs EPOCHS]
|
||||
[--batch-size BATCH_SIZE] [--lr LR]
|
||||
[--stockfish-depth STOCKFISH_DEPTH] [--no-versioning]
|
||||
[data_file] [output_file]
|
||||
|
||||
Train NNUE neural network for chess evaluation
|
||||
|
||||
positional arguments:
|
||||
data_file Path to training_data.jsonl (default: training_data.jsonl)
|
||||
output_file Output file base name (default: nnue_weights.pt)
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--checkpoint CHECKPOINT
|
||||
Path to checkpoint file to resume training from (optional)
|
||||
--epochs EPOCHS Number of epochs to train (default: 20)
|
||||
--batch-size BATCH_SIZE
|
||||
Batch size (default: 4096)
|
||||
--lr LR Learning rate (default: 1e-3)
|
||||
--stockfish-depth STOCKFISH_DEPTH
|
||||
Stockfish depth used for evaluations (for metadata, default: 12)
|
||||
--no-versioning Disable automatic versioning (save directly to output file)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Differences from Previous Version
|
||||
|
||||
| Feature | Before | After |
|
||||
|---------|--------|-------|
|
||||
| Checkpoint support | ❌ No | ✅ Yes (auto + explicit) |
|
||||
| Versioning | ❌ Single file | ✅ v1, v2, v3... |
|
||||
| Metadata tracking | ❌ No | ✅ JSON with all info |
|
||||
| CLI arguments | ❌ Limited | ✅ Full argparse |
|
||||
| Resumed training | ❌ Always from scratch | ✅ Resume from checkpoint |
|
||||
| Training history | ❌ Lost | ✅ Tracked in metadata |
|
||||
|
||||
---
|
||||
|
||||
## Integration with Pipeline
|
||||
|
||||
The `run_pipeline.sh` and `run_pipeline.bat` scripts automatically use versioning:
|
||||
|
||||
```bash
|
||||
./run_pipeline.sh
|
||||
# First run:
|
||||
# - Generates data
|
||||
# - Trains model
|
||||
# - Creates nnue_weights_v1.pt + metadata
|
||||
# - Exports to NNUEWeights.scala
|
||||
|
||||
# Second run:
|
||||
# - Auto-detects v1, loads as checkpoint
|
||||
# - Continues training on all data
|
||||
# - Creates nnue_weights_v2.pt + metadata
|
||||
# - Exports updated NNUEWeights.scala
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Tips & Tricks
|
||||
|
||||
### List all versions with losses:
|
||||
|
||||
```bash
|
||||
for f in nnue_weights_v*_metadata.json; do
|
||||
version=$(grep version $f | head -1)
|
||||
loss=$(grep final_val_loss $f)
|
||||
echo "$version | $loss"
|
||||
done
|
||||
```
|
||||
|
||||
### Auto-export best version:
|
||||
|
||||
```bash
|
||||
# Find version with lowest loss
|
||||
BEST=$(for f in nnue_weights_v*_metadata.json; do
|
||||
echo "$f $(grep final_val_loss $f | cut -d: -f2)"
|
||||
done | sort -k2 -n | head -1 | cut -d_ -f3 | cut -d. -f1)
|
||||
|
||||
python export_weights.py nnue_weights_$BEST.pt ../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights.scala
|
||||
```
|
||||
|
||||
### Archive old versions:
|
||||
|
||||
```bash
|
||||
mkdir -p archive
|
||||
mv nnue_weights_v{1,2,3}.pt archive/
|
||||
mv nnue_weights_v{1,2,3}_metadata.json archive/
|
||||
# Keep only v4+
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## See Also
|
||||
|
||||
- `TRAINING_GUIDE.md` — Detailed examples and workflows
|
||||
- `README_NNUE.md` — Complete pipeline documentation
|
||||
- `train_nnue.py --help` — Command-line reference
|
||||
@@ -0,0 +1,129 @@
|
||||
# NNUE Python Pipeline
|
||||
|
||||
Central CLI for training and exporting chess evaluation neural networks (NNUE).
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
python/
|
||||
├── nnue.py # Main CLI entry point
|
||||
├── src/ # Python modules
|
||||
│ ├── generate.py # Generate random chess positions
|
||||
│ ├── label.py # Label positions with Stockfish
|
||||
│ ├── train.py # Train NNUE model
|
||||
│ └── export.py # Export weights to Scala
|
||||
├── data/ # Training data (gitignored)
|
||||
│ ├── positions.txt
|
||||
│ └── training_data.jsonl
|
||||
└── weights/ # Model weights (gitignored)
|
||||
├── nnue_weights_v1.pt
|
||||
├── nnue_weights_v1_metadata.json
|
||||
└── ...
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Train a new model (500k positions, auto-detect checkpoint)
|
||||
python nnue.py train
|
||||
|
||||
# Train from specific checkpoint
|
||||
python nnue.py train --from-checkpoint 2
|
||||
|
||||
# Train with custom games count
|
||||
python nnue.py train --games 200000
|
||||
|
||||
# Train with custom positions file
|
||||
python nnue.py train --positions-file my_positions.txt
|
||||
|
||||
# Export specific version to Scala
|
||||
python nnue.py export 2
|
||||
|
||||
# List all checkpoints
|
||||
python nnue.py list
|
||||
```
|
||||
|
||||
## CLI Commands
|
||||
|
||||
### `train` - Train NNUE model
|
||||
|
||||
```bash
|
||||
python nnue.py train [OPTIONS]
|
||||
```
|
||||
|
||||
**Options:**
|
||||
- `--from-checkpoint N` - Resume from checkpoint version N (default: uses latest)
|
||||
- `--games N` - Number of games to generate (default: 500000)
|
||||
- `--positions-file FILE` - Use existing positions file instead of generating
|
||||
- `--stockfish PATH` - Path to Stockfish binary (default: `$STOCKFISH_PATH` or `/usr/games/stockfish`)
|
||||
|
||||
**Examples:**
|
||||
```bash
|
||||
# Train with latest checkpoint
|
||||
python nnue.py train
|
||||
|
||||
# Train from v2 with 100k games
|
||||
python nnue.py train --from-checkpoint 2 --games 100000
|
||||
|
||||
# Train with custom positions
|
||||
python nnue.py train --positions-file my_games.txt --stockfish /opt/stockfish/sf15
|
||||
```
|
||||
|
||||
### `export` - Export weights to Scala
|
||||
|
||||
```bash
|
||||
python nnue.py export WEIGHTS [output_path]
|
||||
```
|
||||
|
||||
**Arguments:**
|
||||
- `WEIGHTS` - Version number (e.g., `2`) or full filename (e.g., `nnue_weights_v2.pt`)
|
||||
|
||||
**Examples:**
|
||||
```bash
|
||||
# Export version 2
|
||||
python nnue.py export 2
|
||||
|
||||
# Export with full filename
|
||||
python nnue.py export nnue_weights_v3.pt
|
||||
```
|
||||
|
||||
Output goes to `../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights_vN.scala`
|
||||
|
||||
### `list` - List available checkpoints
|
||||
|
||||
```bash
|
||||
python nnue.py list
|
||||
```
|
||||
|
||||
Shows all available model versions with file sizes.
|
||||
|
||||
## Data Flow
|
||||
|
||||
1. **Generate** → `data/positions.txt`
|
||||
- Random chess positions from 8-20 move openings
|
||||
- Filters out checks, game-over states, and captures
|
||||
|
||||
2. **Label** → `data/training_data.jsonl`
|
||||
- Evaluates each position with Stockfish at depth 12
|
||||
- Stores FEN + evaluation in JSONL format
|
||||
|
||||
3. **Train** → `weights/nnue_weights_vN.pt`
|
||||
- Trains neural network on labeled positions
|
||||
- Auto-versioning (v1, v2, v3, etc.)
|
||||
- Saves metadata alongside weights
|
||||
|
||||
4. **Export** → `NNUEWeights_vN.scala`
|
||||
- Converts weights to Scala object
|
||||
- Ready for integration into bot
|
||||
|
||||
## Versioning
|
||||
|
||||
- Models are automatically versioned (v1, v2, v3, etc.)
|
||||
- Each version gets a `_metadata.json` file with training info
|
||||
- Training from checkpoint uses latest version unless specified with `--from-checkpoint`
|
||||
|
||||
## Files
|
||||
|
||||
- `data/` and `weights/` are gitignored (local artifacts)
|
||||
- Documentation in `docs/` explains training, debugging, and incremental improvements
|
||||
- Source modules in `src/` are independent and can be imported for custom workflows
|
||||
@@ -1,173 +0,0 @@
|
||||
# NNUE Training Pipeline
|
||||
|
||||
This directory contains the complete NNUE (Efficiently Updatable Neural Network) training pipeline for the Now-Chess bot.
|
||||
|
||||
## Overview
|
||||
|
||||
The pipeline generates 500,000 random chess positions, evaluates them with Stockfish, trains a neural network, and exports the weights as Scala code for integration into the engine.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Install Python dependencies:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Ensure Stockfish is installed. You can:
|
||||
- Install via package manager: `apt-get install stockfish` (Linux) or `brew install stockfish` (macOS)
|
||||
- Or download from [stockfish.org](https://stockfishchess.org)
|
||||
|
||||
Set the Stockfish path:
|
||||
```bash
|
||||
export STOCKFISH_PATH=/path/to/stockfish
|
||||
```
|
||||
|
||||
## Pipeline Steps
|
||||
|
||||
### Quick Run
|
||||
|
||||
Run the entire pipeline:
|
||||
|
||||
```bash
|
||||
chmod +x run_pipeline.sh
|
||||
./run_pipeline.sh
|
||||
```
|
||||
|
||||
This automatically runs all 4 steps in sequence and confirms each succeeds before continuing.
|
||||
|
||||
### Individual Steps
|
||||
|
||||
#### Step 1: Generate Positions
|
||||
|
||||
Generate 500,000 random chess positions:
|
||||
|
||||
```bash
|
||||
python3 generate_positions.py positions.txt
|
||||
```
|
||||
|
||||
Output: `positions.txt` (one FEN per line)
|
||||
- Plays 8-20 random opening moves
|
||||
- Filters out checks, captures available, and game-over positions
|
||||
- Shows progress bar with tqdm
|
||||
|
||||
#### Step 2: Label with Stockfish
|
||||
|
||||
Evaluate each position with Stockfish at depth 12:
|
||||
|
||||
```bash
|
||||
export STOCKFISH_PATH=/path/to/stockfish
|
||||
python3 label_positions.py positions.txt training_data.jsonl $STOCKFISH_PATH
|
||||
```
|
||||
|
||||
Output: `training_data.jsonl` (one JSON per line)
|
||||
- Format: `{"fen": "...", "eval": 123}` (centipawns)
|
||||
- Evals clamped to [-2000, 2000] to avoid mate score outliers
|
||||
- Supports resuming if interrupted (checks for existing entries)
|
||||
- Shows progress bar with tqdm
|
||||
|
||||
**Note:** This step is slow (~24-36 hours for 500K positions at depth 12). You can reduce games or use lower depth for testing.
|
||||
|
||||
#### Step 3: Train NNUE Model
|
||||
|
||||
Train the neural network:
|
||||
|
||||
```bash
|
||||
python3 train_nnue.py training_data.jsonl nnue_weights.pt
|
||||
```
|
||||
|
||||
Output: `nnue_weights.pt` (PyTorch model weights)
|
||||
|
||||
Architecture:
|
||||
- Input: 768 binary features (12 piece types × 64 squares)
|
||||
- Hidden 1: 256 neurons + ReLU
|
||||
- Hidden 2: 32 neurons + ReLU
|
||||
- Output: 1 neuron (sigmoid applied to eval/400)
|
||||
|
||||
Training:
|
||||
- 20 epochs, batch size 4096, Adam optimizer (lr=1e-3)
|
||||
- 90% train / 10% validation split
|
||||
- Saves best weights by validation loss
|
||||
- Shows train/val loss per epoch
|
||||
|
||||
**Note:** Requires GPU for reasonable speed (~2-4 hours). CPU falls back to ~8-16 hours.
|
||||
|
||||
#### Step 4: Export to Scala
|
||||
|
||||
Export weights as Scala code:
|
||||
|
||||
```bash
|
||||
python3 export_weights.py nnue_weights.pt ../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights.scala
|
||||
```
|
||||
|
||||
Output: `NNUEWeights.scala`
|
||||
- Object with `val` arrays for each layer's weights and biases
|
||||
- Format: `Array[Float]` with precision sufficient for inference
|
||||
- Includes shape comments for reference
|
||||
|
||||
## Scala Integration
|
||||
|
||||
### Step 5: NNUE Evaluator
|
||||
|
||||
Create `NNUE.scala` in `src/main/scala/de/nowchess/bot/bots/nnue/`:
|
||||
|
||||
```scala
|
||||
package de.nowchess.bot.bots.nnue
|
||||
|
||||
class NNUE:
|
||||
// Load weights from NNUEWeights.scala
|
||||
// Convert Position to 768-feature vector
|
||||
// Run inference: l1→ReLU→l2→ReLU→l3
|
||||
// Return centipawn score
|
||||
```
|
||||
|
||||
### Step 6: Integration
|
||||
|
||||
Implement `NNUEBot` that uses the NNUE evaluator for move selection.
|
||||
|
||||
## File Reference
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `requirements.txt` | Python dependencies |
|
||||
| `generate_positions.py` | Step 1: Position generator |
|
||||
| `label_positions.py` | Step 2: Stockfish labeler |
|
||||
| `train_nnue.py` | Step 3: NNUE trainer |
|
||||
| `export_weights.py` | Step 4: Weight exporter |
|
||||
| `run_pipeline.sh` | Master script (runs steps 1-4) |
|
||||
| `positions.txt` | Output: Raw FENs (500K) |
|
||||
| `training_data.jsonl` | Output: FEN+eval pairs |
|
||||
| `nnue_weights.pt` | Output: Trained weights |
|
||||
| `../src/main/scala/.../NNUEWeights.scala` | Output: Scala weights |
|
||||
|
||||
## Tips
|
||||
|
||||
- **For testing:** Reduce `generate_positions.py` to 10,000 games for quick iteration
|
||||
- **Resume labeling:** Run step 2 again; it skips already-evaluated positions
|
||||
- **GPU acceleration:** Install CUDA for PyTorch to speed up training
|
||||
- **Stockfish tuning:** Lower depth (e.g., 8 instead of 12) for faster labeling
|
||||
- **Batch size:** Increase to 8192 for faster training if you have spare GPU memory; decrease (e.g., to 2048) if you hit out-of-memory errors
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**ImportError: No module named 'chess'**
|
||||
- Run: `pip install -r requirements.txt`
|
||||
|
||||
**Stockfish not found**
|
||||
- Check: `which stockfish` or set `export STOCKFISH_PATH=/full/path/to/stockfish`
|
||||
|
||||
**CUDA out of memory**
|
||||
- Reduce batch size in `train_nnue.py` (e.g., 2048)
|
||||
- Or use CPU: Remove CUDA check and device setup
|
||||
|
||||
**Training loss not decreasing**
|
||||
- Check data quality: Sample some entries from `training_data.jsonl`
|
||||
- Try adjusting the learning rate (e.g., lower to 5e-4 or raise to 1e-2) for experimentation
|
||||
- Verify Stockfish depth was sufficient (depth ≥ 10)
|
||||
|
||||
## References
|
||||
|
||||
- [NNUE Overview](https://www.chessprogramming.org/NNUE)
|
||||
- [python-chess](https://python-chess.readthedocs.io/)
|
||||
- [PyTorch](https://pytorch.org/)
|
||||
- [Stockfish](https://stockfishchess.org/)
|
||||
@@ -1,381 +0,0 @@
|
||||
# NNUE Training Guide: Incremental Training & Versioning
|
||||
|
||||
## Overview
|
||||
|
||||
The improved `train_nnue.py` now supports:
|
||||
1. **Incremental training** — Resume from checkpoint, continue training on new data
|
||||
2. **Automatic versioning** — Each training run saved as `nnue_weights_v{N}.pt`
|
||||
3. **Metadata tracking** — Date, positions, depth, losses stored in JSON
|
||||
4. **CLI flags** — Full control over training parameters
|
||||
|
||||
## Quick Start
|
||||
|
||||
### First Training Run (Fresh Start)
|
||||
|
||||
```bash
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt
|
||||
```
|
||||
|
||||
This saves:
|
||||
- `nnue_weights_v1.pt` — The trained weights
|
||||
- `nnue_weights_v1_metadata.json` — Training metadata
|
||||
|
||||
### Continue Training (Incremental)
|
||||
|
||||
Add more positions to `training_data.jsonl`, then:
|
||||
|
||||
```bash
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt
|
||||
```
|
||||
|
||||
The trainer will:
|
||||
1. Detect `nnue_weights.pt` exists
|
||||
2. Load it as a checkpoint automatically
|
||||
3. Continue training on all data
|
||||
4. Save as `nnue_weights_v2.pt` with updated metadata
|
||||
|
||||
Alternatively, specify a checkpoint explicitly:
|
||||
|
||||
```bash
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --checkpoint nnue_weights_v1.pt
|
||||
```
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Custom Training Parameters
|
||||
|
||||
```bash
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt \
|
||||
--epochs 30 \
|
||||
--batch-size 2048 \
|
||||
--lr 5e-4 \
|
||||
--stockfish-depth 14
|
||||
```
|
||||
|
||||
- `--epochs` — How many passes through the data (default: 20)
|
||||
- `--batch-size` — Samples per gradient update (default: 4096)
|
||||
- `--lr` — Learning rate (default: 1e-3)
|
||||
- `--stockfish-depth` — Depth of Stockfish evaluation (for metadata only)
|
||||
|
||||
### Explicit Checkpoint
|
||||
|
||||
Resume from a specific checkpoint (not `nnue_weights.pt`):
|
||||
|
||||
```bash
|
||||
python train_nnue.py training_data_v2.jsonl nnue_weights.pt \
|
||||
--checkpoint nnue_weights_v1.pt
|
||||
```
|
||||
|
||||
### Disable Versioning
|
||||
|
||||
Save directly to output file without versioning:
|
||||
|
||||
```bash
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --no-versioning
|
||||
```
|
||||
|
||||
This overwrites `nnue_weights.pt` instead of creating `nnue_weights_v2.pt`.
|
||||
|
||||
## Incremental Training Workflow
|
||||
|
||||
Typical workflow for improving the model over time:
|
||||
|
||||
**Step 1: Initial Training**
|
||||
```bash
|
||||
# Generate 500K positions with Stockfish
|
||||
./run_pipeline.sh
|
||||
|
||||
# This saves:
|
||||
# - nnue_weights_v1.pt
|
||||
# - nnue_weights_v1_metadata.json
|
||||
```
|
||||
|
||||
**Step 2: Generate More Positions**
|
||||
```bash
|
||||
# Later, generate 500K more positions
|
||||
# Append to training_data.jsonl or create new one
|
||||
|
||||
# Label with Stockfish at depth 16 (more thorough)
|
||||
python label_positions.py positions_batch2.txt training_data_batch2.jsonl stockfish --stockfish-depth 16
|
||||
|
||||
# Combine datasets
|
||||
cat training_data_batch1.jsonl training_data_batch2.jsonl > training_data_combined.jsonl
|
||||
```
|
||||
|
||||
**Step 3: Continue Training**
|
||||
```bash
|
||||
# Train on combined data, starting from v1 checkpoint
|
||||
python train_nnue.py training_data_combined.jsonl nnue_weights.pt
|
||||
|
||||
# Saves:
|
||||
# - nnue_weights_v2.pt (improved)
|
||||
# - nnue_weights_v2_metadata.json
|
||||
```
|
||||
|
||||
**Step 4: Benchmark & Choose**
|
||||
```bash
|
||||
# Test both versions in matches
|
||||
# If v2 is better, use it; otherwise keep v1
|
||||
|
||||
# Update NNUEWeights.scala with best version
|
||||
python export_weights.py nnue_weights_v2.pt ../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights.scala
|
||||
```
|
||||
|
||||
## Metadata File Format
|
||||
|
||||
Each training session generates a JSON metadata file, e.g., `nnue_weights_v2_metadata.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"version": 2,
|
||||
"date": "2026-04-07T21:45:30.123456",
|
||||
"num_positions": 1000000,
|
||||
"stockfish_depth": 12,
|
||||
"epochs": 20,
|
||||
"batch_size": 4096,
|
||||
"learning_rate": 0.001,
|
||||
"final_val_loss": 0.0234567,
|
||||
"device": "cuda",
|
||||
"checkpoint": "nnue_weights_v1.pt",
|
||||
"notes": "Win rate vs classical eval: TBD (requires benchmark games)"
|
||||
}
|
||||
```
|
||||
|
||||
### Fields
|
||||
|
||||
- **version**: Training version number (v1, v2, etc.)
|
||||
- **date**: ISO timestamp of training start
|
||||
- **num_positions**: Total positions in dataset
|
||||
- **stockfish_depth**: Depth of Stockfish evaluations (from command-line flag)
|
||||
- **epochs**: Number of training passes
|
||||
- **batch_size**: Training batch size
|
||||
- **learning_rate**: Adam optimizer learning rate
|
||||
- **final_val_loss**: Best validation loss achieved
|
||||
- **device**: GPU (cuda) or CPU used for training
|
||||
- **checkpoint**: Previous model used as starting point (null if from scratch)
|
||||
- **notes**: Win rate comparison (currently TBD — requires benchmark)
|
||||
|
||||
## Checkpoint Logic
|
||||
|
||||
When you run training, the trainer checks for checkpoints in this order:
|
||||
|
||||
1. **Explicit checkpoint** — If you provide `--checkpoint`, use it
|
||||
2. **Auto-detect** — If output file exists (e.g., `nnue_weights.pt`), load it
|
||||
3. **From scratch** — Otherwise, initialize with random weights
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
# First run: from scratch (no nnue_weights.pt exists)
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt
|
||||
# → Creates v1 from scratch, saves as nnue_weights_v1.pt
|
||||
|
||||
# Second run: auto-detect nnue_weights.pt as checkpoint
|
||||
python train_nnue.py training_data_bigger.jsonl nnue_weights.pt
|
||||
# → Loads nnue_weights_v1.pt (because nnue_weights.pt = v1), saves as v2
|
||||
|
||||
# Third run: explicit checkpoint
|
||||
python train_nnue.py training_data_huge.jsonl nnue_weights.pt --checkpoint nnue_weights_v2.pt
|
||||
# → Loads v2, saves as v3
|
||||
```
|
||||
|
||||
## Resuming Interrupted Training
|
||||
|
||||
If training is interrupted (power loss, ^C), you can resume:
|
||||
|
||||
```bash
|
||||
# Original command
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt
|
||||
|
||||
# If interrupted, the same command will:
|
||||
# 1. Detect nnue_weights_v1.pt exists (or a higher version)
|
||||
# 2. Auto-load it as checkpoint
|
||||
# 3. Resume training
|
||||
# 4. Save next version (v2, v3, etc.)
|
||||
```
|
||||
|
||||
## Performance Tips
|
||||
|
||||
### Reduce Training Time
|
||||
|
||||
```bash
|
||||
# Smaller batch size = slower but less memory
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --batch-size 1024
|
||||
|
||||
# Fewer epochs
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --epochs 5
|
||||
|
||||
# Lower learning rate = slower convergence but more stable
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --lr 5e-4
|
||||
```
|
||||
|
||||
### Accelerate on GPU
|
||||
|
||||
If you have NVIDIA GPU with CUDA:
|
||||
|
||||
```bash
|
||||
# Training will automatically use CUDA
|
||||
# Check metadata device field: should be "cuda" not "cpu"
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt
|
||||
```
|
||||
|
||||
If training uses CPU but GPU is available:
|
||||
```bash
|
||||
# Reinstall PyTorch with CUDA
|
||||
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
||||
```
|
||||
|
||||
### Efficient Incremental Training
|
||||
|
||||
```bash
|
||||
# Fine-tune v1 on slightly different data (few epochs, reduced learning rate)
|
||||
python train_nnue.py new_positions.jsonl nnue_weights.pt \
|
||||
--checkpoint nnue_weights_v1.pt \
|
||||
--epochs 3 \
|
||||
--lr 5e-4
|
||||
|
||||
# Full retraining on combined data (slower, better)
|
||||
python train_nnue.py all_positions.jsonl nnue_weights.pt \
|
||||
--checkpoint nnue_weights_v1.pt \
|
||||
--epochs 20 \
|
||||
--lr 1e-3
|
||||
```
|
||||
|
||||
## Version Management
|
||||
|
||||
### List All Versions
|
||||
|
||||
```bash
|
||||
ls -la nnue_weights_v*.pt
|
||||
ls -la nnue_weights_v*_metadata.json
|
||||
```
|
||||
|
||||
### Compare Versions
|
||||
|
||||
```bash
|
||||
cat nnue_weights_v1_metadata.json | grep "final_val_loss"
|
||||
cat nnue_weights_v2_metadata.json | grep "final_val_loss"
|
||||
cat nnue_weights_v3_metadata.json | grep "final_val_loss"
|
||||
```
|
||||
|
||||
Lower val loss = better model.
|
||||
|
||||
### Benchmark Best Version
|
||||
|
||||
After training multiple versions, benchmark them:
|
||||
|
||||
```bash
|
||||
# Export v1 and play some games
|
||||
python export_weights.py nnue_weights_v1.pt ../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights.scala
|
||||
./compile && ./test
|
||||
|
||||
# Export v2 and benchmark
|
||||
python export_weights.py nnue_weights_v2.pt ../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights.scala
|
||||
./compile && ./test
|
||||
|
||||
# Keep the best, archive others
|
||||
```
|
||||
|
||||
### Archive Old Versions
|
||||
|
||||
```bash
|
||||
# Keep only recent versions
|
||||
mkdir -p old_models
|
||||
mv nnue_weights_v1.pt old_models/
|
||||
mv nnue_weights_v1_metadata.json old_models/
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "FileNotFoundError: training_data.jsonl not found"
|
||||
|
||||
```bash
|
||||
# Make sure you're in the python/ directory
|
||||
cd modules/bot/python
|
||||
|
||||
# Or provide full path
|
||||
python train_nnue.py /full/path/to/training_data.jsonl nnue_weights.pt
|
||||
```
|
||||
|
||||
### "CUDA out of memory"
|
||||
|
||||
Reduce batch size:
|
||||
|
||||
```bash
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --batch-size 2048
|
||||
```
|
||||
|
||||
### Training seems slow (using CPU not GPU)
|
||||
|
||||
```bash
|
||||
# Check metadata of a training run
|
||||
cat nnue_weights_v1_metadata.json | grep device
|
||||
|
||||
# If "cpu", reinstall PyTorch with CUDA support
|
||||
pip install torch --index-url https://download.pytorch.org/whl/cu118
|
||||
```
|
||||
|
||||
### "checkpoint file corrupted"
|
||||
|
||||
```bash
|
||||
# Start over from scratch (don't load corrupted checkpoint)
|
||||
python train_nnue.py training_data.jsonl nnue_weights_fresh.pt --no-versioning
|
||||
|
||||
# Or resume from earlier version
|
||||
python train_nnue.py training_data.jsonl nnue_weights.pt --checkpoint nnue_weights_v1.pt
|
||||
```
|
||||
|
||||
## Integration with Pipeline
|
||||
|
||||
The `run_pipeline.sh` script now supports incremental training:
|
||||
|
||||
```bash
|
||||
# First run: generates data, trains v1
|
||||
./run_pipeline.sh
|
||||
|
||||
# Add more positions
|
||||
# ... generate more, label more ...
|
||||
|
||||
# Second run: trains on combined data as v2
|
||||
./run_pipeline.sh
|
||||
```
|
||||
|
||||
## Example: Full Workflow
|
||||
|
||||
```bash
|
||||
cd modules/bot/python
|
||||
|
||||
# Session 1: Initial training
|
||||
chmod +x run_pipeline.sh
|
||||
export STOCKFISH_PATH=/usr/bin/stockfish
|
||||
./run_pipeline.sh
|
||||
# Creates: nnue_weights_v1.pt, nnue_weights_v1_metadata.json
|
||||
|
||||
# Session 2: Improve with deeper analysis
|
||||
# (manually evaluate more positions at depth 14)
|
||||
python label_positions.py positions_v2.txt training_data_v2.jsonl \
|
||||
/usr/bin/stockfish --stockfish-depth 14
|
||||
|
||||
# Combine and retrain
|
||||
cat training_data_v1.jsonl training_data_v2.jsonl > training_data_combined.jsonl
|
||||
|
||||
python train_nnue.py training_data_combined.jsonl nnue_weights.pt \
|
||||
--epochs 25 \
|
||||
--stockfish-depth 14
|
||||
# Creates: nnue_weights_v2.pt, nnue_weights_v2_metadata.json
|
||||
|
||||
# Session 3: Benchmark and choose
|
||||
# Test both v1 and v2 with matches...
|
||||
# If v2 is better, export and use
|
||||
python export_weights.py nnue_weights_v2.pt \
|
||||
../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights.scala
|
||||
|
||||
cd ../..
|
||||
./compile && ./test
|
||||
```
|
||||
|
||||
## See Also
|
||||
|
||||
- `train_nnue.py --help` — Command-line help
|
||||
- `README_NNUE.md` — Complete pipeline documentation
|
||||
- `NNUE_IMPLEMENTATION_SUMMARY.md` — Technical architecture
|
||||
@@ -1,64 +0,0 @@
|
||||
#!/usr/bin/env python3
"""Export NNUE weights to Scala code.

Reads a PyTorch state dict (a ``.pt`` file) and writes a Scala 3 object
whose members are flat ``Array`` literals, one per layer, with the
original tensor shape noted in a trailing comment.

Usage:
    python export_weights.py [weights.pt] [Output.scala]
"""

import sys
from pathlib import Path

import torch

# Values are written this many per line so the generated file stays
# readable (and diffable) despite its size.
_CHUNK_SIZE = 16


def export_weights_to_scala(weights_file, output_file):
    """Load PyTorch weights from *weights_file* and write Scala source to *output_file*.

    Exits the process with status 1 if *weights_file* does not exist.
    Parent directories of *output_file* are created as needed.
    """
    if not Path(weights_file).exists():
        print(f"Error: Weights file not found at {weights_file}")
        sys.exit(1)

    # map_location='cpu' so weights trained on a GPU load on any machine.
    state_dict = torch.load(weights_file, map_location='cpu')

    # Create output directory if needed.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w') as f:
        f.write("package de.nowchess.bot.bots.nnue\n\n")
        f.write("object NNUEWeights:\n")

        # Sorted iteration keeps the generated file stable across runs.
        for layer_name, tensor in sorted(state_dict.items()):
            # '.' and ' ' are not legal in Scala identifiers.
            safe_name = layer_name.replace('.', '_').replace(' ', '_')

            # Convert tensor to a flat Python list of floats.
            values = tensor.flatten().tolist()

            f.write(f"\n  val {safe_name} = Array(\n")

            # Write values in chunks for readability.
            for start in range(0, len(values), _CHUNK_SIZE):
                chunk = values[start:start + _CHUNK_SIZE]
                formatted_chunk = ", ".join(f"{v:.10g}f" for v in chunk)
                f.write(f"    {formatted_chunk}")
                # Comma between chunks; plain newline after the last one.
                f.write(",\n" if start + _CHUNK_SIZE < len(values) else "\n")

            f.write("  )\n")

            # Record the original shape so the Scala side can validate.
            f.write(f"  // Shape: {list(tensor.shape)}\n")

    print(f"Weights exported to {output_file}")


if __name__ == "__main__":
    weights_file = sys.argv[1] if len(sys.argv) > 1 else "nnue_weights.pt"
    output_file = (
        sys.argv[2]
        if len(sys.argv) > 2
        else "../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights.scala"
    )
    export_weights_to_scala(weights_file, output_file)
|
||||
+59
-32
@@ -13,44 +13,63 @@ def get_python_cmd():
|
||||
return "python"
|
||||
return "python3" if os.popen("which python3 2>/dev/null").read() else "python"
|
||||
|
||||
def get_src_module(module_name):
    """Return the path of ``src/<module_name>.py`` next to this script."""
    src_dir = Path(__file__).parent / "src"
    return src_dir / (module_name + ".py")
|
||||
|
||||
def get_data_dir():
    """Return the ``data/`` directory beside this script, creating it if absent."""
    path = Path(__file__).with_name("data")
    path.mkdir(exist_ok=True)
    return path
||||
|
||||
def get_weights_dir():
    """Return the ``weights/`` directory beside this script, creating it if absent."""
    path = Path(__file__).with_name("weights")
    path.mkdir(exist_ok=True)
    return path
|
||||
|
||||
def list_checkpoints():
|
||||
"""List available checkpoint versions."""
|
||||
checkpoints = sorted(Path(".").glob("nnue_weights_v*.pt"))
|
||||
weights_dir = get_weights_dir()
|
||||
checkpoints = sorted(weights_dir.glob("nnue_weights_v*.pt"))
|
||||
if not checkpoints:
|
||||
return []
|
||||
return [int(cp.stem.split("_v")[1]) for cp in checkpoints]
|
||||
|
||||
def run_generate_positions(num_games):
|
||||
"""Generate random positions."""
|
||||
positions_file = "positions.txt"
|
||||
data_dir = get_data_dir()
|
||||
positions_file = data_dir / "positions.txt"
|
||||
print(f"Generating {num_games} positions...")
|
||||
result = subprocess.run(
|
||||
[get_python_cmd(), "generate_positions.py", positions_file, "--games", str(num_games)],
|
||||
[get_python_cmd(), str(get_src_module("generate")), str(positions_file), "--games", str(num_games)],
|
||||
capture_output=False
|
||||
)
|
||||
if result.returncode != 0:
|
||||
print("ERROR: Position generation failed")
|
||||
return False
|
||||
return Path(positions_file).exists()
|
||||
return positions_file.exists()
|
||||
|
||||
def run_label_positions(stockfish_path):
|
||||
"""Label positions with Stockfish."""
|
||||
positions_file = "positions.txt"
|
||||
output_file = "training_data.jsonl"
|
||||
data_dir = get_data_dir()
|
||||
positions_file = data_dir / "positions.txt"
|
||||
output_file = data_dir / "training_data.jsonl"
|
||||
|
||||
if not Path(positions_file).exists():
|
||||
if not positions_file.exists():
|
||||
print("ERROR: positions.txt not found")
|
||||
return False
|
||||
|
||||
print("Labeling positions with Stockfish...")
|
||||
result = subprocess.run(
|
||||
[get_python_cmd(), "label_positions.py", positions_file, output_file, stockfish_path],
|
||||
[get_python_cmd(), str(get_src_module("label")), str(positions_file), str(output_file), stockfish_path],
|
||||
capture_output=False
|
||||
)
|
||||
if result.returncode != 0:
|
||||
print("ERROR: Position labeling failed")
|
||||
return False
|
||||
return Path(output_file).exists()
|
||||
return output_file.exists()
|
||||
|
||||
def run_train(positions_file, output_weights, from_checkpoint=None):
|
||||
"""Train NNUE model."""
|
||||
@@ -58,29 +77,34 @@ def run_train(positions_file, output_weights, from_checkpoint=None):
|
||||
print(f"ERROR: {positions_file} not found")
|
||||
return False
|
||||
|
||||
weights_dir = get_weights_dir()
|
||||
print(f"Training model (output: {output_weights})...")
|
||||
if from_checkpoint:
|
||||
print(f" Starting from checkpoint: {from_checkpoint}")
|
||||
|
||||
cmd = [get_python_cmd(), "train_nnue.py", positions_file, output_weights]
|
||||
cmd = [get_python_cmd(), str(get_src_module("train")), str(positions_file), str(output_weights)]
|
||||
if from_checkpoint:
|
||||
cmd.extend(["--checkpoint", from_checkpoint])
|
||||
cmd.extend(["--checkpoint", str(from_checkpoint)])
|
||||
|
||||
result = subprocess.run(cmd, capture_output=False)
|
||||
# Run from weights directory so outputs save there
|
||||
result = subprocess.run(cmd, cwd=str(weights_dir), capture_output=False)
|
||||
if result.returncode != 0:
|
||||
print("ERROR: Training failed")
|
||||
return False
|
||||
return True # train_nnue creates versioned file, not the base name
|
||||
return True
|
||||
|
||||
def run_export(weights_file, output_file):
|
||||
"""Export weights to Scala."""
|
||||
if not Path(weights_file).exists():
|
||||
print(f"ERROR: {weights_file} not found")
|
||||
weights_dir = get_weights_dir()
|
||||
weights_path = weights_dir / Path(weights_file).name
|
||||
|
||||
if not weights_path.exists():
|
||||
print(f"ERROR: {weights_file} not found in {weights_dir}")
|
||||
return False
|
||||
|
||||
print(f"Exporting {weights_file} to Scala...")
|
||||
result = subprocess.run(
|
||||
[get_python_cmd(), "export_weights.py", weights_file, output_file],
|
||||
[get_python_cmd(), str(get_src_module("export")), str(weights_path), output_file],
|
||||
capture_output=False
|
||||
)
|
||||
if result.returncode != 0:
|
||||
@@ -91,13 +115,16 @@ def run_export(weights_file, output_file):
|
||||
def cmd_train(args):
|
||||
"""Handle train command."""
|
||||
stockfish_path = args.stockfish or os.environ.get("STOCKFISH_PATH", "/usr/games/stockfish")
|
||||
data_dir = get_data_dir()
|
||||
weights_dir = get_weights_dir()
|
||||
|
||||
# Determine checkpoint
|
||||
checkpoint = None
|
||||
if args.from_checkpoint:
|
||||
checkpoint_version = args.from_checkpoint
|
||||
checkpoint = f"nnue_weights_v{checkpoint_version}.pt"
|
||||
if not Path(checkpoint).exists():
|
||||
checkpoint_path = weights_dir / checkpoint
|
||||
if not checkpoint_path.exists():
|
||||
print(f"ERROR: Checkpoint {checkpoint} not found")
|
||||
return False
|
||||
else:
|
||||
@@ -109,12 +136,12 @@ def cmd_train(args):
|
||||
|
||||
# Generate or use existing positions
|
||||
if args.positions_file:
|
||||
if not Path(args.positions_file).exists():
|
||||
positions_file = Path(args.positions_file)
|
||||
if not positions_file.exists():
|
||||
print(f"ERROR: {args.positions_file} not found")
|
||||
return False
|
||||
positions_file = args.positions_file
|
||||
else:
|
||||
positions_file = "positions.txt"
|
||||
positions_file = data_dir / "positions.txt"
|
||||
num_games = args.games or 500000
|
||||
if not run_generate_positions(num_games):
|
||||
return False
|
||||
@@ -125,8 +152,9 @@ def cmd_train(args):
|
||||
|
||||
print("\nStarting training...")
|
||||
|
||||
# Train (train_nnue.py handles versioning internally)
|
||||
if not run_train("training_data.jsonl", "nnue_weights.pt", checkpoint):
|
||||
# Train with absolute path to data, checkpoint is relative to weights dir
|
||||
training_data = str(data_dir / "training_data.jsonl")
|
||||
if not run_train(training_data, "nnue_weights.pt", checkpoint):
|
||||
return False
|
||||
|
||||
# Show created version
|
||||
@@ -143,13 +171,8 @@ def cmd_export(args):
|
||||
if not weights_file.endswith(".pt"):
|
||||
weights_file = f"nnue_weights_v{weights_file}.pt"
|
||||
|
||||
if not Path(weights_file).exists():
|
||||
print(f"ERROR: {weights_file} not found")
|
||||
return False
|
||||
|
||||
# Determine version from filename
|
||||
version = Path(weights_file).stem.split("_v")[1] if "_v" in weights_file else "1"
|
||||
output_file = f"../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights_v{version}.scala"
|
||||
# Output to resources directory as binary format
|
||||
output_file = str(Path(__file__).parent.parent / "src" / "main" / "resources" / "nnue_weights.bin")
|
||||
|
||||
if not run_export(weights_file, output_file):
|
||||
return False
|
||||
@@ -164,11 +187,15 @@ def cmd_list(args):
|
||||
print("No checkpoints found")
|
||||
return True
|
||||
|
||||
weights_dir = get_weights_dir()
|
||||
print("Available checkpoints:")
|
||||
for v in available:
|
||||
weights_file = f"nnue_weights_v{v}.pt"
|
||||
size = Path(weights_file).stat().st_size / (1024**2) # MB
|
||||
print(f" v{v} ({size:.1f} MB)")
|
||||
weights_file = weights_dir / f"nnue_weights_v{v}.pt"
|
||||
if weights_file.exists():
|
||||
size = weights_file.stat().st_size / (1024**2) # MB
|
||||
print(f" v{v} ({size:.1f} MB)")
|
||||
else:
|
||||
print(f" v{v} (file not found)")
|
||||
return True
|
||||
|
||||
def main():
|
||||
|
||||
Binary file not shown.
@@ -1,6 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
# NNUE Training Pipeline (bash version)
|
||||
# Uses the central CLI (nnue.py) for all operations
|
||||
# Works on Linux, macOS, and Windows (with Git Bash or WSL)
|
||||
|
||||
set -e # Exit on error
|
||||
@@ -20,56 +21,16 @@ echo "Python command: $PYTHON_CMD"
|
||||
echo "Working directory: $SCRIPT_DIR"
|
||||
echo ""
|
||||
|
||||
# Step 1: Generate positions
|
||||
echo "Step 1: Generating 500,000 random positions..."
|
||||
$PYTHON_CMD generate_positions.py positions.txt
|
||||
if [ ! -f positions.txt ]; then
|
||||
echo "ERROR: positions.txt not created"
|
||||
# Run the unified training pipeline
|
||||
$PYTHON_CMD nnue.py train
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo ""
|
||||
echo "ERROR: Training pipeline failed"
|
||||
exit 1
|
||||
fi
|
||||
echo "✓ Positions generated"
|
||||
|
||||
echo ""
|
||||
|
||||
# Step 2: Label positions with Stockfish
|
||||
echo "Step 2: Labeling positions with Stockfish (depth 12)..."
|
||||
STOCKFISH_PATH="${STOCKFISH_PATH:-/usr/games/stockfish}"
|
||||
echo "Using Stockfish: $STOCKFISH_PATH"
|
||||
$PYTHON_CMD label_positions.py positions.txt training_data.jsonl "$STOCKFISH_PATH"
|
||||
if [ ! -f training_data.jsonl ]; then
|
||||
echo "ERROR: training_data.jsonl not created"
|
||||
exit 1
|
||||
fi
|
||||
echo "✓ Positions labeled"
|
||||
echo ""
|
||||
|
||||
# Step 3: Train NNUE model with versioning
|
||||
echo "Step 3: Training NNUE model (20 epochs)..."
|
||||
|
||||
# Auto-detect latest version and increment
|
||||
LATEST_VERSION=$(ls -1 nnue_weights_v*.pt 2>/dev/null | sed 's/nnue_weights_v//;s/.pt$//' | sort -n | tail -1)
|
||||
NEW_VERSION=$((${LATEST_VERSION:-0} + 1))
|
||||
WEIGHTS_FILE="nnue_weights_v${NEW_VERSION}.pt"
|
||||
|
||||
echo "Creating version v${NEW_VERSION}..."
|
||||
$PYTHON_CMD train_nnue.py training_data.jsonl "$WEIGHTS_FILE"
|
||||
if [ ! -f "$WEIGHTS_FILE" ]; then
|
||||
echo "ERROR: $WEIGHTS_FILE not created"
|
||||
exit 1
|
||||
fi
|
||||
echo "✓ Model trained: $WEIGHTS_FILE"
|
||||
echo ""
|
||||
|
||||
# Step 4: Export weights to Scala
|
||||
echo "Step 4: Exporting weights to Scala..."
|
||||
SCALA_FILE="../src/main/scala/de/nowchess/bot/bots/nnue/NNUEWeights_v${NEW_VERSION}.scala"
|
||||
$PYTHON_CMD export_weights.py "$WEIGHTS_FILE" "$SCALA_FILE"
|
||||
if [ ! -f "$SCALA_FILE" ]; then
|
||||
echo "ERROR: $SCALA_FILE not created"
|
||||
exit 1
|
||||
fi
|
||||
echo "✓ Weights exported: $SCALA_FILE"
|
||||
echo ""
|
||||
|
||||
echo "=== Pipeline Complete ==="
|
||||
echo ""
|
||||
echo "Next steps:"
|
||||
|
||||
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env python3
"""Export NNUE weights to binary format for runtime loading.

File layout (integers are little-endian uint32):
    magic   b'NNUE'
    version 1
    then, for each layer in the fixed order
    (l1.weight, l1.bias, l2.weight, l2.bias, l3.weight, l3.bias):
        ndim, ndim dimension sizes, then the flattened values as
        little-endian float32.

Usage:
    python export_weights_binary.py [weights.pt] [output.bin]
"""

import struct
import sys
from pathlib import Path

import torch

# Layers are written in this fixed order; a missing layer aborts the export.
_LAYERS = ('l1.weight', 'l1.bias', 'l2.weight', 'l2.bias', 'l3.weight', 'l3.bias')


def export_weights_to_binary(weights_file, output_file):
    """Load PyTorch weights from *weights_file* and write them to *output_file*.

    Exits the process with status 1 if the weights file is missing or any
    expected layer is absent from the state dict. Parent directories of
    *output_file* are created as needed.
    """
    if not Path(weights_file).exists():
        print(f"Error: Weights file not found at {weights_file}")
        sys.exit(1)

    # map_location='cpu' so GPU-trained weights load anywhere.
    state_dict = torch.load(weights_file, map_location='cpu')

    # Debug aid: show what the checkpoint actually contains.
    print(f"Available layers in {weights_file}:")
    for key in sorted(state_dict.keys()):
        print(f"  {key}: {state_dict[key].shape}")

    # Create output directory if needed.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'wb') as f:
        # Header: magic number and format version.
        f.write(b'NNUE')
        f.write(struct.pack('<I', 1))  # version 1

        for layer_name in _LAYERS:
            if layer_name not in state_dict:
                print(f"Error: Missing layer {layer_name}")
                sys.exit(1)

            tensor = state_dict[layer_name]
            # Explicit little-endian float32. tobytes() writes the buffer
            # directly instead of expanding millions of floats into
            # Python-level struct.pack arguments.
            data = tensor.float().flatten().cpu().numpy().astype('<f4')

            # Shape header allows validation on load.
            shape = list(tensor.shape)
            f.write(struct.pack('<I', len(shape)))
            for dim in shape:
                f.write(struct.pack('<I', dim))

            f.write(data.tobytes())

            print(f"  {layer_name}: shape {shape}, {len(data)} floats")

    file_size_mb = output_path.stat().st_size / (1024 ** 2)
    print(f"Weights exported to {output_file} ({file_size_mb:.2f} MB)")


if __name__ == "__main__":
    weights_file = sys.argv[1] if len(sys.argv) > 1 else "nnue_weights.pt"
    output_file = (
        sys.argv[2]
        if len(sys.argv) > 2
        else "../src/main/resources/nnue_weights.bin"
    )
    export_weights_to_binary(weights_file, output_file)
|
||||
Binary file not shown.
+3
-3
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"version": 1,
|
||||
"date": "2026-04-07T22:37:15.093371",
|
||||
"num_positions": 1223,
|
||||
"date": "2026-04-07T22:56:23.259658",
|
||||
"num_positions": 2086,
|
||||
"stockfish_depth": 12,
|
||||
"epochs": 20,
|
||||
"batch_size": 4096,
|
||||
"learning_rate": 0.001,
|
||||
"final_val_loss": 0.0162429828196764,
|
||||
"final_val_loss": 0.016311248764395714,
|
||||
"device": "cuda",
|
||||
"checkpoint": null,
|
||||
"notes": "Win rate vs classical eval: TBD (requires benchmark games)"
|
||||
Reference in New Issue
Block a user