feat: Implement dataset versioning and management for NNUE training data

This commit is contained in:
2026-04-13 21:19:26 +02:00
parent 4b52199754
commit 8fb872e958
18 changed files with 1399 additions and 335 deletions
@@ -17,7 +17,7 @@ from generate import play_random_game_and_collect_positions
def download_and_extract_puzzle_db(
url: str = 'https://database.lichess.org/lichess_db_puzzle.csv.zst',
output_dir: str = 'trainingdata'
output_dir: str = 'tactical_data'
):
"""Download and extract the Lichess puzzle database."""
output_path = Path(output_dir)
@@ -141,6 +141,31 @@ def merge_positions(
print(f"{'='*60}\n")
def extract_tactical_only(
    puzzle_csv: str,
    output_file: str,
    max_puzzles: int = 300_000
) -> int:
    """Extract tactical positions and save them to a file (no merge prompts).

    The positions returned by ``extract_puzzle_positions`` are written to
    ``output_file`` one per line, overwriting any existing file.

    Args:
        puzzle_csv: Path to Lichess puzzle CSV
        output_file: Where to save the FEN positions; missing parent
            directories are created.
        max_puzzles: Maximum puzzles to extract (forwarded unchanged to
            ``extract_puzzle_positions``)

    Returns:
        Number of positions extracted
    """
    print("Extracting tactical positions from puzzle database...")
    tactical_positions = extract_puzzle_positions(puzzle_csv, max_puzzles)

    destination = Path(output_file)
    # A fresh dataset directory may not exist yet; create it instead of
    # failing with FileNotFoundError after the extraction work is done.
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Pin the encoding so the output does not depend on the platform locale.
    with destination.open('w', encoding='utf-8') as f:
        f.writelines(fen + '\n' for fen in tactical_positions)

    return len(tactical_positions)
def interactive_merge_positions(
puzzle_csv: str,
output_file: str = 'position.txt',