"""MAP-NEO Mini configuration and setup.

Defines the training and data-preprocessing configuration and provides helper
functions that scaffold the project: directory layout, JSON config files,
``requirements.txt`` and a ``run_training.py`` pipeline script.

All paths are relative to the current working directory.
"""

import json
from pathlib import Path
from string import Template

# Training configuration optimized for RTX 5070 8GB
# NOTE(review): model "max_seq_len" (2048) is larger than the data
# "seq_length" (1024) -- presumably intentional headroom; confirm.
TRAINING_CONFIG = {
    "model": {
        "vocab_size": 50257,
        "max_seq_len": 2048,
        "dim": 1024,
        "n_layers": 16,
        "n_heads": 16,
        "hidden_dim": 2736,
        "dropout": 0.0,
    },
    "training": {
        "batch_size": 1,
        "gradient_accumulation_steps": 32,
        "max_steps": 50000,
        "warmup_steps": 2000,
        "learning_rate": 3e-4,
        "weight_decay": 0.01,
        "grad_clip": 1.0,
        "mixed_precision": "bf16",
        "gradient_checkpointing": True,
    },
    "data": {
        "seq_length": 1024,
        "data_path": "data/tokens/packed_1024.txt",
    },
    "hardware": {
        "device": "cuda",
        "compile_model": False,
    },
    "logging": {
        "log_interval": 10,
        "save_interval": 2000,
        "output_dir": "checkpoints",
    },
}

# Data preprocessing configuration
DATA_CONFIG = {
    "num_docs": 20000,  # Start with 20k documents
    "seq_length": 1024,
    "tokenizer": "gpt2",  # Will switch to MAP-NEO tokenizer later
    "output_dir": "data",
    "min_text_length": 50,  # Filter out very short texts
    "max_text_length": 10000,  # Filter out very long texts
}

# Directories created by setup_project().
_PROJECT_DIRECTORIES = (
    "data/shards",
    "data/processed",
    "data/tokens",
    "checkpoints",
    "configs",
    "logs",
    "notebooks",
)

# Package pins written by create_requirements_txt().
_REQUIREMENTS = (
    "torch>=2.0.0",
    "transformers>=4.35.0",
    "tokenizers>=0.14.0",
    "datasets>=2.14.0",
    "accelerate>=0.24.0",
    "sentencepiece>=0.1.99",
    "langdetect>=1.0.9",
    "zstandard>=0.21.0",
    "tqdm>=4.65.0",
    "numpy>=1.24.0",
    "matplotlib>=3.6.0",
    "tensorboard>=2.14.0",
)

# Source of the generated run_training.py. The $data_path, $num_docs and
# $seq_length placeholders are filled in by create_run_script() so the script
# cannot drift from the configuration above. "\\n" is doubled on purpose: the
# generated file must contain the two-character escape, not a real newline.
_RUN_SCRIPT_TEMPLATE = Template('''#!/usr/bin/env python3
# Run MAP-NEO Mini training pipeline
import subprocess
import sys
from pathlib import Path

DATA_PATH = Path($data_path)
NUM_DOCS = $num_docs
SEQ_LENGTH = $seq_length


def run_command(cmd, description):
    """Run a command (argument list), stream its output, exit on failure"""
    print(f"\\n{'='*50}")
    print(f"Running: {description}")
    print(f"Command: {' '.join(cmd)}")
    print(f"{'='*50}", flush=True)

    # Output is inherited instead of captured so long-running steps such as
    # training show their progress live rather than only after they finish.
    result = subprocess.run(cmd)

    if result.returncode != 0:
        print(f"Error in {description} (exit code {result.returncode})")
        sys.exit(1)
    print(f"Success: {description}")


def main():
    print("MAP-NEO Mini Training Pipeline")
    print("Optimized for RTX 5070 8GB VRAM")

    # Step 1: Data preprocessing
    if not DATA_PATH.exists():
        print("\\nStep 1: Data preprocessing")
        run_command(
            [
                sys.executable, "data_prep.py",
                "--num_docs", str(NUM_DOCS),
                "--seq_length", str(SEQ_LENGTH),
            ],
            "Data preprocessing",
        )
    else:
        print("\\nSkipping data preprocessing (data exists)")

    # Step 2: Model training
    print("\\nStep 2: Starting model training")
    run_command([sys.executable, "train_neo.py"], "Model training")

    print("\\n" + "="*50)
    print("Training pipeline completed!")
    print("Check checkpoints/ directory for saved models")
    print("="*50)


if __name__ == "__main__":
    main()
''')


def _write_json(path: Path, payload: dict) -> None:
    """Serialize *payload* to *path* as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
        f.write("\n")


def setup_project() -> None:
    """Create project directory structure.

    Existing directories are left untouched (``exist_ok=True``); a line is
    printed for every directory either way.
    """
    for dir_path in _PROJECT_DIRECTORIES:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {dir_path}")


def save_configs() -> None:
    """Save TRAINING_CONFIG and DATA_CONFIG as JSON files under ``configs/``."""
    config_dir = Path("configs")
    # Create the directory here as well so this function also works when it
    # is called without a prior setup_project().
    config_dir.mkdir(parents=True, exist_ok=True)

    _write_json(config_dir / "training_config.json", TRAINING_CONFIG)
    _write_json(config_dir / "data_config.json", DATA_CONFIG)

    print("Configuration files saved to configs/")


def create_requirements_txt() -> None:
    """Create ``requirements.txt`` with one requirement per line."""
    with open("requirements.txt", "w", encoding="utf-8") as f:
        # Trailing newline keeps the file a well-formed text file.
        f.write("\n".join(_REQUIREMENTS) + "\n")

    print("Created requirements.txt")


def create_run_script() -> None:
    """Create ``run_training.py``, a simple run script for training.

    The generated script runs ``data_prep.py`` when the packed token file is
    missing and then runs ``train_neo.py``. The document count, sequence
    length and token-file path are taken from DATA_CONFIG / TRAINING_CONFIG.
    """
    run_script = _RUN_SCRIPT_TEMPLATE.substitute(
        # repr() yields a quoted Python string literal for the generated code.
        data_path=repr(TRAINING_CONFIG["data"]["data_path"]),
        num_docs=DATA_CONFIG["num_docs"],
        seq_length=DATA_CONFIG["seq_length"],
    )

    with open("run_training.py", "w", encoding="utf-8") as f:
        f.write(run_script)

    print("Created run_training.py script")


def main() -> None:
    """Scaffold the project in the current directory and print next steps."""
    print("Setting up MAP-NEO Mini project...")

    setup_project()
    save_configs()
    create_requirements_txt()
    create_run_script()

    print("\nProject setup complete!")
    print("\nNext steps:")
    # Derived from DATA_CONFIG so the hint matches what run_training.py runs.
    print(
        "1. Run: python data_prep.py "
        f"--num_docs {DATA_CONFIG['num_docs']} "
        f"--seq_length {DATA_CONFIG['seq_length']}"
    )
    print("2. Run: python train_neo.py")
    print("3. Or use: python run_training.py")


if __name__ == "__main__":
    main()