# scale_data.py - Scale up MAP-NEO Mini training data
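"""Scale up the MAP-NEO Mini training corpus to 50,000 documents (10x the
current scale) by re-running data_prep.py, then point train_neo.py at the
resulting packed dataset and extend max_steps for the longer run."""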
import subprocess
import sys
import time
from pathlib import Path

def scale_training_data():
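    """Build the 50k-document dataset by re-running data_prep.py.

    Returns the path to the packed sequence file as a string on success,
    or None if processing fails or is interrupted.
    """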
    print("πŸš€ MAP-NEO Mini Data Scaling")
    print("=" * 50)
    print("Target: 50,000 documents (10x current scale)")
    print("Expected result: ~25,000 training sequences")
    print("Estimated time: 45-60 minutes")
    print("=" * 50)
    
    # Check if we already have large dataset
    large_data = Path("data/tokens/packed_1024_large.txt")
    if large_data.exists():
        print("βœ… Large dataset already exists!")
        print(f"Found: {large_data}")
        return str(large_data)
    
    # Check if small dataset exists (backup)
    small_data = Path("data/tokens/packed_1024.txt")
    if small_data.exists():
        backup_path = Path("data/tokens/packed_1024_small_backup.txt")
        print(f"πŸ“ Backing up current dataset to: {backup_path}")
        small_data.rename(backup_path)
    
    # Process 50k documents
    print("\nπŸ”„ Starting data processing...")
    print("This will download and process 50,000 English documents")
    
    cmd = [
        sys.executable, "data_prep.py",
        "--num_docs", "50000",
        "--seq_length", "1024"
    ]
    
    start_time = time.time()
    
    try:
        # Stream data_prep.py's output straight to the console; check=True
        # raises CalledProcessError on a non-zero exit status.
        subprocess.run(cmd, check=True)
        
        elapsed = time.time() - start_time
        print(f"\nβœ… Data scaling completed in {elapsed/60:.1f} minutes!")
        
        # Rename for clarity
        old_path = Path("data/tokens/packed_1024.txt")
        new_path = Path("data/tokens/packed_1024_large.txt")
        if old_path.exists():
            old_path.rename(new_path)
            print(f"πŸ“Š Large dataset saved as: {new_path}")
            
            # Count sequences
            with open(new_path, 'r', encoding='utf-8') as f:
                seq_count = sum(1 for _ in f)
            print(f"📈 Total sequences: {seq_count:,}")
            
            return str(new_path)
        else:
            print("❌ Expected output file not found")
            return None
            
    except subprocess.CalledProcessError as e:
        print("❌ Error in data processing:")
        print(f"Return code: {e.returncode}")
        return None
    except KeyboardInterrupt:
        print("\n⏹️ Process interrupted by user")
        return None

def update_training_config():
    """Update train_neo.py to use large dataset"""
    print("\nπŸ”§ Updating training configuration...")
    
    train_file = Path("train_neo.py")
    if not train_file.exists():
        print("❌ train_neo.py not found")
        return
    
    # Read current file
    content = train_file.read_text(encoding='utf-8')
    
    # Update data path and training steps
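    # NOTE: these literals must match the config lines in train_neo.py
    # exactly; if neither is found, the file is left unchanged and a
    # warning is printed below.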
    old_data_path = 'data_path: str = "data/tokens/packed_1024.txt"'
    new_data_path = 'data_path: str = "data/tokens/packed_1024_large.txt"'
    
    old_max_steps = 'max_steps: int = 50000'
    new_max_steps = 'max_steps: int = 100000'
    
    updated = False
    if old_data_path in content:
        content = content.replace(old_data_path, new_data_path)
        print("✅ Updated data_path to use large dataset")
        updated = True

    if old_max_steps in content:
        content = content.replace(old_max_steps, new_max_steps)
        print("✅ Updated max_steps to 100,000 for extended training")
        updated = True

    # Write back only if something actually changed
    if updated:
        train_file.write_text(content, encoding='utf-8')
        print("💾 Training configuration updated!")
    else:
        print("⚠️ Expected config lines not found; train_neo.py left unchanged")

def main():
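    """Run the full pipeline: scale the data, then update the training config."""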
    print("MAP-NEO Mini Data Scaling Pipeline")
    
    # Scale data
    result = scale_training_data()
    
    if result:
        # Update config
        update_training_config()
        
        print("\n" + "="*60)
        print("πŸŽ‰ DATA SCALING COMPLETE!")
        print("="*60)
        print("Next steps:")
        print("1. Your large dataset is ready for training")
        print("2. Training config updated for 100k steps")
        print("3. Run: python train_neo.py")
        print("4. Expected training time: ~3-4 hours")
        print("5. Expected quality: Much more coherent text!")
        print("="*60)
    else:
        print("\n❌ Data scaling failed. Check the errors above.")

if __name__ == "__main__":
    main()