"""test_conversational_neo_manual.py - Manual generation for NeoMini model""" from model_neo import NeoMini, NeoMiniConfig from transformers import AutoTokenizer import torch import torch.nn.functional as F import json def load_conversational_model(model_path="conversational_neo_extended"): """Load the fine-tuned conversational model""" print("Loading fine-tuned conversational model...") # Load config try: with open(f"{model_path}/model_config.json", 'r') as f: model_config = json.load(f) max_seq_len = model_config.get('max_seq_len', 4096) except: max_seq_len = 4096 # Load model config = NeoMiniConfig() config.max_seq_len = max_seq_len model = NeoMini(config) checkpoint = torch.load(f"{model_path}/conversational_model.pt", map_location='cpu') model.load_state_dict(checkpoint['model_state_dict']) # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path) model.eval() # Move to GPU if available device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = model.to(device) print(f"โœ… Model loaded with {max_seq_len} token context window on {device}") return model, tokenizer, device def generate_response(model, tokenizer, prompt, device, max_new_tokens=200, temperature=0.8, top_k=50, top_p=0.9): """Manual text generation for NeoMini model""" # Tokenize input input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device) original_length = input_ids.shape[1] # Generate tokens one by one with torch.no_grad(): for step in range(max_new_tokens): # Forward pass logits = model(input_ids) # Get logits for the last token next_token_logits = logits[0, -1, :] / temperature # Apply top-k filtering if top_k > 0: top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k) next_token_logits = torch.full_like(next_token_logits, float('-inf')) next_token_logits.scatter_(0, top_k_indices, top_k_logits) # Apply top-p filtering if top_p < 1.0: sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True) cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) # Remove tokens with cumulative probability above threshold sorted_indices_to_remove = cumulative_probs > top_p sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone() sorted_indices_to_remove[0] = 0 indices_to_remove = sorted_indices[sorted_indices_to_remove] next_token_logits[indices_to_remove] = float('-inf') # Sample next token probs = F.softmax(next_token_logits, dim=-1) next_token = torch.multinomial(probs, num_samples=1) # Append to sequence input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1) # Stop if EOS token or max context reached if next_token.item() == tokenizer.eos_token_id: break if input_ids.shape[1] >= model.config.max_seq_len: break # Decode only the generated part generated_tokens = input_ids[0, original_length:] generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True) return generated_text.strip() def chat_with_model(model, tokenizer, device): """Interactive chat with the conversational model""" print("\n๐Ÿค– MAP-NEO Conversational AI (Fine-Tuned)") print("Type 'quit' to exit, 'clear' to clear history, 'test' for quality tests") print("="*70) conversation_history = [] system_prompt = "You are MAP-NEO, a helpful, harmless, and honest AI assistant. Engage in natural conversation and provide thoughtful, accurate responses." 
    while True:
        user_input = input("\n🧑 You: ").strip()

        if user_input.lower() in ['quit', 'exit']:
            print("👋 Goodbye!")
            break

        if user_input.lower() == 'clear':
            conversation_history = []
            print("🔄 Conversation cleared!")
            continue

        if user_input.lower() == 'test':
            test_model_quality(model, tokenizer, device)
            continue

        if not user_input:
            continue

        # Build conversation context
        conversation_history.append(f"User: {user_input}")

        # Keep recent context (last 10 messages)
        recent_context = conversation_history[-10:]
        context = "\n".join(recent_context)
        prompt = f"{system_prompt}\n\n{context}\nAssistant:"

        # Check prompt length and truncate if needed
        prompt_tokens = tokenizer.encode(prompt)
        if len(prompt_tokens) > 1800:  # Leave room for the response
            recent_context = conversation_history[-6:]
            context = "\n".join(recent_context)
            prompt = f"{system_prompt}\n\n{context}\nAssistant:"

        print("🤖 MAP-NEO: ", end="", flush=True)

        # Generate response
        try:
            assistant_response = generate_response(
                model, tokenizer, prompt, device,
                max_new_tokens=150, temperature=0.8, top_k=50, top_p=0.9
            )

            # Clean up response
            if assistant_response.startswith("Assistant:"):
                assistant_response = assistant_response[len("Assistant:"):].strip()

            print(assistant_response)

            # Add to history
            conversation_history.append(f"Assistant: {assistant_response}")

            # Show token usage against the model's context window
            max_context = model.config.max_seq_len
            total_tokens = len(tokenizer.encode(prompt + assistant_response))
            print(f" 📊 Tokens: {total_tokens}/{max_context} ({total_tokens/max_context*100:.1f}%)")

        except Exception as e:
            print(f"❌ Error generating response: {e}")
            print("Try again with a different prompt.")


def test_model_quality(model, tokenizer, device):
    """Test model quality with sample prompts"""
    print("\n🧪 Testing Model Quality...")
    print("="*60)

    test_prompts = [
        "Hello! Can you help me understand machine learning?",
        "What's the difference between AI and machine learning?",
        "I'm feeling stressed about work. Any advice?",
        "Can you write a short story about a robot?",
        "Explain quantum physics in simple terms.",
        "How do I make a good cup of coffee?",
        "What are the benefits of exercise?"
    ]

    system_prompt = "You are MAP-NEO, a helpful, harmless, and honest AI assistant. Engage in natural conversation and provide thoughtful, accurate responses."

    for i, user_prompt in enumerate(test_prompts[:5], 1):  # Test first 5
        print(f"\n--- Test {i}/5 ---")
        print(f"🧑 User: {user_prompt}")

        prompt = f"{system_prompt}\n\nUser: {user_prompt}\nAssistant:"

        try:
            assistant_response = generate_response(
                model, tokenizer, prompt, device,
                max_new_tokens=120, temperature=0.7, top_k=50, top_p=0.9
            )
            print(f"🤖 MAP-NEO: {assistant_response}")
        except Exception as e:
            print(f"❌ Error: {e}")

    print("\n✅ Quality tests completed!")


def compare_before_after(model, tokenizer, device):
    """Compare responses before and after fine-tuning"""
    print("\n📊 Before vs After Fine-Tuning Comparison")
    print("="*60)

    # Load original model for comparison
    try:
        print("Loading original model for comparison...")
        original_config = NeoMiniConfig()
        original_model = NeoMini(original_config)
        original_checkpoint = torch.load('checkpoints/checkpoint_step_99999.pt', map_location='cpu')
        original_model.load_state_dict(original_checkpoint['model_state_dict'])
        original_model.eval().to(device)

        test_prompt = "Hello! Can you help me learn about artificial intelligence?"
prompt = f"You are MAP-NEO, a helpful AI assistant.\n\nUser: {test_prompt}\nAssistant:" # Original model response print(f"\n๐Ÿง‘ User: {test_prompt}") print("\n๐Ÿค– Original Model:") original_response = generate_response(original_model, tokenizer, prompt, device, max_new_tokens=100, temperature=0.7) print(original_response) # Fine-tuned model response print("\n๐Ÿค– Fine-Tuned Model:") finetuned_response = generate_response(model, tokenizer, prompt, device, max_new_tokens=100, temperature=0.7) print(finetuned_response) print("\n๐Ÿ“ˆ The fine-tuned model should be much more conversational and helpful!") except Exception as e: print(f"Comparison unavailable: {e}") if __name__ == "__main__": print("๐Ÿš€ MAP-NEO Conversational AI Testing Suite") print("="*60) # Load model model, tokenizer, device = load_conversational_model() # Test model quality test_model_quality(model, tokenizer, device) # Compare with original if available compare_before_after(model, tokenizer, device) print("\n" + "="*70) print("๐ŸŽ‰ Ready for interactive conversation!") print("="*70) # Start interactive chat chat_with_model(model, tokenizer, device)