"""test_conversational_neo_manual.py - Manual generation for NeoMini model""" from model_neo import NeoMini, NeoMiniConfig from transformers import AutoTokenizer import torch import torch.nn.functional as F import json def load_conversational_model(model_path="conversational_neo_extended"): """Load the fine-tuned conversational model""" print("Loading fine-tuned conversational model...") # Load config try: with open(f"{model_path}/model_config.json", 'r') as f: model_config = json.load(f) max_seq_len = model_config.get('max_seq_len', 4096) except: max_seq_len = 4096 # Load model config = NeoMiniConfig() config.max_seq_len = max_seq_len model = NeoMini(config) checkpoint = torch.load(f"{model_path}/conversational_model.pt", map_location='cpu') model.load_state_dict(checkpoint['model_state_dict']) # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path) model.eval() # Move to GPU if available device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = model.to(device) print(f"โœ… Model loaded with {max_seq_len} token context window on {device}") return model, tokenizer, device def generate_response(model, tokenizer, prompt, device, max_new_tokens=200, temperature=0.8, top_k=50, top_p=0.9): """Manual text generation for NeoMini model""" # Tokenize input input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device) original_length = input_ids.shape[1] # Generate tokens one by one with torch.no_grad(): for step in range(max_new_tokens): # Forward pass logits = model(input_ids) # Get logits for the last token next_token_logits = logits[0, -1, :] / temperature # Apply top-k filtering if top_k > 0: top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k) next_token_logits = torch.full_like(next_token_logits, float('-inf')) next_token_logits.scatter_(0, top_k_indices, top_k_logits) # Apply top-p filtering if top_p < 1.0: sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True) cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) # Remove tokens with cumulative probability above threshold sorted_indices_to_remove = cumulative_probs > top_p sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone() sorted_indices_to_remove[0] = 0 indices_to_remove = sorted_indices[sorted_indices_to_remove] next_token_logits[indices_to_remove] = float('-inf') # Sample next token probs = F.softmax(next_token_logits, dim=-1) next_token = torch.multinomial(probs, num_samples=1) # Append to sequence input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1) # Stop if EOS token or max context reached if next_token.item() == tokenizer.eos_token_id: break if input_ids.shape[1] >= model.config.max_seq_len: break # Decode only the generated part generated_tokens = input_ids[0, original_length:] generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True) return generated_text.strip() def chat_with_model(model, tokenizer, device): """Interactive chat with the conversational model""" print("\n๐Ÿค– MAP-NEO Conversational AI (Fine-Tuned)") print("Type 'quit' to exit, 'clear' to clear history, 'test' for quality tests") print("="*70) conversation_history = [] system_prompt = "You are MAP-NEO, a helpful, harmless, and honest AI assistant. Engage in natural conversation and provide thoughtful, accurate responses." 
    while True:
        user_input = input("\n🧑 You: ").strip()

        if user_input.lower() in ['quit', 'exit']:
            print("👋 Goodbye!")
            break

        if user_input.lower() == 'clear':
            conversation_history = []
            print("🔄 Conversation cleared!")
            continue

        if user_input.lower() == 'test':
            test_model_quality(model, tokenizer, device)
            continue

        if not user_input:
            continue

        # Build conversation context
        conversation_history.append(f"User: {user_input}")

        # Keep recent context (last 10 messages)
        recent_context = conversation_history[-10:]
        context = "\n".join(recent_context)
        prompt = f"{system_prompt}\n\n{context}\nAssistant:"

        # Check prompt length and truncate if needed
        prompt_tokens = tokenizer.encode(prompt)
        if len(prompt_tokens) > 1800:  # Leave room for the response
            recent_context = conversation_history[-6:]
            context = "\n".join(recent_context)
            prompt = f"{system_prompt}\n\n{context}\nAssistant:"

        print("🤖 MAP-NEO: ", end="", flush=True)

        # Generate response
        try:
            assistant_response = generate_response(
                model, tokenizer, prompt, device,
                max_new_tokens=150, temperature=0.8, top_k=50, top_p=0.9
            )

            # Clean up response
            if assistant_response.startswith("Assistant:"):
                assistant_response = assistant_response[len("Assistant:"):].strip()

            print(assistant_response)

            # Add to history
            conversation_history.append(f"Assistant: {assistant_response}")

            # Show token usage against the model's context window
            max_context = model.config.max_seq_len
            total_tokens = len(tokenizer.encode(prompt + assistant_response))
            print(f" 📊 Tokens: {total_tokens}/{max_context} ({total_tokens/max_context*100:.1f}%)")

        except Exception as e:
            print(f"❌ Error generating response: {e}")
            print("Try again with a different prompt.")


def test_model_quality(model, tokenizer, device):
    """Test model quality with sample prompts"""
    print("\n🧪 Testing Model Quality...")
    print("="*60)

    test_prompts = [
        "Hello! Can you help me understand machine learning?",
        "What's the difference between AI and machine learning?",
        "I'm feeling stressed about work. Any advice?",
        "Can you write a short story about a robot?",
        "Explain quantum physics in simple terms.",
        "How do I make a good cup of coffee?",
        "What are the benefits of exercise?"
    ]

    system_prompt = "You are MAP-NEO, a helpful, harmless, and honest AI assistant. Engage in natural conversation and provide thoughtful, accurate responses."

    for i, user_prompt in enumerate(test_prompts[:5], 1):  # Test first 5
        print(f"\n--- Test {i}/5 ---")
        print(f"🧑 User: {user_prompt}")

        prompt = f"{system_prompt}\n\nUser: {user_prompt}\nAssistant:"

        try:
            assistant_response = generate_response(
                model, tokenizer, prompt, device,
                max_new_tokens=120, temperature=0.7, top_k=50, top_p=0.9
            )
            print(f"🤖 MAP-NEO: {assistant_response}")
        except Exception as e:
            print(f"❌ Error: {e}")

    print("\n✅ Quality tests completed!")


def compare_before_after(model, tokenizer, device):
    """Compare responses before and after fine-tuning"""
    print("\n📊 Before vs After Fine-Tuning Comparison")
    print("="*60)

    # Load original model for comparison
    try:
        print("Loading original model for comparison...")
        original_config = NeoMiniConfig()
        original_model = NeoMini(original_config)
        original_checkpoint = torch.load('checkpoints/checkpoint_step_99999.pt', map_location='cpu')
        original_model.load_state_dict(original_checkpoint['model_state_dict'])
        original_model.eval().to(device)

        test_prompt = "Hello! Can you help me learn about artificial intelligence?"
prompt = f"You are MAP-NEO, a helpful AI assistant.\n\nUser: {test_prompt}\nAssistant:" # Original model response print(f"\n๐Ÿง‘ User: {test_prompt}") print("\n๐Ÿค– Original Model:") original_response = generate_response(original_model, tokenizer, prompt, device, max_new_tokens=100, temperature=0.7) print(original_response) # Fine-tuned model response print("\n๐Ÿค– Fine-Tuned Model:") finetuned_response = generate_response(model, tokenizer, prompt, device, max_new_tokens=100, temperature=0.7) print(finetuned_response) print("\n๐Ÿ“ˆ The fine-tuned model should be much more conversational and helpful!") except Exception as e: print(f"Comparison unavailable: {e}") if __name__ == "__main__": print("๐Ÿš€ MAP-NEO Conversational AI Testing Suite") print("="*60) # Load model model, tokenizer, device = load_conversational_model() # Test model quality test_model_quality(model, tokenizer, device) # Compare with original if available compare_before_after(model, tokenizer, device) print("\n" + "="*70) print("๐ŸŽ‰ Ready for interactive conversation!") print("="*70) # Start interactive chat chat_with_model(model, tokenizer, device)