#!/usr/bin/env python3
"""
Test sentence-transformers to isolate the segfault.
"""
import sys
import os
import traceback

print("=" * 60)
print("SENTENCE TRANSFORMERS TEST")
print("=" * 60)
print(f"Python version: {sys.version}")
print()

# Test 1: Import sentence_transformers
print("Test 1: Import sentence_transformers...")
try:
    from sentence_transformers import SentenceTransformer
    print("  ✓ sentence_transformers imported")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    sys.exit(1)

# Test 2: Load model
print("\nTest 2: Load model (this downloads ~90MB on first run)...")
try:
    # Disable tokenizers parallelism to avoid fork-related warnings/deadlocks.
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("  ✓ Model loaded")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    traceback.print_exc()
    sys.exit(1)

# Test 3: Encode simple text
print("\nTest 3: Encode simple text...")
try:
    text = "This is a test sentence."
    embedding = model.encode([text])
    print(f"  ✓ Encoded text, embedding shape: {embedding.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    traceback.print_exc()
    sys.exit(1)

# Test 4: Encode batch
print("\nTest 4: Encode batch of texts...")
try:
    texts = ["First sentence", "Second sentence", "Third sentence"]
    embeddings = model.encode(texts, show_progress_bar=False)
    print(f"  ✓ Encoded {len(texts)} texts, shape: {embeddings.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    traceback.print_exc()
    sys.exit(1)

# Test 5: Encode with explicit parameters
print("\nTest 5: Encode with explicit parameters (like in our script)...")
try:
    embeddings = model.encode(
        texts,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu'
    )
    print(f"  ✓ Encoded with explicit params, shape: {embeddings.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    traceback.print_exc()
    sys.exit(1)

# Test 6: Encode larger batch
print("\nTest 6: Encode larger batch (100 texts)...")
try:
    large_texts = [f"This is test sentence number {i}" for i in range(100)]
    embeddings = model.encode(
        large_texts,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu'
    )
    print(f"  ✓ Encoded {len(large_texts)} texts, shape: {embeddings.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    traceback.print_exc()
    sys.exit(1)

# Test 7: Test with actual biography-like text
print("\nTest 7: Encode biography-like text...")
try:
    bio = """A Representative from Illinois and 16th President of the United
    States; born in Hardin County, Ky., February 12, 1809; moved with his
    parents to a tract on Little Pigeon Creek, Ind., in 1816; attended a
    log-cabin school at short intervals and was self-instructed in
    elementary branches."""
    embedding = model.encode([bio], show_progress_bar=False, device='cpu')
    print(f"  ✓ Encoded biography, shape: {embedding.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    traceback.print_exc()
    sys.exit(1)

print("\n" + "=" * 60)
print("✅ ALL TESTS PASSED!")
print("=" * 60)
print("\nSentence transformers is working correctly.")
print("The issue may be with the combination of:")
print("  - Very large batch processing")
print("  - Integration with FAISS normalize")
print("  - Memory management with 13k+ texts")
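
# ----------------------------------------------------------------------
# Optional follow-up check (a sketch, not part of the original suite):
# the summary above flags "Integration with FAISS normalize" as a suspect,
# so this block exercises faiss.normalize_L2 on freshly encoded vectors.
# It assumes the faiss-cpu package is installed; if it is not, the check
# is skipped rather than treated as a failure.
print("\nOptional: FAISS normalize_L2 check...")
try:
    import faiss
    import numpy as np

    vecs = model.encode(
        [f"FAISS check sentence {i}" for i in range(32)],
        show_progress_bar=False,
        convert_to_numpy=True,
        device='cpu'
    )
    # normalize_L2 mutates its argument in place and requires a
    # C-contiguous float32 array, so coerce the embeddings first.
    vecs = np.ascontiguousarray(vecs, dtype='float32')
    faiss.normalize_L2(vecs)
    print(f"  ✓ normalize_L2 OK, first row norm: {np.linalg.norm(vecs[0]):.4f}")
except ImportError:
    print("  (faiss not installed; skipping)")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    traceback.print_exc()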