#!/usr/bin/env python3
"""
Test the embeddings data to check for issues before FAISS operations.
"""
import sys
import os
import sqlite3
import numpy as np

print("=" * 60)
print("EMBEDDINGS DATA VALIDATION TEST")
print("=" * 60)
print(f"Python version: {sys.version}")
print()

# Load model
print("Loading sentence transformer...")
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ Model loaded\n")

# Load ALL biographies
print("Loading ALL biographies from database...")
conn = sqlite3.connect("congress.db")
cursor = conn.cursor()
cursor.execute("""
    SELECT bio_id, profile_text
    FROM members
    WHERE profile_text IS NOT NULL AND profile_text != ''
""")
rows = cursor.fetchall()
conn.close()
bio_ids = [r[0] for r in rows]
texts = [r[1] for r in rows]
print(f"✓ Loaded {len(texts)} biographies\n")

# Encode ALL
print("Encoding all biographies...")
print("(This will take a few minutes...)")
embeddings = []
batch_size = 32
for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    batch_embeddings = model.encode(
        batch,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu'
    )
    embeddings.extend(batch_embeddings)
    if (i // batch_size + 1) % 100 == 0:
        print(f" Encoded {i + len(batch)}/{len(texts)}...")
embeddings = np.array(embeddings, dtype=np.float32)
print(f"✓ Encoded all, shape: {embeddings.shape}\n")

# Validate embeddings
print("Validating embeddings data...")
print(f" Shape: {embeddings.shape}")
print(f" Dtype: {embeddings.dtype}")
print(f" Min value: {np.min(embeddings)}")
print(f" Max value: {np.max(embeddings)}")
print(f" Mean: {np.mean(embeddings)}")
print(f" Has NaN: {np.any(np.isnan(embeddings))}")
print(f" Has Inf: {np.any(np.isinf(embeddings))}")
print(f" Is C-contiguous: {embeddings.flags['C_CONTIGUOUS']}")
print(f" Memory usage: {embeddings.nbytes / (1024**2):.2f} MB")

if np.any(np.isnan(embeddings)):
    print("\n✗ ERROR: Embeddings contain NaN values!")
    sys.exit(1)
if np.any(np.isinf(embeddings)):
    print("\n✗ ERROR: Embeddings contain Inf values!")
    sys.exit(1)
print("\n✓ Embeddings data looks good")

# Now test FAISS operations one by one
print("\n" + "=" * 60)
print("Testing FAISS operations...")
print("=" * 60)
import faiss

dimension = embeddings.shape[1]
print(f"\n1. Creating IndexFlatIP with dimension {dimension}...")
try:
    index = faiss.IndexFlatIP(dimension)
    print(" ✓ Index created")
except Exception as e:
    print(f" ✗ FAILED at index creation: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
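
# Why IndexFlatIP plus L2 normalization: once every vector is scaled to unit
# length, the inner product of two vectors equals their cosine similarity, so
# this exact inner-product index performs cosine-similarity search over the
# biography embeddings. That is why the vectors are normalized in the next step.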
| print(f"\n2. Normalizing {len(embeddings)} embeddings...") | |
| try: | |
| # Make a copy to preserve original | |
| embeddings_norm = embeddings.copy() | |
| print(f" Before normalize - sample norm: {np.linalg.norm(embeddings_norm[0]):.4f}") | |
| faiss.normalize_L2(embeddings_norm) | |
| print(f" After normalize - sample norm: {np.linalg.norm(embeddings_norm[0]):.4f}") | |
| print(f" β Normalized") | |
| except Exception as e: | |
| print(f" β FAILED at normalize: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| sys.exit(1) | |
| print(f"\n3. Adding {len(embeddings_norm)} vectors to index...") | |
| try: | |
| index.add(embeddings_norm) | |
| print(f" β Added {index.ntotal} vectors") | |
| except Exception as e: | |
| print(f" β FAILED at add: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| sys.exit(1) | |
| print(f"\n4. Writing index to disk...") | |
| try: | |
| faiss.write_index(index, "test_full.faiss") | |
| print(f" β Index written") | |
| except Exception as e: | |
| print(f" β FAILED at write: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| sys.exit(1) | |
| print("\n" + "=" * 60) | |
| print("β SUCCESS! Full pipeline works!") | |
| print("=" * 60) | |
| print(f"\nProcessed {len(embeddings)} embeddings successfully") | |
| print("The index has been created: test_full.faiss") | |