#!/usr/bin/env python3
"""
Build FAISS index from Congressional biography database.

This script:
1. Loads all biographies from the SQLite database
2. Generates embeddings using sentence transformers
3. Builds a FAISS index for fast similarity search
4. Saves the index and bio ID mapping to disk

Run this script whenever:
- The database is first created
- You want to rebuild the semantic search index
- After updating to a compatible Python version

Requires Python 3.9-3.12 (Python 3.14+ may have compatibility issues)
"""

import os
import pickle
import sqlite3
import sys
import time
import traceback
from contextlib import closing
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Paths (all artifacts live next to this script)
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
MAPPING_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")


def _load_biographies(db_path):
    """Return every (bio_id, profile_text) row that has non-empty profile text.

    The connection is closed even if the query raises (e.g. the ``members``
    table does not exist).
    """
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.execute("""
            SELECT bio_id, profile_text
            FROM members
            WHERE profile_text IS NOT NULL AND profile_text != ''
        """)
        return cursor.fetchall()


def _encode_texts(model, texts, batch_size=32):
    """Encode ``texts`` in batches and return one contiguous float32 matrix.

    ``texts`` must be non-empty. A progress line is printed every 100 batches.
    Embeddings are returned un-normalized; the caller normalizes them.
    """
    total = len(texts)
    chunks = []
    start = time.perf_counter()

    for offset in range(0, total, batch_size):
        batch = texts[offset:offset + batch_size]
        chunks.append(
            model.encode(
                batch,
                show_progress_bar=False,
                convert_to_numpy=True,
                normalize_embeddings=False,
                device='cpu',  # Explicit CPU to avoid issues
            )
        )

        # Progress update every 100 batches (~3200 texts at the default size)
        if (offset // batch_size + 1) % 100 == 0:
            done = offset + len(batch)
            elapsed = time.perf_counter() - start
            # Guard both divisions so a zero-length interval cannot crash the build
            rate = done / elapsed if elapsed > 0 else 0
            remaining = (total - done) / rate if rate > 0 else 0
            print(f" Encoded {done:,}/{total:,} " +
                  f"({rate:.0f} texts/sec, ~{remaining:.0f}s remaining)")

    # FAISS needs a C-contiguous float32 array
    return np.ascontiguousarray(np.concatenate(chunks), dtype=np.float32)


def build_faiss_index():
    """Build the FAISS index from database biographies and save it to disk.

    Writes the index to ``INDEX_PATH`` and the list of bio IDs (in index row
    order) to ``MAPPING_PATH``.

    Returns:
        True on success; False if the database file is missing or contains
        no biographies. Other failures propagate as exceptions.
    """
    print("=" * 60)
    print("BUILDING FAISS INDEX FOR CONGRESSIONAL BIOGUIDE")
    print("=" * 60)

    # Check database exists
    if not Path(DB_PATH).exists():
        print(f"\n❌ ERROR: Database not found at {DB_PATH}")
        print(" Run ingest_data.py first to create the database.")
        return False

    # Load sentence transformer model
    print("\n1. Loading sentence transformer model...")
    start = time.perf_counter()

    # Disable all parallelism to avoid Python 3.14 issues
    # NOTE(review): faiss/numpy are already imported at this point, so the
    # OMP/MKL/OPENBLAS variables may be read too late to take effect - confirm.
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['OPENBLAS_NUM_THREADS'] = '1'

    import torch
    torch.set_num_threads(1)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    print(f" ✓ Model loaded in {time.perf_counter() - start:.3f}s")

    # Load biographies from database
    print("\n2. Loading biographies from database...")
    start = time.perf_counter()
    rows = _load_biographies(DB_PATH)
    elapsed = time.perf_counter() - start
    print(f" ✓ Loaded {len(rows):,} biographies in {elapsed:.3f}s")

    if not rows:
        print("\n❌ ERROR: No biographies found in database!")
        return False

    # Prepare data
    print("\n3. Preparing data for encoding...")
    start = time.perf_counter()
    bio_ids = [row[0] for row in rows]
    texts = [row[1] for row in rows]
    print(f" ✓ Prepared {len(bio_ids):,} texts")
    print(f" ✓ Time: {time.perf_counter() - start:.3f}s")

    # Generate embeddings in batches
    print("\n4. Generating embeddings...")
    print(" (This may take several minutes...)")
    start = time.perf_counter()
    embeddings = _encode_texts(model, texts, batch_size=32)
    elapsed = time.perf_counter() - start
    print(f" ✓ Generated {len(embeddings):,} embeddings in {elapsed:.1f}s")
    print(f" ✓ Shape: {embeddings.shape}")

    # Build FAISS index
    print("\n5. Building FAISS index...")
    start = time.perf_counter()
    dimension = embeddings.shape[1]
    print(f" Dimension: {dimension}")

    # Use IndexFlatIP for exact cosine similarity search
    # (Inner Product is equivalent to cosine similarity for normalized vectors)
    index = faiss.IndexFlatIP(dimension)

    # Normalize embeddings in place for cosine similarity
    faiss.normalize_L2(embeddings)

    # Add embeddings to index; row i corresponds to bio_ids[i]
    index.add(embeddings)
    elapsed = time.perf_counter() - start
    print(f" ✓ Index built in {elapsed:.3f}s")
    print(f" ✓ Total vectors in index: {index.ntotal:,}")

    # Save FAISS index
    print("\n6. Saving FAISS index to disk...")
    start = time.perf_counter()
    faiss.write_index(index, INDEX_PATH)
    elapsed = time.perf_counter() - start
    print(f" ✓ Index saved to: {INDEX_PATH}")
    print(f" ✓ Time: {elapsed:.3f}s")

    # Save bio ID mapping
    # NOTE: pickle files must only ever be loaded from a trusted location.
    print("\n7. Saving bio ID mapping...")
    start = time.perf_counter()
    with open(MAPPING_PATH, "wb") as f:
        pickle.dump(bio_ids, f)
    elapsed = time.perf_counter() - start
    print(f" ✓ Mapping saved to: {MAPPING_PATH}")
    print(f" ✓ Time: {elapsed:.3f}s")

    # Get file sizes
    index_size_mb = Path(INDEX_PATH).stat().st_size / (1024**2)
    mapping_size_mb = Path(MAPPING_PATH).stat().st_size / (1024**2)

    print("\n" + "=" * 60)
    print("FAISS INDEX BUILD COMPLETE")
    print("=" * 60)
    print(f"Total biographies indexed: {len(bio_ids):,}")
    print(f"Index file size: {index_size_mb:.2f} MB")
    print(f"Mapping file size: {mapping_size_mb:.2f} MB")
    print(f"Total size: {index_size_mb + mapping_size_mb:.2f} MB")
    print("\nThe MCP server will now load this index on startup for semantic search.")
    print("You can now use the 'semantic_search_biography' tool!")

    return True


def main():
    """Main entry point: build the index and exit non-zero on any failure."""
    try:
        success = build_faiss_index()
    except Exception as e:
        # Top-level boundary: report, show the traceback, and fail the process
        print(f"\n❌ ERROR: {e}")
        print("\nThis may be due to Python version incompatibility.")
        print("FAISS and sentence-transformers work best with Python 3.9-3.12")
        print(f"Current Python version: {sys.version}")
        print("\nThe database is still usable without semantic search.")
        traceback.print_exc()
        sys.exit(1)

    if not success:
        sys.exit(1)


if __name__ == "__main__":
    main()