#!/usr/bin/env python3
"""
Build FAISS index from Congressional biography database.
This script:
1. Loads all biographies from the SQLite database
2. Generates embeddings using sentence transformers
3. Builds a FAISS index for fast similarity search
4. Saves the index and bio ID mapping to disk
Run this script whenever:
- The database is first created
- You want to rebuild the semantic search index
- After updating to a compatible Python version
Requires Python 3.9-3.12 (Python 3.14+ may have compatibility issues)
"""
import os
import pickle
import sqlite3
import sys
import time
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Paths
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
MAPPING_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")


def build_faiss_index():
    """Build FAISS index from database biographies."""
    print("=" * 60)
    print("BUILDING FAISS INDEX FOR CONGRESSIONAL BIOGUIDE")
    print("=" * 60)

    # Check database exists
    if not Path(DB_PATH).exists():
        print(f"\n❌ ERROR: Database not found at {DB_PATH}")
        print("   Run ingest_data.py first to create the database.")
        return False

    # Load sentence transformer model
    print("\n1. Loading sentence transformer model...")
    start = time.time()

    # Disable all parallelism to avoid Python 3.14 issues
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['OPENBLAS_NUM_THREADS'] = '1'

    import torch
    torch.set_num_threads(1)

    model = SentenceTransformer('all-MiniLM-L6-v2')
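    # Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings.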
print(f" βœ“ Model loaded in {time.time() - start:.3f}s")
# Load biographies from database
print("\n2. Loading biographies from database...")
start = time.time()
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT bio_id, profile_text
FROM members
WHERE profile_text IS NOT NULL AND profile_text != ''
""")
rows = cursor.fetchall()
conn.close()
elapsed = time.time() - start
print(f" βœ“ Loaded {len(rows):,} biographies in {elapsed:.3f}s")
if len(rows) == 0:
print("\n❌ ERROR: No biographies found in database!")
return False
# Prepare data
print("\n3. Preparing data for encoding...")
start = time.time()
bio_ids = [row[0] for row in rows]
texts = [row[1] for row in rows]
print(f" βœ“ Prepared {len(bio_ids):,} texts")
print(f" βœ“ Time: {time.time() - start:.3f}s")
# Generate embeddings in batches
print("\n4. Generating embeddings...")
print(" (This may take several minutes...)")
start = time.time()
batch_size = 32
embeddings = []
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
batch_embeddings = model.encode(
batch,
show_progress_bar=False,
convert_to_numpy=True,
normalize_embeddings=False,
device='cpu' # Explicit CPU to avoid issues
)
embeddings.extend(batch_embeddings)
# Progress update every 100 batches (~3200 texts)
if (i // batch_size + 1) % 100 == 0:
elapsed = time.time() - start
rate = (i + len(batch)) / elapsed
remaining = (len(texts) - i - len(batch)) / rate if rate > 0 else 0
print(f" Encoded {i + len(batch):,}/{len(texts):,} " +
f"({rate:.0f} texts/sec, ~{remaining:.0f}s remaining)")
embeddings = np.array(embeddings, dtype=np.float32)
elapsed = time.time() - start
print(f" βœ“ Generated {len(embeddings):,} embeddings in {elapsed:.1f}s")
print(f" βœ“ Shape: {embeddings.shape}")
# Build FAISS index
print("\n5. Building FAISS index...")
start = time.time()
dimension = embeddings.shape[1]
print(f" Dimension: {dimension}")
# Use IndexFlatIP for exact cosine similarity search
# (Inner Product is equivalent to cosine similarity for normalized vectors)
index = faiss.IndexFlatIP(dimension)
# Normalize embeddings for cosine similarity
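    # (faiss.normalize_L2 modifies the array in place)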
    faiss.normalize_L2(embeddings)

    # Add embeddings to index
    index.add(embeddings)

    elapsed = time.time() - start
    print(f"   ✓ Index built in {elapsed:.3f}s")
    print(f"   ✓ Total vectors in index: {index.ntotal:,}")

    # Save FAISS index
    print("\n6. Saving FAISS index to disk...")
    start = time.time()
    faiss.write_index(index, INDEX_PATH)
    elapsed = time.time() - start
    print(f"   ✓ Index saved to: {INDEX_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Save bio ID mapping
    print("\n7. Saving bio ID mapping...")
    start = time.time()
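    # The position of each bio_id in this list matches the row of its vector in
    # the FAISS index, so search results can be mapped back to bio IDs.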
with open(MAPPING_PATH, "wb") as f:
pickle.dump(bio_ids, f)
elapsed = time.time() - start
print(f" βœ“ Mapping saved to: {MAPPING_PATH}")
print(f" βœ“ Time: {elapsed:.3f}s")
# Get file sizes
index_size_mb = Path(INDEX_PATH).stat().st_size / (1024**2)
mapping_size_mb = Path(MAPPING_PATH).stat().st_size / (1024**2)
print("\n" + "=" * 60)
print("FAISS INDEX BUILD COMPLETE")
print("=" * 60)
print(f"Total biographies indexed: {len(bio_ids):,}")
print(f"Index file size: {index_size_mb:.2f} MB")
print(f"Mapping file size: {mapping_size_mb:.2f} MB")
print(f"Total size: {index_size_mb + mapping_size_mb:.2f} MB")
print("\nThe MCP server will now load this index on startup for semantic search.")
print("You can now use the 'semantic_search_biography' tool!")
return True
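

# Illustration only (not called during the build): a minimal sketch of how the
# saved index and mapping could be queried for semantic search. The actual MCP
# server may load these artifacts differently; 'query_biographies' is a
# hypothetical helper name, not part of the server's API.
def query_biographies(query, k=5):
    """Return the top-k (bio_id, score) matches for a free-text query."""
    index = faiss.read_index(INDEX_PATH)
    with open(MAPPING_PATH, "rb") as f:
        bio_ids = pickle.load(f)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_vec = model.encode([query], convert_to_numpy=True).astype(np.float32)
    faiss.normalize_L2(query_vec)  # normalize the query like the indexed vectors
    scores, indices = index.search(query_vec, k)
    return [(bio_ids[i], float(s)) for i, s in zip(indices[0], scores[0])]
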

def main():
    """Main entry point."""
    try:
        success = build_faiss_index()
        if not success:
            sys.exit(1)
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        print("\nThis may be due to Python version incompatibility.")
        print("FAISS and sentence-transformers work best with Python 3.9-3.12")
        print(f"Current Python version: {sys.version}")
        print("\nThe database is still usable without semantic search.")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()