#!/usr/bin/env python3
"""
Build FAISS index from Congressional biography database.
This script:
1. Loads all biographies from the SQLite database
2. Generates embeddings using sentence transformers
3. Builds a FAISS index for fast similarity search
4. Saves the index and bio ID mapping to disk
Run this script whenever:
- The database is first created
- You want to rebuild the semantic search index
- After updating to a compatible Python version
Requires Python 3.9-3.12 (Python 3.14+ may have compatibility issues)
"""
import os
import pickle
import sqlite3
import sys
import time
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Paths
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
MAPPING_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")


def build_faiss_index():
    """Build FAISS index from database biographies."""
    print("=" * 60)
    print("BUILDING FAISS INDEX FOR CONGRESSIONAL BIOGUIDE")
    print("=" * 60)

    # Check database exists
    if not Path(DB_PATH).exists():
        print(f"\n❌ ERROR: Database not found at {DB_PATH}")
        print("   Run ingest_data.py first to create the database.")
        return False

    # Load sentence transformer model
    print("\n1. Loading sentence transformer model...")
    start = time.time()

    # Disable all parallelism to avoid Python 3.14 issues
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['OPENBLAS_NUM_THREADS'] = '1'

    import torch
    torch.set_num_threads(1)

    model = SentenceTransformer('all-MiniLM-L6-v2')
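    # Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings.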
print(f" βœ“ Model loaded in {time.time() - start:.3f}s")
# Load biographies from database
print("\n2. Loading biographies from database...")
start = time.time()
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT bio_id, profile_text
FROM members
WHERE profile_text IS NOT NULL AND profile_text != ''
""")
rows = cursor.fetchall()
conn.close()
elapsed = time.time() - start
print(f" βœ“ Loaded {len(rows):,} biographies in {elapsed:.3f}s")
if len(rows) == 0:
print("\n❌ ERROR: No biographies found in database!")
return False
# Prepare data
print("\n3. Preparing data for encoding...")
start = time.time()
bio_ids = [row[0] for row in rows]
texts = [row[1] for row in rows]
print(f" βœ“ Prepared {len(bio_ids):,} texts")
print(f" βœ“ Time: {time.time() - start:.3f}s")
# Generate embeddings in batches
print("\n4. Generating embeddings...")
print(" (This may take several minutes...)")
start = time.time()
batch_size = 32
embeddings = []
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
batch_embeddings = model.encode(
batch,
show_progress_bar=False,
convert_to_numpy=True,
normalize_embeddings=False,
device='cpu' # Explicit CPU to avoid issues
)
embeddings.extend(batch_embeddings)
# Progress update every 100 batches (~3200 texts)
if (i // batch_size + 1) % 100 == 0:
elapsed = time.time() - start
rate = (i + len(batch)) / elapsed
remaining = (len(texts) - i - len(batch)) / rate if rate > 0 else 0
print(f" Encoded {i + len(batch):,}/{len(texts):,} " +
f"({rate:.0f} texts/sec, ~{remaining:.0f}s remaining)")
embeddings = np.array(embeddings, dtype=np.float32)
elapsed = time.time() - start
print(f" βœ“ Generated {len(embeddings):,} embeddings in {elapsed:.1f}s")
print(f" βœ“ Shape: {embeddings.shape}")
# Build FAISS index
print("\n5. Building FAISS index...")
start = time.time()
dimension = embeddings.shape[1]
print(f" Dimension: {dimension}")
# Use IndexFlatIP for exact cosine similarity search
# (Inner Product is equivalent to cosine similarity for normalized vectors)
index = faiss.IndexFlatIP(dimension)
# Normalize embeddings for cosine similarity
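    # (faiss.normalize_L2 modifies the array in place)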
    faiss.normalize_L2(embeddings)

    # Add embeddings to index
    index.add(embeddings)

    elapsed = time.time() - start
    print(f"   ✓ Index built in {elapsed:.3f}s")
    print(f"   ✓ Total vectors in index: {index.ntotal:,}")

    # Save FAISS index
    print("\n6. Saving FAISS index to disk...")
    start = time.time()
    faiss.write_index(index, INDEX_PATH)
    elapsed = time.time() - start
    print(f"   ✓ Index saved to: {INDEX_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Save bio ID mapping
    print("\n7. Saving bio ID mapping...")
    start = time.time()
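    # The position of each bio_id in this list matches the row of its vector in
    # the FAISS index, so search results can be mapped back to bio IDs.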
with open(MAPPING_PATH, "wb") as f:
pickle.dump(bio_ids, f)
elapsed = time.time() - start
print(f" βœ“ Mapping saved to: {MAPPING_PATH}")
print(f" βœ“ Time: {elapsed:.3f}s")
# Get file sizes
index_size_mb = Path(INDEX_PATH).stat().st_size / (1024**2)
mapping_size_mb = Path(MAPPING_PATH).stat().st_size / (1024**2)
print("\n" + "=" * 60)
print("FAISS INDEX BUILD COMPLETE")
print("=" * 60)
print(f"Total biographies indexed: {len(bio_ids):,}")
print(f"Index file size: {index_size_mb:.2f} MB")
print(f"Mapping file size: {mapping_size_mb:.2f} MB")
print(f"Total size: {index_size_mb + mapping_size_mb:.2f} MB")
print("\nThe MCP server will now load this index on startup for semantic search.")
print("You can now use the 'semantic_search_biography' tool!")
return True
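

# Illustration only (not called during the build): a minimal sketch of how the
# saved index and mapping could be queried for semantic search. The actual MCP
# server may load these artifacts differently; 'query_biographies' is a
# hypothetical helper name, not part of the server's API.
def query_biographies(query, k=5):
    """Return the top-k (bio_id, score) matches for a free-text query."""
    index = faiss.read_index(INDEX_PATH)
    with open(MAPPING_PATH, "rb") as f:
        bio_ids = pickle.load(f)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_vec = model.encode([query], convert_to_numpy=True).astype(np.float32)
    faiss.normalize_L2(query_vec)  # normalize the query like the indexed vectors
    scores, indices = index.search(query_vec, k)
    return [(bio_ids[i], float(s)) for i, s in zip(indices[0], scores[0])]
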

def main():
    """Main entry point."""
    try:
        success = build_faiss_index()
        if not success:
            sys.exit(1)
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        print("\nThis may be due to Python version incompatibility.")
        print("FAISS and sentence-transformers work best with Python 3.9-3.12")
        print(f"Current Python version: {sys.version}")
        print("\nThe database is still usable without semantic search.")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()