#!/usr/bin/env python3
"""
Build FAISS index from Congressional biography database.

This script:
1. Loads all biographies from the SQLite database
2. Generates embeddings using sentence transformers
3. Builds a FAISS index for fast similarity search
4. Saves the index and bio ID mapping to disk

Run this script whenever:
- The database is first created
- You want to rebuild the semantic search index
- After updating to a compatible Python version

Requires Python 3.9-3.12 (Python 3.14+ may have compatibility issues).
"""
import os

# Disable parallelism before the heavy imports below: these variables are read
# when the underlying native libraries initialize, so they must be set first.
# (Works around threading issues seen on Python 3.14.)
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'

import pickle
import sqlite3
import sys
import time
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Paths (all artifacts live next to this script)
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
MAPPING_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")

def build_faiss_index():
    """Build FAISS index from database biographies."""
    print("=" * 60)
    print("BUILDING FAISS INDEX FOR CONGRESSIONAL BIOGUIDE")
    print("=" * 60)

    # Check that the database exists
    if not Path(DB_PATH).exists():
        print(f"\n✗ ERROR: Database not found at {DB_PATH}")
        print("  Run ingest_data.py first to create the database.")
        return False
| # Load sentence transformer model | |
| print("\n1. Loading sentence transformer model...") | |
| start = time.time() | |
| # Disable all parallelism to avoid Python 3.14 issues | |
| os.environ['TOKENIZERS_PARALLELISM'] = 'false' | |
| os.environ['OMP_NUM_THREADS'] = '1' | |
| os.environ['MKL_NUM_THREADS'] = '1' | |
| os.environ['OPENBLAS_NUM_THREADS'] = '1' | |
| import torch | |
| torch.set_num_threads(1) | |
| model = SentenceTransformer('all-MiniLM-L6-v2') | |
| print(f" β Model loaded in {time.time() - start:.3f}s") | |
    # Load biographies from database
    print("\n2. Loading biographies from database...")
    start = time.time()
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        SELECT bio_id, profile_text
        FROM members
        WHERE profile_text IS NOT NULL AND profile_text != ''
    """)
    rows = cursor.fetchall()
    conn.close()
    elapsed = time.time() - start
    print(f"   ✓ Loaded {len(rows):,} biographies in {elapsed:.3f}s")

    if len(rows) == 0:
        print("\n✗ ERROR: No biographies found in database!")
        return False

    # Prepare data
    print("\n3. Preparing data for encoding...")
    start = time.time()
    bio_ids = [row[0] for row in rows]
    texts = [row[1] for row in rows]
    print(f"   ✓ Prepared {len(bio_ids):,} texts")
    print(f"   ✓ Time: {time.time() - start:.3f}s")
    # Generate embeddings in batches
    print("\n4. Generating embeddings...")
    print("   (This may take several minutes...)")
    start = time.time()

    batch_size = 32
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        batch_embeddings = model.encode(
            batch,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,  # normalized later, in one pass, by FAISS
            device='cpu'  # explicit CPU to avoid device-selection issues
        )
        embeddings.extend(batch_embeddings)

        # Progress update every 100 batches (~3,200 texts at batch_size 32)
        if (i // batch_size + 1) % 100 == 0:
            elapsed = time.time() - start
            rate = (i + len(batch)) / elapsed
            remaining = (len(texts) - i - len(batch)) / rate if rate > 0 else 0
            print(f"   Encoded {i + len(batch):,}/{len(texts):,} "
                  f"({rate:.0f} texts/sec, ~{remaining:.0f}s remaining)")

    embeddings = np.array(embeddings, dtype=np.float32)
    elapsed = time.time() - start
    print(f"   ✓ Generated {len(embeddings):,} embeddings in {elapsed:.1f}s")
    print(f"   ✓ Shape: {embeddings.shape}")
    # Build FAISS index
    print("\n5. Building FAISS index...")
    start = time.time()
    dimension = embeddings.shape[1]
    print(f"   Dimension: {dimension}")

    # Use IndexFlatIP for exact similarity search: inner product is
    # equivalent to cosine similarity for L2-normalized vectors.
    index = faiss.IndexFlatIP(dimension)

    # Normalize embeddings in place so inner product == cosine similarity
    faiss.normalize_L2(embeddings)

    # Add embeddings to index
    index.add(embeddings)
    elapsed = time.time() - start
    print(f"   ✓ Index built in {elapsed:.3f}s")
    print(f"   ✓ Total vectors in index: {index.ntotal:,}")
    # Save FAISS index
    print("\n6. Saving FAISS index to disk...")
    start = time.time()
    faiss.write_index(index, INDEX_PATH)
    elapsed = time.time() - start
    print(f"   ✓ Index saved to: {INDEX_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Save bio ID mapping (row position in the index -> bio_id)
    print("\n7. Saving bio ID mapping...")
    start = time.time()
    with open(MAPPING_PATH, "wb") as f:
        pickle.dump(bio_ids, f)
    elapsed = time.time() - start
    print(f"   ✓ Mapping saved to: {MAPPING_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Report file sizes
    index_size_mb = Path(INDEX_PATH).stat().st_size / (1024**2)
    mapping_size_mb = Path(MAPPING_PATH).stat().st_size / (1024**2)

    print("\n" + "=" * 60)
    print("FAISS INDEX BUILD COMPLETE")
    print("=" * 60)
    print(f"Total biographies indexed: {len(bio_ids):,}")
    print(f"Index file size: {index_size_mb:.2f} MB")
    print(f"Mapping file size: {mapping_size_mb:.2f} MB")
    print(f"Total size: {index_size_mb + mapping_size_mb:.2f} MB")
    print("\nThe MCP server will now load this index on startup for semantic search.")
    print("You can now use the 'semantic_search_biography' tool!")

    return True
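
# A minimal sketch of how a consumer (e.g., the MCP server behind the
# 'semantic_search_biography' tool) might query the saved artifacts. The
# function name and the k parameter are illustrative, not part of the build
# pipeline; the load/encode/normalize/search steps mirror what the build does.
def search_index(query: str, k: int = 5):
    """Return the top-k (bio_id, score) pairs for a free-text query."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    index = faiss.read_index(INDEX_PATH)
    with open(MAPPING_PATH, "rb") as f:
        bio_ids = pickle.load(f)

    # Encode and L2-normalize the query the same way the corpus was indexed,
    # so the inner-product scores are cosine similarities.
    query_vec = model.encode([query], convert_to_numpy=True).astype(np.float32)
    faiss.normalize_L2(query_vec)

    scores, rows = index.search(query_vec, k)
    return [(bio_ids[row], float(score))
            for row, score in zip(rows[0], scores[0]) if row != -1]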

def main():
    """Main entry point."""
    try:
        success = build_faiss_index()
        if not success:
            sys.exit(1)
    except Exception as e:
        print(f"\n✗ ERROR: {e}")
        print("\nThis may be due to Python version incompatibility.")
        print("FAISS and sentence-transformers work best with Python 3.9-3.12.")
        print(f"Current Python version: {sys.version}")
        print("\nThe database is still usable without semantic search.")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()