#!/usr/bin/env python3
"""
Test the embeddings data to check for issues before FAISS operations.
"""
import sys
import os
import sqlite3
import numpy as np

print("=" * 60)
print("EMBEDDINGS DATA VALIDATION TEST")
print("=" * 60)
print(f"Python version: {sys.version}")
print()

# Load model
print("Loading sentence transformer...")
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ Model loaded\n")

# Load ALL biographies
print("Loading ALL biographies from database...")
conn = sqlite3.connect("congress.db")
cursor = conn.cursor()
cursor.execute("""
    SELECT bio_id, profile_text
    FROM members
    WHERE profile_text IS NOT NULL AND profile_text != ''
""")
rows = cursor.fetchall()
conn.close()
bio_ids = [r[0] for r in rows]
texts = [r[1] for r in rows]
print(f"✓ Loaded {len(texts)} biographies\n")

# Encode ALL
print("Encoding all biographies...")
print("(This will take a few minutes...)")
embeddings = []
batch_size = 32
for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    batch_embeddings = model.encode(
        batch,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu'
    )
    embeddings.extend(batch_embeddings)
    if (i // batch_size + 1) % 100 == 0:
        print(f" Encoded {i + len(batch)}/{len(texts)}...")
embeddings = np.array(embeddings, dtype=np.float32)
print(f"✓ Encoded all, shape: {embeddings.shape}\n")

# Validate embeddings
print("Validating embeddings data...")
print(f" Shape: {embeddings.shape}")
print(f" Dtype: {embeddings.dtype}")
print(f" Min value: {np.min(embeddings)}")
print(f" Max value: {np.max(embeddings)}")
print(f" Mean: {np.mean(embeddings)}")
print(f" Has NaN: {np.any(np.isnan(embeddings))}")
print(f" Has Inf: {np.any(np.isinf(embeddings))}")
print(f" Is C-contiguous: {embeddings.flags['C_CONTIGUOUS']}")
print(f" Memory usage: {embeddings.nbytes / (1024**2):.2f} MB")

if np.any(np.isnan(embeddings)):
    print("\n✗ ERROR: Embeddings contain NaN values!")
    sys.exit(1)
if np.any(np.isinf(embeddings)):
    print("\n✗ ERROR: Embeddings contain Inf values!")
    sys.exit(1)
print("\n✓ Embeddings data looks good")

# Now test FAISS operations one by one
print("\n" + "=" * 60)
print("Testing FAISS operations...")
print("=" * 60)
import faiss

dimension = embeddings.shape[1]
print(f"\n1. Creating IndexFlatIP with dimension {dimension}...")
try:
    index = faiss.IndexFlatIP(dimension)
    print(" ✓ Index created")
except Exception as e:
    print(f" ✗ FAILED at index creation: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
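
# Why IndexFlatIP plus L2 normalization: once every vector is scaled to unit
# length, the inner product of two vectors equals their cosine similarity, so
# this exact inner-product index performs cosine-similarity search over the
# biography embeddings. That is why the vectors are normalized in the next step.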
| print(f"\n2. Normalizing {len(embeddings)} embeddings...") | |
| try: | |
| # Make a copy to preserve original | |
| embeddings_norm = embeddings.copy() | |
| print(f" Before normalize - sample norm: {np.linalg.norm(embeddings_norm[0]):.4f}") | |
| faiss.normalize_L2(embeddings_norm) | |
| print(f" After normalize - sample norm: {np.linalg.norm(embeddings_norm[0]):.4f}") | |
| print(f" β Normalized") | |
| except Exception as e: | |
| print(f" β FAILED at normalize: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| sys.exit(1) | |
| print(f"\n3. Adding {len(embeddings_norm)} vectors to index...") | |
| try: | |
| index.add(embeddings_norm) | |
| print(f" β Added {index.ntotal} vectors") | |
| except Exception as e: | |
| print(f" β FAILED at add: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| sys.exit(1) | |
| print(f"\n4. Writing index to disk...") | |
| try: | |
| faiss.write_index(index, "test_full.faiss") | |
| print(f" β Index written") | |
| except Exception as e: | |
| print(f" β FAILED at write: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| sys.exit(1) | |
| print("\n" + "=" * 60) | |
| print("β SUCCESS! Full pipeline works!") | |
| print("=" * 60) | |
| print(f"\nProcessed {len(embeddings)} embeddings successfully") | |
| print("The index has been created: test_full.faiss") | |