#!/usr/bin/env python3
"""
Test the embeddings data to check for issues before FAISS operations.
"""
import sys
import os
import sqlite3
import numpy as np
print("=" * 60)
print("EMBEDDINGS DATA VALIDATION TEST")
print("=" * 60)
print(f"Python version: {sys.version}")
print()
# Load model
print("Loading sentence transformer...")
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
from sentence_transformers import SentenceTransformer
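# all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings, so the FAISS
# index built further down will use dimension 384.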
model = SentenceTransformer('all-MiniLM-L6-v2')
print("β Model loaded\n")
# Load ALL biographies
print("Loading ALL biographies from database...")
conn = sqlite3.connect("congress.db")
cursor = conn.cursor()
cursor.execute("""
    SELECT bio_id, profile_text
    FROM members
    WHERE profile_text IS NOT NULL AND profile_text != ''
""")
rows = cursor.fetchall()
conn.close()
bio_ids = [r[0] for r in rows]
texts = [r[1] for r in rows]
print(f"β Loaded {len(texts)} biographies\n")
# Encode ALL
print("Encoding all biographies...")
print("(This will take a few minutes...)")
embeddings = []
batch_size = 32
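# Encode in small batches so memory use stays bounded during CPU inference.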
for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    batch_embeddings = model.encode(
        batch,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu'
    )
    embeddings.extend(batch_embeddings)
    if (i // batch_size + 1) % 100 == 0:
        print(f" Encoded {i + len(batch)}/{len(texts)}...")
embeddings = np.array(embeddings, dtype=np.float32)
print(f"β Encoded all, shape: {embeddings.shape}\n")
# Validate embeddings
print("Validating embeddings data...")
print(f" Shape: {embeddings.shape}")
print(f" Dtype: {embeddings.dtype}")
print(f" Min value: {np.min(embeddings)}")
print(f" Max value: {np.max(embeddings)}")
print(f" Mean: {np.mean(embeddings)}")
print(f" Has NaN: {np.any(np.isnan(embeddings))}")
print(f" Has Inf: {np.any(np.isinf(embeddings))}")
print(f" Is C-contiguous: {embeddings.flags['C_CONTIGUOUS']}")
print(f" Memory usage: {embeddings.nbytes / (1024**2):.2f} MB")
if np.any(np.isnan(embeddings)):
    print("\n✗ ERROR: Embeddings contain NaN values!")
    sys.exit(1)
if np.any(np.isinf(embeddings)):
    print("\n✗ ERROR: Embeddings contain Inf values!")
    sys.exit(1)
print("\n✓ Embeddings data looks good")
# Now test FAISS operations one by one
print("\n" + "=" * 60)
print("Testing FAISS operations...")
print("=" * 60)
import faiss
dimension = embeddings.shape[1]
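# IndexFlatIP is an exact, brute-force inner-product index; once the vectors are
# L2-normalized (step 2 below), inner product equals cosine similarity.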
print(f"\n1. Creating IndexFlatIP with dimension {dimension}...")
try:
    index = faiss.IndexFlatIP(dimension)
    print(" ✓ Index created")
except Exception as e:
    print(f" ✗ FAILED at index creation: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
print(f"\n2. Normalizing {len(embeddings)} embeddings...")
try:
    # Make a copy to preserve original
    embeddings_norm = embeddings.copy()
    print(f" Before normalize - sample norm: {np.linalg.norm(embeddings_norm[0]):.4f}")
    faiss.normalize_L2(embeddings_norm)
    print(f" After normalize - sample norm: {np.linalg.norm(embeddings_norm[0]):.4f}")
    print(" ✓ Normalized")
except Exception as e:
    print(f" ✗ FAILED at normalize: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
print(f"\n3. Adding {len(embeddings_norm)} vectors to index...")
try:
    index.add(embeddings_norm)
    print(f" ✓ Added {index.ntotal} vectors")
except Exception as e:
    print(f" ✗ FAILED at add: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
print(f"\n4. Writing index to disk...")
try:
    faiss.write_index(index, "test_full.faiss")
    print(" ✓ Index written")
except Exception as e:
    print(f" ✗ FAILED at write: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
print("\n" + "=" * 60)
print("β
SUCCESS! Full pipeline works!")
print("=" * 60)
print(f"\nProcessed {len(embeddings)} embeddings successfully")
print("The index has been created: test_full.faiss")