Commit 15de73a · Parent(s): 9000c90
Add setup script and comprehensive tests for Congressional Bioguide MCP Server
- Created setup.sh for environment setup, including Python version checks and dependency installation.
- Added test_embeddings_data.py to validate embeddings data and FAISS operations.
- Introduced test_faiss_minimal.py for minimal testing of FAISS functionality.
- Implemented test_queries.py to validate database structure and search functionality.
- Added test_sentence_transformers.py to test sentence-transformers integration and performance.
- .gitattributes +2 -0
- README.md +418 -2
- build_faiss_index.py +194 -0
- faiss_build.log +46 -0
- gradio_app.py +574 -0
- ingest_data.py +447 -0
- mcp_config_example.json +11 -0
- requirements-minimal.txt +3 -0
- requirements.txt +7 -0
- server.py +1219 -0
- setup.sh +86 -0
- test_embeddings_data.py +143 -0
- test_faiss_minimal.py +142 -0
- test_queries.py +252 -0
- test_sentence_transformers.py +118 -0
.gitattributes
CHANGED

```diff
@@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 congress_bio_ids.pkl filter=lfs diff=lfs merge=lfs -text
 congress_faiss.index filter=lfs diff=lfs merge=lfs -text
 congress.db filter=lfs diff=lfs merge=lfs -text
+*.index filter=lfs diff=lfs merge=lfs -text
+*.db filter=lfs diff=lfs merge=lfs -text
```
README.md
CHANGED

```diff
@@ -5,10 +5,426 @@ colorFrom: purple
 colorTo: yellow
 sdk: gradio
 sdk_version: 5.49.1
-app_file:
+app_file: gradio_app.py
 pinned: false
 license: mit
-short_description: 'An
+short_description: 'An MCP allowing users to analyze congressional biographies. '
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```

The remainder of the hunk appends the following new README content:
# Congressional Bioguide MCP Server

A Model Context Protocol (MCP) server that provides access to Congressional member profiles with both structured SQL queries and semantic search capabilities.

## Deployment Options

### 1. Gradio MCP (Hugging Face Spaces)

Run this MCP as a Gradio app with a web interface plus an MCP server:

```bash
python gradio_app.py
```

This will launch a web interface at `http://localhost:7860` with 9 tools exposed as both a web UI and MCP tools.
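
An MCP client can then be pointed at the app's MCP endpoint. As a sketch (assuming a recent Gradio release, which exposes the MCP server under `/gradio_api/mcp/sse`; check your Gradio version's docs for the exact path):

```json
{
  "mcpServers": {
    "congressional-bioguide-gradio": {
      "url": "http://localhost:7860/gradio_api/mcp/sse"
    }
  }
}
```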

**Deploy to Hugging Face Spaces:**
1. Create a new Space on Hugging Face
2. Set SDK to `gradio` (version 5.49.1+)
3. Upload all files including `gradio_app.py`, `congress.db`, `congress_faiss.index`, and `congress_bio_ids.pkl`
4. The app will automatically launch with `mcp_server=True`

### 2. Traditional MCP Server

Use the original MCP server for integration with Claude Desktop or other MCP clients:

```bash
python server.py
```

Test the server backend with `npx @modelcontextprotocol/inspector python server.py` or integrate it into your Claude setup.

## Features

### Gradio MCP Tools (9 Tools)

The Gradio app (`gradio_app.py`) exposes these 9 MCP tools:

1. **search_by_name** - Search members by name (first/last name)
2. **search_by_party** - Find members by political party affiliation
3. **search_by_state** - Search by state/region representation
4. **semantic_search_biography** - AI-powered natural language search of biographies
5. **get_member_profile** - Get a complete profile by Bioguide ID
6. **count_members_by_party** - Count members grouped by party
7. **count_members_by_state** - Count members grouped by state
8. **execute_sql_query** - Execute custom SQL queries (read-only)
9. **get_database_schema** - View the database structure

### Traditional MCP Server Tools (14 Tools)

The traditional server (`server.py`) provides the full set of 14 tools:

**Search Tools** (return concise results by default):
1. **search_by_name** - Search members by name (returns: name, dates, party, congress)
2. **search_by_party** - Find members by political party affiliation
3. **search_by_state** - Search by state/region representation
4. **search_by_congress** - Get all members from a specific Congress
5. **search_by_date_range** - Find members who served during specific dates
6. **semantic_search_biography** - Natural language AI search of biographies
7. **search_biography_regex** - Regex pattern search (keywords, phrases)
8. **search_by_relationship** - Find members with family relationships

**Aggregation & Analysis Tools** (efficient for large datasets):
9. **count_members** - Count members by party, state, position, congress, or year
10. **temporal_analysis** - Analyze trends over time (party shifts, demographics, etc.)
11. **count_by_biography_content** - Count members mentioning specific keywords (e.g., "Harvard", "lawyer")

**Profile & Query Tools**:
12. **get_member_profile** - Get a complete profile by Bioguide ID
13. **execute_sql_query** - Execute custom SQL queries (read-only)
14. **get_database_schema** - View the database structure

### Database Schema

- **members** - Core biographical data (13,047+ profiles)
- **job_positions** - Congressional positions and affiliations
- **images** - Profile images
- **relationships** - Family relationships between members
- **creative_works** - Publications by members
- **assets** - Additional media assets

## Requirements

- **Python 3.10+**, including Python 3.14
- ✅ **Python 3.14 is now supported!** (with single-threaded mode for FAISS)
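
The "single-threaded mode" simply disables tokenizer and BLAS parallelism before the embedding model is loaded, as done in `build_faiss_index.py`:

```python
# Force single-threaded execution before loading the model
# (mirrors the Python 3.14 workaround used in build_faiss_index.py)
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'

import torch
torch.set_num_threads(1)
```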

## Setup

### Quick Start

```bash
./setup.sh
```

This automated script will:
1. Create a Python virtual environment
2. Install all dependencies
3. Ingest all Congressional profiles into SQLite
4. Build the FAISS semantic search index

### Manual Setup

If you prefer manual setup:

#### 1. Install Dependencies

```bash
python3 -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
pip install -r requirements.txt
```

#### 2. Ingest Data

Run the ingestion script to create the SQLite database and FAISS index:

```bash
python3 ingest_data.py
```

This will:
- Create the `congress.db` SQLite database (13,047+ members)
- Build `congress_faiss.index` for semantic search
- Generate `congress_bio_ids.pkl` for ID mapping

Expected output:
```
Starting Congressional Bioguide ingestion...
============================================================
✓ Database schema created
Ingesting 13047 profiles...
Processed 1000/13047 profiles...
...
✓ Ingested 13047 profiles into database
Building FAISS index for semantic search...
Encoding 13047 biographies...
Encoded 3200/13047 biographies...
...
✓ FAISS index created with 13047 vectors
Index dimension: 384
============================================================
✓ Ingestion complete!
```

**Note**: Ingestion takes approximately 5-10 minutes depending on your system.

#### 3. Test the System (Optional)

```bash
python3 test_queries.py
```

#### 4. Run the Server

```bash
python3 server.py
```

## Usage Examples

### Name Search
```json
{
  "name": "search_by_name",
  "arguments": {
    "family_name": "Lincoln"
  }
}
```

### Party Search
```json
{
  "name": "search_by_party",
  "arguments": {
    "party": "Republican",
    "congress_number": 117
  }
}
```

### State Search
```json
{
  "name": "search_by_state",
  "arguments": {
    "state_code": "CA",
    "congress_number": 117
  }
}
```

### Semantic Search
```json
{
  "name": "semantic_search_biography",
  "arguments": {
    "query": "Civil War veterans who became lawyers",
    "top_k": 5
  }
}
```

### Regex Search - Find Keywords
```json
{
  "name": "search_biography_regex",
  "arguments": {
    "pattern": "Harvard",
    "limit": 5
  }
}
```

### Regex Search - Filter by Party
```json
{
  "name": "search_biography_regex",
  "arguments": {
    "pattern": "lawyer",
    "filter_party": "Republican",
    "limit": 10
  }
}
```

### Regex Search - Filter by State
```json
{
  "name": "search_biography_regex",
  "arguments": {
    "pattern": "served.*Confederate Army",
    "filter_state": "VA",
    "limit": 5
  }
}
```

**Note**: Regex search returns concise results (name, dates, party, state) by default. Set `return_full_profile: true` to get the biography text.
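
For example (illustrative only; this reuses the keyword search above with the flag added):

```json
{
  "name": "search_biography_regex",
  "arguments": {
    "pattern": "Harvard",
    "return_full_profile": true,
    "limit": 5
  }
}
```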
+
### Count Members by Party
|
| 257 |
+
```json
|
| 258 |
+
{
|
| 259 |
+
"name": "count_members",
|
| 260 |
+
"arguments": {
|
| 261 |
+
"group_by": "party"
|
| 262 |
+
}
|
| 263 |
+
}
|
| 264 |
+
```
|
| 265 |
+
|
| 266 |
+
### Count Republicans by State in 117th Congress
|
| 267 |
+
```json
|
| 268 |
+
{
|
| 269 |
+
"name": "count_members",
|
| 270 |
+
"arguments": {
|
| 271 |
+
"group_by": "state",
|
| 272 |
+
"filter_party": "Republican",
|
| 273 |
+
"filter_congress": 117
|
| 274 |
+
}
|
| 275 |
+
}
|
| 276 |
+
```
|
| 277 |
+
|
| 278 |
+
### Temporal Analysis - Party Changes Over Time
|
| 279 |
+
```json
|
| 280 |
+
{
|
| 281 |
+
"name": "temporal_analysis",
|
| 282 |
+
"arguments": {
|
| 283 |
+
"analysis_type": "party_over_time",
|
| 284 |
+
"time_unit": "congress",
|
| 285 |
+
"start_date": "1900-01-01",
|
| 286 |
+
"end_date": "2000-12-31"
|
| 287 |
+
}
|
| 288 |
+
}
|
| 289 |
+
```
|
| 290 |
+
|
| 291 |
+
### Demographics Analysis - Average Age by Congress
|
| 292 |
+
```json
|
| 293 |
+
{
|
| 294 |
+
"name": "temporal_analysis",
|
| 295 |
+
"arguments": {
|
| 296 |
+
"analysis_type": "demographics",
|
| 297 |
+
"time_unit": "congress"
|
| 298 |
+
}
|
| 299 |
+
}
|
| 300 |
+
```
|
| 301 |
+
|
| 302 |
+
### Count Members Who Attended Harvard
|
| 303 |
+
```json
|
| 304 |
+
{
|
| 305 |
+
"name": "count_by_biography_content",
|
| 306 |
+
"arguments": {
|
| 307 |
+
"keywords": ["Harvard"]
|
| 308 |
+
}
|
| 309 |
+
}
|
| 310 |
+
```
|
| 311 |
+
|
| 312 |
+
### Count Lawyers by Party
|
| 313 |
+
```json
|
| 314 |
+
{
|
| 315 |
+
"name": "count_by_biography_content",
|
| 316 |
+
"arguments": {
|
| 317 |
+
"keywords": ["lawyer", "attorney"],
|
| 318 |
+
"breakdown_by": "party"
|
| 319 |
+
}
|
| 320 |
+
}
|
| 321 |
+
```
|
| 322 |
+
|
| 323 |
+
### Count Members Who Were Both Lawyers AND Veterans
|
| 324 |
+
```json
|
| 325 |
+
{
|
| 326 |
+
"name": "count_by_biography_content",
|
| 327 |
+
"arguments": {
|
| 328 |
+
"keywords": ["lawyer", "military", "army"],
|
| 329 |
+
"match_all": false,
|
| 330 |
+
"breakdown_by": "state"
|
| 331 |
+
}
|
| 332 |
+
}
|
| 333 |
+
```
|
| 334 |
+
|
| 335 |
+
### SQL Query - Find Longest Serving Members
|
| 336 |
+
```json
|
| 337 |
+
{
|
| 338 |
+
"name": "execute_sql_query",
|
| 339 |
+
"arguments": {
|
| 340 |
+
"query": "SELECT family_name, given_name, COUNT(DISTINCT congress_number) as congresses FROM members m JOIN job_positions j ON m.bio_id = j.bio_id GROUP BY m.bio_id HAVING congresses > 5 ORDER BY congresses DESC LIMIT 10"
|
| 341 |
+
}
|
| 342 |
+
}
|
| 343 |
+
```
|
| 344 |
+
|
| 345 |
+
### Get Full Member Profile
|
| 346 |
+
```json
|
| 347 |
+
{
|
| 348 |
+
"name": "get_member_profile",
|
| 349 |
+
"arguments": {
|
| 350 |
+
"bio_id": "L000313"
|
| 351 |
+
}
|
| 352 |
+
}
|
| 353 |
+
```
|
| 354 |
+
|
| 355 |
+
### Search by Congress Number
|
| 356 |
+
```json
|
| 357 |
+
{
|
| 358 |
+
"name": "search_by_congress",
|
| 359 |
+
"arguments": {
|
| 360 |
+
"congress_number": 117,
|
| 361 |
+
"chamber": "Senator"
|
| 362 |
+
}
|
| 363 |
+
}
|
| 364 |
+
```
|
| 365 |
+
|
| 366 |
+
### Search by Date Range
|
| 367 |
+
```json
|
| 368 |
+
{
|
| 369 |
+
"name": "search_by_date_range",
|
| 370 |
+
"arguments": {
|
| 371 |
+
"start_date": "1861-03-04",
|
| 372 |
+
"end_date": "1865-03-04"
|
| 373 |
+
}
|
| 374 |
+
}
|
| 375 |
+
```
|
| 376 |
+
|
| 377 |
+
### Find Family Relationships
|
| 378 |
+
```json
|
| 379 |
+
{
|
| 380 |
+
"name": "search_by_relationship",
|
| 381 |
+
"arguments": {
|
| 382 |
+
"relationship_type": "father"
|
| 383 |
+
}
|
| 384 |
+
}
|
| 385 |
+
```
|
| 386 |
+
|
| 387 |
+
### Complex SQL - Party Transitions
|
| 388 |
+
```json
|
| 389 |
+
{
|
| 390 |
+
"name": "execute_sql_query",
|
| 391 |
+
"arguments": {
|
| 392 |
+
"query": "SELECT m.bio_id, m.family_name, m.given_name, GROUP_CONCAT(DISTINCT j.party) as parties FROM members m JOIN job_positions j ON m.bio_id = j.bio_id WHERE j.party IS NOT NULL GROUP BY m.bio_id HAVING COUNT(DISTINCT j.party) > 1 LIMIT 20"
|
| 393 |
+
}
|
| 394 |
+
}
|
| 395 |
+
```
|
| 396 |
+
|
| 397 |
+
## Data Source
|
| 398 |
+
|
| 399 |
+
Data comes from the US Congressional Bioguide, containing biographical information for all members of Congress throughout history.
|
| 400 |
+
|
| 401 |
+
## Technical Details
|
| 402 |
+
|
| 403 |
+
- **Database**: SQLite for structured queries
|
| 404 |
+
- **Semantic Search**: FAISS with sentence-transformers (all-MiniLM-L6-v2)
|
| 405 |
+
- **Embedding Dimension**: 384
|
| 406 |
+
- **Index Type**: Flat IP (Inner Product) with L2 normalization for cosine similarity
|
| 407 |
+
|
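
To make the last point concrete: once vectors are L2-normalized, the inner product of two vectors equals their cosine similarity, so an exact inner-product index returns cosine-ranked results. A self-contained sketch (toy data, not part of the repo):

```python
import numpy as np
import faiss

dim = 384
vectors = np.random.rand(100, dim).astype('float32')

# After normalization, <a, b> == cos(a, b), so IndexFlatIP ranks by cosine.
faiss.normalize_L2(vectors)
index = faiss.IndexFlatIP(dim)
index.add(vectors)

query = np.random.rand(1, dim).astype('float32')
faiss.normalize_L2(query)
scores, ids = index.search(query, 5)  # scores are cosine similarities
print(ids[0], scores[0])
```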

## MCP Configuration

Add the server to your MCP settings file (for Claude Desktop, usually `~/Library/Application Support/Claude/claude_desktop_config.json` on macOS or `%APPDATA%\Claude\claude_desktop_config.json` on Windows):

```json
{
  "mcpServers": {
    "congressional-bioguide": {
      "command": "/Users/electron/workspace/Nanocentury AI/NIO/BioGuideMCP/venv/bin/python",
      "args": [
        "/Users/electron/workspace/Nanocentury AI/NIO/BioGuideMCP/server.py"
      ],
      "cwd": "/Users/electron/workspace/Nanocentury AI/NIO/BioGuideMCP"
    }
  }
}
```

**Note**: This uses the virtual environment's Python, which has all the required dependencies installed.

## License

The data is in the public domain, sourced from the US Congressional Bioguide.
build_faiss_index.py
ADDED

@@ -0,0 +1,194 @@

```python
#!/usr/bin/env python3
"""
Build FAISS index from Congressional biography database.

This script:
1. Loads all biographies from the SQLite database
2. Generates embeddings using sentence transformers
3. Builds a FAISS index for fast similarity search
4. Saves the index and bio ID mapping to disk

Run this script whenever:
- The database is first created
- You want to rebuild the semantic search index
- After updating to a compatible Python version

Requires Python 3.9-3.12 (Python 3.14+ may have compatibility issues)
"""

import sqlite3
import faiss
import numpy as np
import pickle
import time
import os
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Paths
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
MAPPING_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")


def build_faiss_index():
    """Build FAISS index from database biographies."""
    print("=" * 60)
    print("BUILDING FAISS INDEX FOR CONGRESSIONAL BIOGUIDE")
    print("=" * 60)

    # Check database exists
    if not Path(DB_PATH).exists():
        print(f"\n✗ ERROR: Database not found at {DB_PATH}")
        print("   Run ingest_data.py first to create the database.")
        return False

    # Load sentence transformer model
    print("\n1. Loading sentence transformer model...")
    start = time.time()

    # Disable all parallelism to avoid Python 3.14 issues
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['OPENBLAS_NUM_THREADS'] = '1'

    import torch
    torch.set_num_threads(1)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    print(f"   ✓ Model loaded in {time.time() - start:.3f}s")

    # Load biographies from database
    print("\n2. Loading biographies from database...")
    start = time.time()
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    cursor.execute("""
        SELECT bio_id, profile_text
        FROM members
        WHERE profile_text IS NOT NULL AND profile_text != ''
    """)
    rows = cursor.fetchall()
    conn.close()

    elapsed = time.time() - start
    print(f"   ✓ Loaded {len(rows):,} biographies in {elapsed:.3f}s")

    if len(rows) == 0:
        print("\n✗ ERROR: No biographies found in database!")
        return False

    # Prepare data
    print("\n3. Preparing data for encoding...")
    start = time.time()
    bio_ids = [row[0] for row in rows]
    texts = [row[1] for row in rows]
    print(f"   ✓ Prepared {len(bio_ids):,} texts")
    print(f"   ✓ Time: {time.time() - start:.3f}s")

    # Generate embeddings in batches
    print("\n4. Generating embeddings...")
    print("   (This may take several minutes...)")
    start = time.time()
    batch_size = 32
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        batch_embeddings = model.encode(
            batch,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device='cpu'  # Explicit CPU to avoid issues
        )
        embeddings.extend(batch_embeddings)

        # Progress update every 100 batches (~3200 texts)
        if (i // batch_size + 1) % 100 == 0:
            elapsed = time.time() - start
            rate = (i + len(batch)) / elapsed
            remaining = (len(texts) - i - len(batch)) / rate if rate > 0 else 0
            print(f"   Encoded {i + len(batch):,}/{len(texts):,} " +
                  f"({rate:.0f} texts/sec, ~{remaining:.0f}s remaining)")

    embeddings = np.array(embeddings, dtype=np.float32)
    elapsed = time.time() - start
    print(f"   ✓ Generated {len(embeddings):,} embeddings in {elapsed:.1f}s")
    print(f"   ✓ Shape: {embeddings.shape}")

    # Build FAISS index
    print("\n5. Building FAISS index...")
    start = time.time()
    dimension = embeddings.shape[1]
    print(f"   Dimension: {dimension}")

    # Use IndexFlatIP for exact cosine similarity search
    # (Inner Product is equivalent to cosine similarity for normalized vectors)
    index = faiss.IndexFlatIP(dimension)

    # Normalize embeddings for cosine similarity
    faiss.normalize_L2(embeddings)

    # Add embeddings to index
    index.add(embeddings)

    elapsed = time.time() - start
    print(f"   ✓ Index built in {elapsed:.3f}s")
    print(f"   ✓ Total vectors in index: {index.ntotal:,}")

    # Save FAISS index
    print("\n6. Saving FAISS index to disk...")
    start = time.time()
    faiss.write_index(index, INDEX_PATH)
    elapsed = time.time() - start
    print(f"   ✓ Index saved to: {INDEX_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Save bio ID mapping
    print("\n7. Saving bio ID mapping...")
    start = time.time()
    with open(MAPPING_PATH, "wb") as f:
        pickle.dump(bio_ids, f)
    elapsed = time.time() - start
    print(f"   ✓ Mapping saved to: {MAPPING_PATH}")
    print(f"   ✓ Time: {elapsed:.3f}s")

    # Get file sizes
    index_size_mb = Path(INDEX_PATH).stat().st_size / (1024**2)
    mapping_size_mb = Path(MAPPING_PATH).stat().st_size / (1024**2)

    print("\n" + "=" * 60)
    print("FAISS INDEX BUILD COMPLETE")
    print("=" * 60)
    print(f"Total biographies indexed: {len(bio_ids):,}")
    print(f"Index file size: {index_size_mb:.2f} MB")
    print(f"Mapping file size: {mapping_size_mb:.2f} MB")
    print(f"Total size: {index_size_mb + mapping_size_mb:.2f} MB")
    print("\nThe MCP server will now load this index on startup for semantic search.")
    print("You can now use the 'semantic_search_biography' tool!")

    return True


def main():
    """Main entry point."""
    try:
        success = build_faiss_index()
        if not success:
            exit(1)
    except Exception as e:
        print(f"\n✗ ERROR: {e}")
        print("\nThis may be due to Python version incompatibility.")
        print("FAISS and sentence-transformers work best with Python 3.9-3.12")
        print(f"Current Python version: {os.sys.version}")
        print("\nThe database is still usable without semantic search.")
        import traceback
        traceback.print_exc()
        exit(1)


if __name__ == "__main__":
    main()
```
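
To sanity-check the artifacts this script writes, the index and ID mapping can be loaded back and queried directly. A minimal sketch that mirrors the lookup logic used by the servers:

```python
import pickle
import faiss
from sentence_transformers import SentenceTransformer

index = faiss.read_index("congress_faiss.index")
with open("congress_bio_ids.pkl", "rb") as f:
    bio_ids = pickle.load(f)

model = SentenceTransformer('all-MiniLM-L6-v2')
query = model.encode(["Civil War veterans who became lawyers"]).astype('float32')
faiss.normalize_L2(query)  # must match the normalization applied at build time

scores, idxs = index.search(query, 5)
for idx, score in zip(idxs[0], scores[0]):
    print(bio_ids[idx], f"{score:.3f}")
```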
faiss_build.log
ADDED

@@ -0,0 +1,46 @@

```
============================================================
BUILDING FAISS INDEX FOR CONGRESSIONAL BIOGUIDE
============================================================

1. Loading sentence transformer model...
   ✓ Model loaded in 2.021s

2. Loading biographies from database...
   ✓ Loaded 13,047 biographies in 0.211s

3. Preparing data for encoding...
   ✓ Prepared 13,047 texts
   ✓ Time: 0.000s

4. Generating embeddings...
   (This may take several minutes...)
   Encoded 3,200/13,047 (48 texts/sec, ~207s remaining)
   Encoded 6,400/13,047 (47 texts/sec, ~141s remaining)
   Encoded 9,600/13,047 (47 texts/sec, ~74s remaining)
   Encoded 12,800/13,047 (46 texts/sec, ~5s remaining)
   ✓ Generated 13,047 embeddings in 280.6s
   ✓ Shape: (13047, 384)

5. Building FAISS index...
   Dimension: 384
   ✓ Index built in 0.009s
   ✓ Total vectors in index: 13,047

6. Saving FAISS index to disk...
   ✓ Index saved to: /Users/electron/workspace/Nanocentury AI/NIO/BioGuideMCP/congress_faiss.index
   ✓ Time: 0.004s

7. Saving bio ID mapping...
   ✓ Mapping saved to: /Users/electron/workspace/Nanocentury AI/NIO/BioGuideMCP/congress_bio_ids.pkl
   ✓ Time: 0.001s

============================================================
FAISS INDEX BUILD COMPLETE
============================================================
Total biographies indexed: 13,047
Index file size: 19.11 MB
Mapping file size: 0.12 MB
Total size: 19.24 MB

The MCP server will now load this index on startup for semantic search.
You can now use the 'semantic_search_biography' tool!
```
gradio_app.py
ADDED

@@ -0,0 +1,574 @@

```python
#!/usr/bin/env python3
"""
Gradio MCP Server for Congressional Bioguide profiles.
Provides search and analysis capabilities via Gradio interface.
"""

import gradio as gr
import sqlite3
import json
import os
import warnings
from typing import List, Dict, Any
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import pickle
from pathlib import Path

# Suppress warnings
warnings.filterwarnings('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Initialize global resources
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
FAISS_INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
BIO_IDS_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")

# Global state
model = None
faiss_index = None
bio_id_mapping = None


def initialize_search_index():
    """Initialize the semantic search components."""
    global model, faiss_index, bio_id_mapping

    try:
        if Path(FAISS_INDEX_PATH).exists() and Path(BIO_IDS_PATH).exists():
            print(f"Loading FAISS index from: {FAISS_INDEX_PATH}")
            model = SentenceTransformer('all-MiniLM-L6-v2')
            faiss_index = faiss.read_index(FAISS_INDEX_PATH)
            with open(BIO_IDS_PATH, "rb") as f:
                bio_id_mapping = pickle.load(f)
            print(f"✓ Loaded {faiss_index.ntotal} embeddings")
            return True
        else:
            print("FAISS index not found. Semantic search will be unavailable.")
            return False
    except Exception as e:
        print(f"Error loading search index: {e}")
        return False


def get_db_connection():
    """Get a database connection."""
    return sqlite3.connect(DB_PATH)


def execute_query(query: str, params: tuple = ()) -> List[Dict[str, Any]]:
    """Execute a SQL query and return results as a list of dicts."""
    conn = get_db_connection()
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    cursor.execute(query, params)
    results = [dict(row) for row in cursor.fetchall()]
    conn.close()
    return results


# Initialize search index on startup
print("Initializing Congressional Bioguide MCP Server...")
initialize_search_index()


# MCP Tool Functions with decorators
@gr.mcp.tool()
def search_by_name(family_name: str = "", given_name: str = "", limit: int = 10) -> str:
    """
    Search for Congressional members by name.

    Args:
        family_name: Last name to search for (partial match)
        given_name: First name to search for (partial match)
        limit: Maximum number of results to return (default: 10)

    Returns:
        JSON string with search results including bio_id, name, birth/death dates, party, state
    """
    try:
        conditions = []
        params = []

        if family_name:
            conditions.append("LOWER(m.unaccented_family_name) LIKE LOWER(?)")
            params.append(f"%{family_name}%")
        if given_name:
            conditions.append("LOWER(m.unaccented_given_name) LIKE LOWER(?)")
            params.append(f"%{given_name}%")

        if not conditions:
            return json.dumps({"error": "Please provide at least family_name or given_name"})

        query = f"""
            SELECT DISTINCT m.bio_id, m.given_name, m.middle_name, m.family_name,
                   m.birth_date, m.death_date,
                   j.party, j.region_code, j.job_name, j.congress_number
            FROM members m
            LEFT JOIN job_positions j ON m.bio_id = j.bio_id
            WHERE {' AND '.join(conditions)}
            ORDER BY m.family_name, m.given_name
            LIMIT ?
        """
        params.append(limit)
        results = execute_query(query, tuple(params))

        return json.dumps({"count": len(results), "results": results}, indent=2)

    except Exception as e:
        return json.dumps({"error": str(e)})


@gr.mcp.tool()
def search_by_party(party: str, congress_number: int = None) -> str:
    """
    Search for Congressional members by political party.

    Args:
        party: Party name (e.g., 'Republican', 'Democrat', 'Whig')
        congress_number: Optional Congress number to filter by (e.g., 117)

    Returns:
        JSON string with members from the specified party
    """
    try:
        if congress_number:
            query = """
                SELECT DISTINCT m.bio_id, m.given_name, m.family_name, m.birth_date, m.death_date,
                       j.party, j.region_code, j.job_name, j.congress_number
                FROM members m
                JOIN job_positions j ON m.bio_id = j.bio_id
                WHERE j.party = ? AND j.congress_number = ?
                ORDER BY m.family_name, m.given_name
                LIMIT 100
            """
            results = execute_query(query, (party, congress_number))
        else:
            query = """
                SELECT DISTINCT m.bio_id, m.given_name, m.family_name, m.birth_date, m.death_date,
                       j.party, j.region_code, j.job_name, j.congress_number
                FROM members m
                JOIN job_positions j ON m.bio_id = j.bio_id
                WHERE j.party = ?
                ORDER BY m.family_name, m.given_name
                LIMIT 100
            """
            results = execute_query(query, (party,))

        return json.dumps({"count": len(results), "party": party, "results": results}, indent=2)

    except Exception as e:
        return json.dumps({"error": str(e)})


@gr.mcp.tool()
def search_by_state(state_code: str, congress_number: int = None) -> str:
    """
    Search for Congressional members by state.

    Args:
        state_code: Two-letter state code (e.g., 'CA', 'NY', 'TX')
        congress_number: Optional Congress number to filter by

    Returns:
        JSON string with members from the specified state
    """
    try:
        state_code = state_code.upper()

        if congress_number:
            query = """
                SELECT DISTINCT m.bio_id, m.given_name, m.family_name, m.birth_date, m.death_date,
                       j.party, j.region_code, j.job_name, j.congress_number
                FROM members m
                JOIN job_positions j ON m.bio_id = j.bio_id
                WHERE j.region_code = ? AND j.congress_number = ?
                ORDER BY m.family_name, m.given_name
                LIMIT 100
            """
            results = execute_query(query, (state_code, congress_number))
        else:
            query = """
                SELECT DISTINCT m.bio_id, m.given_name, m.family_name, m.birth_date, m.death_date,
                       j.party, j.region_code, j.job_name, j.congress_number
                FROM members m
                JOIN job_positions j ON m.bio_id = j.bio_id
                WHERE j.region_code = ?
                ORDER BY m.family_name, m.given_name
                LIMIT 100
            """
            results = execute_query(query, (state_code,))

        return json.dumps({"count": len(results), "state": state_code, "results": results}, indent=2)

    except Exception as e:
        return json.dumps({"error": str(e)})


@gr.mcp.tool()
def semantic_search_biography(query: str, top_k: int = 5) -> str:
    """
    Perform AI-powered semantic search on member biographies using natural language.

    Args:
        query: Natural language query (e.g., 'lawyers who became judges', 'Civil War veterans')
        top_k: Number of results to return (default: 5, max: 20)

    Returns:
        JSON string with matching members and their similarity scores
    """
    try:
        if not all([model, faiss_index, bio_id_mapping]):
            return json.dumps({"error": "Semantic search is not available. FAISS index not loaded."})

        # Limit top_k
        top_k = min(max(1, top_k), 20)

        # Encode query
        query_embedding = model.encode([query])[0].astype('float32')
        query_embedding = query_embedding.reshape(1, -1)
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = faiss_index.search(query_embedding, top_k)

        # Get profiles
        results = []
        for idx, score in zip(indices[0], scores[0]):
            if idx < len(bio_id_mapping):
                bio_id = bio_id_mapping[idx]
                member_query = """
                    SELECT m.bio_id, m.given_name, m.middle_name, m.family_name,
                           m.birth_date, m.death_date, m.profile_text,
                           j.party, j.region_code, j.job_name, j.congress_number
                    FROM members m
                    LEFT JOIN job_positions j ON m.bio_id = j.bio_id
                    WHERE m.bio_id = ?
                    LIMIT 1
                """
                member_data = execute_query(member_query, (bio_id,))
                if member_data:
                    member = member_data[0]
                    # Truncate profile_text for response
                    if member.get('profile_text'):
                        member['profile_text'] = member['profile_text'][:500] + "..."
                    member['similarity_score'] = float(score)
                    results.append(member)

        return json.dumps({"query": query, "count": len(results), "results": results}, indent=2)

    except Exception as e:
        return json.dumps({"error": str(e)})


@gr.mcp.tool()
def get_member_profile(bio_id: str) -> str:
    """
    Get the complete profile for a specific member by their Bioguide ID.

    Args:
        bio_id: Bioguide ID (e.g., 'L000313' for John Lewis, 'W000374')

    Returns:
        JSON string with complete member profile including positions and relationships
    """
    try:
        bio_id = bio_id.upper()

        conn = get_db_connection()
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        cursor.execute("SELECT * FROM members WHERE bio_id = ?", (bio_id,))
        member = cursor.fetchone()

        if not member:
            conn.close()
            return json.dumps({"error": f"No member found with bio_id: {bio_id}"})

        profile = dict(member)

        # Get job positions
        cursor.execute("SELECT * FROM job_positions WHERE bio_id = ? ORDER BY start_date", (bio_id,))
        profile['job_positions'] = [dict(row) for row in cursor.fetchall()]

        # Get relationships
        cursor.execute("SELECT * FROM relationships WHERE bio_id = ?", (bio_id,))
        profile['relationships'] = [dict(row) for row in cursor.fetchall()]

        # Get creative works
        cursor.execute("SELECT * FROM creative_works WHERE bio_id = ?", (bio_id,))
        profile['creative_works'] = [dict(row) for row in cursor.fetchall()]

        conn.close()

        return json.dumps(profile, indent=2)

    except Exception as e:
        return json.dumps({"error": str(e)})


@gr.mcp.tool()
def count_members_by_party(filter_congress: int = None) -> str:
    """
    Count members by political party.

    Args:
        filter_congress: Optional Congress number to filter by (e.g., 117)

    Returns:
        JSON string with member counts grouped by party
    """
    try:
        if filter_congress:
            query = """
                SELECT j.party as party, COUNT(DISTINCT m.bio_id) as count
                FROM members m
                JOIN job_positions j ON m.bio_id = j.bio_id
                WHERE j.congress_number = ?
                GROUP BY j.party
                ORDER BY count DESC
            """
            results = execute_query(query, (filter_congress,))
        else:
            query = """
                SELECT j.party as party, COUNT(DISTINCT m.bio_id) as count
                FROM members m
                JOIN job_positions j ON m.bio_id = j.bio_id
                GROUP BY j.party
                ORDER BY count DESC
            """
            results = execute_query(query)

        total = sum(r['count'] for r in results)
        return json.dumps({"total_members": total, "by_party": results}, indent=2)

    except Exception as e:
        return json.dumps({"error": str(e)})


@gr.mcp.tool()
def count_members_by_state(filter_congress: int = None) -> str:
    """
    Count members by state.

    Args:
        filter_congress: Optional Congress number to filter by

    Returns:
        JSON string with member counts grouped by state
    """
    try:
        if filter_congress:
            query = """
                SELECT j.region_code as state, COUNT(DISTINCT m.bio_id) as count
                FROM members m
                JOIN job_positions j ON m.bio_id = j.bio_id
                WHERE j.congress_number = ?
                GROUP BY j.region_code
                ORDER BY count DESC
            """
            results = execute_query(query, (filter_congress,))
        else:
            query = """
                SELECT j.region_code as state, COUNT(DISTINCT m.bio_id) as count
                FROM members m
                JOIN job_positions j ON m.bio_id = j.bio_id
                GROUP BY j.region_code
                ORDER BY count DESC
            """
            results = execute_query(query)

        total = sum(r['count'] for r in results)
        return json.dumps({"total_members": total, "by_state": results}, indent=2)

    except Exception as e:
        return json.dumps({"error": str(e)})


@gr.mcp.tool()
def execute_sql_query(query: str) -> str:
    """
    Execute a custom SQL SELECT query against the Congressional database (READ-ONLY).

    Args:
        query: SQL SELECT query to execute

    Returns:
        JSON string with query results
    """
    try:
        # Security: only allow SELECT queries
        if not query.strip().upper().startswith("SELECT"):
            return json.dumps({"error": "Only SELECT queries are allowed"})

        results = execute_query(query)
        return json.dumps({"count": len(results), "results": results}, indent=2)

    except Exception as e:
        return json.dumps({"error": str(e)})


@gr.mcp.tool()
def get_database_schema() -> str:
    """
    Get the database schema showing all tables and columns available for querying.

    Returns:
        JSON string with database schema information
    """
    schema_info = {
        "tables": {
            "members": {
                "description": "Main table with member biographical information",
                "columns": [
                    "bio_id (PRIMARY KEY) - Bioguide ID",
                    "family_name - Last name",
                    "given_name - First name",
                    "middle_name - Middle name",
                    "birth_date - Birth date (YYYY-MM-DD)",
                    "death_date - Death date (YYYY-MM-DD)",
                    "profile_text - Full biography text"
                ]
            },
            "job_positions": {
                "description": "Congressional positions held by members",
                "columns": [
                    "bio_id (FOREIGN KEY) - References members",
                    "job_name - Position title (Representative, Senator)",
                    "start_date - Start date of position",
                    "end_date - End date of position",
                    "congress_number - Congress number (e.g., 117)",
                    "party - Party affiliation",
                    "region_code - State/region code (e.g., 'CA', 'NY')"
                ]
            },
            "relationships": {
                "description": "Family relationships between members",
                "columns": ["bio_id", "related_bio_id", "relationship_type"]
            },
            "creative_works": {
                "description": "Publications and creative works by members",
                "columns": ["bio_id", "citation_text"]
            }
        }
    }
    return json.dumps(schema_info, indent=2)


# Create Gradio interfaces for each tool
demo = gr.TabbedInterface(
    [
        # Search by Name
        gr.Interface(
            fn=search_by_name,
            inputs=[
                gr.Textbox(label="Family Name (Last Name)", placeholder="e.g., Lincoln"),
                gr.Textbox(label="Given Name (First Name)", placeholder="e.g., Abraham"),
                gr.Slider(minimum=1, maximum=50, value=10, step=1, label="Max Results")
            ],
            outputs=gr.JSON(label="Search Results"),
            title="Search by Name",
            description="Search for Congressional members by their first or last name."
        ),

        # Search by Party
        gr.Interface(
            fn=search_by_party,
            inputs=[
                gr.Textbox(label="Party Name", placeholder="e.g., Republican, Democrat, Whig"),
                gr.Number(label="Congress Number (optional)", value=None, precision=0)
            ],
            outputs=gr.JSON(label="Search Results"),
            title="Search by Party",
            description="Find members by political party affiliation."
        ),

        # Search by State
        gr.Interface(
            fn=search_by_state,
            inputs=[
                gr.Textbox(label="State Code", placeholder="e.g., CA, NY, TX"),
                gr.Number(label="Congress Number (optional)", value=None, precision=0)
            ],
            outputs=gr.JSON(label="Search Results"),
            title="Search by State",
            description="Find members by the state they represented."
        ),

        # Semantic Search
        gr.Interface(
            fn=semantic_search_biography,
            inputs=[
                gr.Textbox(label="Search Query", placeholder="e.g., 'lawyers who became judges' or 'Civil War veterans'", lines=3),
                gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Results")
            ],
            outputs=gr.JSON(label="Search Results"),
            title="AI Semantic Search",
            description="Use natural language to search biographies. Find members by career, background, or accomplishments."
        ),

        # Get Member Profile
        gr.Interface(
            fn=get_member_profile,
            inputs=gr.Textbox(label="Bioguide ID", placeholder="e.g., L000313 (John Lewis)"),
            outputs=gr.JSON(label="Member Profile"),
            title="Get Member Profile",
            description="Get the complete profile for a specific member using their Bioguide ID."
        ),

        # Count by Party
        gr.Interface(
            fn=count_members_by_party,
            inputs=gr.Number(label="Filter by Congress Number (optional)", value=None, precision=0),
            outputs=gr.JSON(label="Party Counts"),
            title="Count by Party",
            description="Get member counts grouped by political party."
        ),

        # Count by State
        gr.Interface(
            fn=count_members_by_state,
            inputs=gr.Number(label="Filter by Congress Number (optional)", value=None, precision=0),
            outputs=gr.JSON(label="State Counts"),
            title="Count by State",
            description="Get member counts grouped by state."
        ),

        # SQL Query
        gr.Interface(
            fn=execute_sql_query,
            inputs=gr.Textbox(label="SQL Query", placeholder="SELECT * FROM members LIMIT 10", lines=3),
            outputs=gr.JSON(label="Query Results"),
            title="Execute SQL",
            description="Execute custom SQL SELECT queries (read-only)."
        ),

        # Database Schema
        gr.Interface(
            fn=get_database_schema,
            inputs=None,
            outputs=gr.JSON(label="Database Schema"),
            title="Database Schema",
            description="View the database structure and available tables/columns."
        ),
    ],
    tab_names=[
        "Search by Name",
        "Search by Party",
        "Search by State",
        "AI Semantic Search",
        "Member Profile",
        "Count by Party",
        "Count by State",
        "Execute SQL",
        "Database Schema"
    ],
    title="🏛️ Congressional Bioguide MCP Server",
    theme=gr.themes.Soft()
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)
```
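
Because each tool above is a plain function that returns a JSON string, the tools can be exercised without launching the UI, e.g. from a Python shell in the repo directory:

```python
from gradio_app import search_by_name, semantic_search_biography

# Each tool returns its results as a JSON string
print(search_by_name(family_name="Lincoln", limit=3))
print(semantic_search_biography("Civil War veterans who became lawyers", top_k=3))
```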
ingest_data.py
ADDED
|
@@ -0,0 +1,447 @@
#!/usr/bin/env python3
"""
Ingestion script for Congressional Bioguide profiles.
Creates SQLite database and FAISS semantic search index.
"""

import json
import sqlite3
import os
import time
from pathlib import Path
from typing import Dict, List, Any
import faiss
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer


class BioguideIngester:
    def __init__(self, data_dir: str = "BioguideProfiles", db_path: str = "congress.db"):
        self.data_dir = Path(data_dir)
        self.db_path = db_path
        self.model = None  # Load model only when needed for FAISS indexing

    def create_database_schema(self):
        """Create SQLite database schema for Congressional profiles."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        # Main members table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS members (
                bio_id TEXT PRIMARY KEY,
                family_name TEXT,
                given_name TEXT,
                middle_name TEXT,
                honorific_prefix TEXT,
                unaccented_family_name TEXT,
                unaccented_given_name TEXT,
                unaccented_middle_name TEXT,
                birth_date TEXT,
                birth_circa INTEGER,
                death_date TEXT,
                death_circa INTEGER,
                profile_text TEXT,
                full_name TEXT GENERATED ALWAYS AS (
                    COALESCE(honorific_prefix || ' ', '') ||
                    COALESCE(given_name, '') || ' ' ||
                    COALESCE(middle_name || ' ', '') ||
                    COALESCE(family_name, '')
                ) STORED
            )
        """)

        # Images table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS images (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                bio_id TEXT,
                content_url TEXT,
                caption TEXT,
                FOREIGN KEY (bio_id) REFERENCES members(bio_id)
            )
        """)

        # Job positions table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS job_positions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                bio_id TEXT,
                job_name TEXT,
                job_type TEXT,
                start_date TEXT,
                start_circa INTEGER,
                end_date TEXT,
                end_circa INTEGER,
                congress_number INTEGER,
                congress_name TEXT,
                party TEXT,
                caucus TEXT,
                region_type TEXT,
                region_code TEXT,
                note TEXT,
                FOREIGN KEY (bio_id) REFERENCES members(bio_id)
            )
        """)

        # Relationships table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS relationships (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                bio_id TEXT,
                related_bio_id TEXT,
                relationship_type TEXT,
                FOREIGN KEY (bio_id) REFERENCES members(bio_id),
                FOREIGN KEY (related_bio_id) REFERENCES members(bio_id)
            )
        """)

        # Creative works table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS creative_works (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                bio_id TEXT,
                citation_text TEXT,
                FOREIGN KEY (bio_id) REFERENCES members(bio_id)
            )
        """)

        # Assets table
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS assets (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                bio_id TEXT,
                name TEXT,
                asset_type TEXT,
                content_url TEXT,
                credit_line TEXT,
                accession_number TEXT,
                upload_date TEXT,
                FOREIGN KEY (bio_id) REFERENCES members(bio_id)
            )
        """)

        # Create indexes for common queries
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_family_name ON members(unaccented_family_name)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_given_name ON members(unaccented_given_name)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_birth_date ON members(birth_date)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_death_date ON members(death_date)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_job_congress ON job_positions(congress_number)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_job_party ON job_positions(party)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_job_region ON job_positions(region_code)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_job_type ON job_positions(job_name)")

        conn.commit()
        conn.close()
        print("β Database schema created")

    def extract_data_field(self, data: Dict[str, Any], key: str, default=None):
        """Safely extract data from nested 'data' field if it exists."""
        if 'data' in data:
            return data['data'].get(key, default)
        return data.get(key, default)

    def ingest_profiles(self):
        """Ingest all JSON profiles into SQLite database."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        profile_files = list(self.data_dir.glob("*.json"))
        total = len(profile_files)

        print(f"Ingesting {total} profiles...")

        for idx, profile_file in enumerate(profile_files, 1):
            if idx % 1000 == 0:
                print(f"  Processed {idx}/{total} profiles...")

            try:
                with open(profile_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Handle nested 'data' structure
                bio_id = self.extract_data_field(data, 'usCongressBioId')
                if not bio_id:
                    print(f"  Skipping {profile_file}: no bio_id found")
                    continue

                # Insert member data
                cursor.execute("""
                    INSERT OR REPLACE INTO members (
                        bio_id, family_name, given_name, middle_name, honorific_prefix,
                        unaccented_family_name, unaccented_given_name, unaccented_middle_name,
                        birth_date, birth_circa, death_date, death_circa, profile_text
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    bio_id,
                    self.extract_data_field(data, 'familyName'),
                    self.extract_data_field(data, 'givenName'),
                    self.extract_data_field(data, 'middleName'),
                    self.extract_data_field(data, 'honorificPrefix'),
                    self.extract_data_field(data, 'unaccentedFamilyName'),
                    self.extract_data_field(data, 'unaccentedGivenName'),
                    self.extract_data_field(data, 'unaccentedMiddleName'),
                    self.extract_data_field(data, 'birthDate'),
                    1 if self.extract_data_field(data, 'birthCirca') else 0,
                    self.extract_data_field(data, 'deathDate'),
                    1 if self.extract_data_field(data, 'deathCirca') else 0,
                    self.extract_data_field(data, 'profileText')
                ))

                # Insert images
                images = self.extract_data_field(data, 'image', [])
                for img in images:
                    cursor.execute("""
                        INSERT INTO images (bio_id, content_url, caption)
                        VALUES (?, ?, ?)
                    """, (bio_id, img.get('contentUrl'), img.get('caption')))

                # Insert job positions
                job_positions = self.extract_data_field(data, 'jobPositions', [])
                for job_pos in job_positions:
                    job = job_pos.get('job', {})
                    congress_aff = job_pos.get('congressAffiliation', {})
                    congress = congress_aff.get('congress', {})
                    party_list = congress_aff.get('partyAffiliation', [])
                    caucus_list = congress_aff.get('caucusAffiliation', [])
                    represents = congress_aff.get('represents', {})
                    notes = congress_aff.get('note', [])
                    note_text = notes[0].get('content') if notes else None

                    cursor.execute("""
                        INSERT INTO job_positions (
                            bio_id, job_name, job_type, start_date, start_circa,
                            end_date, end_circa, congress_number, congress_name,
                            party, caucus, region_type, region_code, note
                        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """, (
                        bio_id,
                        job.get('name'),
                        job.get('jobType'),
                        job_pos.get('startDate'),
                        1 if job_pos.get('startCirca') else 0,
                        job_pos.get('endDate'),
                        1 if job_pos.get('endCirca') else 0,
                        congress.get('congressNumber'),
                        congress.get('name'),
                        party_list[0].get('party', {}).get('name') if party_list else None,
                        caucus_list[0].get('party', {}).get('name') if caucus_list else None,
                        represents.get('regionType'),
                        represents.get('regionCode'),
                        note_text
                    ))

                # Insert relationships
                relationships = self.extract_data_field(data, 'relationship', [])
                for rel in relationships:
                    related = rel.get('relatedTo', {})
                    cursor.execute("""
                        INSERT INTO relationships (bio_id, related_bio_id, relationship_type)
                        VALUES (?, ?, ?)
                    """, (bio_id, related.get('usCongressBioId'), rel.get('relationshipType')))

                # Insert creative works
                creative_works = self.extract_data_field(data, 'creativeWork', [])
                for work in creative_works:
                    cursor.execute("""
                        INSERT INTO creative_works (bio_id, citation_text)
                        VALUES (?, ?)
                    """, (bio_id, work.get('freeFormCitationText')))

                # Insert assets
                assets = self.extract_data_field(data, 'asset', [])
                for asset in assets:
                    cursor.execute("""
                        INSERT INTO assets (
                            bio_id, name, asset_type, content_url, credit_line,
                            accession_number, upload_date
                        ) VALUES (?, ?, ?, ?, ?, ?, ?)
                    """, (
                        bio_id,
                        asset.get('name'),
                        asset.get('assetType'),
                        asset.get('contentUrl'),
                        asset.get('creditLine'),
                        asset.get('accessionNumber'),
                        asset.get('uploadDate')
                    ))

            except Exception as e:
                print(f"  Error processing {profile_file}: {e}")
                continue

        conn.commit()
        conn.close()
        print(f"β Ingested {total} profiles into database")

    def build_faiss_index(self):
        """Build FAISS index for semantic search on profile biographies."""
        print("\n" + "=" * 60)
        print("BUILDING FAISS INDEX FOR SEMANTIC SEARCH")
        print("=" * 60)

        try:
            # Load model
            print("\n1. Loading sentence transformer model...")
            start_time = time.time()

            # Disable all parallelism to avoid Python 3.14 issues
            os.environ['TOKENIZERS_PARALLELISM'] = 'false'
            os.environ['OMP_NUM_THREADS'] = '1'
            os.environ['MKL_NUM_THREADS'] = '1'
            os.environ['OPENBLAS_NUM_THREADS'] = '1'

            import torch
            torch.set_num_threads(1)

            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            print(f"   β Model loaded in {time.time() - start_time:.3f}s")

            # Load biographies from database
            print("\n2. Loading biographies from database...")
            start_time = time.time()
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            cursor.execute("SELECT bio_id, profile_text FROM members WHERE profile_text IS NOT NULL")
            profiles = cursor.fetchall()
            conn.close()
            print(f"   β Loaded {len(profiles):,} biographies in {time.time() - start_time:.3f}s")

            if len(profiles) == 0:
                print("\nβ ERROR: No profiles with text found in database!")
                return False

            # Prepare data
            print("\n3. Preparing data for encoding...")
            start_time = time.time()
            bio_ids = [p[0] for p in profiles]
            texts = [p[1] if p[1] else "" for p in profiles]
            print(f"   β Prepared {len(bio_ids):,} texts")
            print(f"   β Time: {time.time() - start_time:.3f}s")

            # Generate embeddings in batches
            print("\n4. Generating embeddings...")
            start_time = time.time()
            batch_size = 32
            embeddings = []

            for i in range(0, len(texts), batch_size):
                batch = texts[i:i + batch_size]
                batch_embeddings = self.model.encode(
                    batch,
                    show_progress_bar=False,
                    convert_to_numpy=True,
                    normalize_embeddings=False,
                    device='cpu'  # Explicit CPU to avoid GPU issues
                )
                embeddings.extend(batch_embeddings)

                # Progress update every 100 batches
                if (i // batch_size + 1) % 100 == 0:
                    elapsed = time.time() - start_time
                    rate = (i + len(batch)) / elapsed
                    print(f"   Encoded {i + len(batch):,}/{len(texts):,} ({rate:.0f} texts/sec)")

            embeddings = np.array(embeddings, dtype=np.float32)
            elapsed = time.time() - start_time
            print(f"   β Generated {len(embeddings):,} embeddings in {elapsed:.3f}s")
            print(f"   β Shape: {embeddings.shape}")

            # Build FAISS index
            print("\n5. Building FAISS index...")
            start_time = time.time()
            dimension = embeddings.shape[1]
            print(f"   Dimension: {dimension}")

            # Use IndexFlatIP for exact cosine similarity search
            index = faiss.IndexFlatIP(dimension)

            # Normalize embeddings for cosine similarity
            faiss.normalize_L2(embeddings)

            # Add to index
            index.add(embeddings)
            print(f"   β Index built in {time.time() - start_time:.3f}s")
            print(f"   β Total vectors in index: {index.ntotal:,}")

            # Save FAISS index
            print("\n6. Saving FAISS index to disk...")
            start_time = time.time()
            faiss.write_index(index, "congress_faiss.index")
            print("   β Index saved to: congress_faiss.index")
            print(f"   β Time: {time.time() - start_time:.3f}s")

            # Save bio ID mapping
            print("\n7. Saving bio ID mapping...")
            start_time = time.time()
            with open("congress_bio_ids.pkl", "wb") as f:
                pickle.dump(bio_ids, f)
            print("   β Mapping saved to: congress_bio_ids.pkl")
            print(f"   β Time: {time.time() - start_time:.3f}s")

            # Get file sizes
            index_size_mb = Path("congress_faiss.index").stat().st_size / (1024**2)
            mapping_size_mb = Path("congress_bio_ids.pkl").stat().st_size / (1024**2)

            print("\n" + "=" * 60)
            print("FAISS INDEX BUILD COMPLETE")
            print("=" * 60)
            print(f"Total embeddings indexed: {len(bio_ids):,}")
            print(f"Index file size: {index_size_mb:.2f} MB")
            print(f"Mapping file size: {mapping_size_mb:.2f} MB")
            print(f"Total size: {index_size_mb + mapping_size_mb:.2f} MB")
            print("\nThe MCP server will load this index on startup for fast searches.")

            return True

        except Exception as e:
            print(f"\nβ ERROR building FAISS index: {e}")
            print("   This may be due to Python 3.14 compatibility issues.")
            print("   The database is still usable, but semantic search will not work.")
            print("   Consider using Python 3.11 or 3.12 for full functionality.")
            import traceback
            traceback.print_exc()
            return False

    def run(self):
        """Run the complete ingestion pipeline."""
        print("Starting Congressional Bioguide ingestion...")
        print("=" * 60)

        try:
            self.create_database_schema()
            self.ingest_profiles()
            faiss_success = self.build_faiss_index()

            print("\n" + "=" * 60)
            print("INGESTION COMPLETE")
            print("=" * 60)
            print(f"Database: {self.db_path}")

            if faiss_success:
                print("FAISS index: congress_faiss.index β")
                print("ID mapping: congress_bio_ids.pkl β")
                print("\nAll features available, including semantic search!")
            else:
                print("FAISS index: β (failed to build)")
                print("\nDatabase is ready, but semantic search is unavailable.")
                print("All other MCP tools will work normally.")

            return faiss_success

        except Exception as e:
            print(f"\nβ FATAL ERROR: {e}")
            import traceback
            traceback.print_exc()
            return False


def main():
    ingester = BioguideIngester()
    ingester.run()


if __name__ == "__main__":
    main()
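The pipeline above leaves behind congress.db plus the two search artifacts, which can be sanity-checked without the MCP server. A minimal sketch, assuming ingest_data.py has already been run in the current directory; it repeats the same encode, normalize, and inner-product search steps build_faiss_index() uses:

# Query the saved index directly, mirroring the normalization used at build time.
import pickle
import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
index = faiss.read_index("congress_faiss.index")
with open("congress_bio_ids.pkl", "rb") as f:
    bio_ids = pickle.load(f)

query = model.encode(["Civil War veterans"], convert_to_numpy=True).astype('float32')
faiss.normalize_L2(query)  # queries must be normalized like the corpus
scores, indices = index.search(query, 5)
for idx, score in zip(indices[0], scores[0]):
    print(bio_ids[idx], f"{score:.3f}")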
mcp_config_example.json
ADDED
@@ -0,0 +1,11 @@
{
  "mcpServers": {
    "congressional-bioguide": {
      "command": "/Users/electron/workspace/Nanocentury AI/NIO/BioGuideMCP/venv/bin/python",
      "args": [
        "/Users/electron/workspace/Nanocentury AI/NIO/BioGuideMCP/server.py"
      ],
      "cwd": "/Users/electron/workspace/Nanocentury AI/NIO/BioGuideMCP"
    }
  }
}
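The example config pins one developer's absolute paths, which anyone cloning the repo must rewrite. A hypothetical helper (not part of this commit) that prints the equivalent config for the current checkout and interpreter:

# Hypothetical helper: emit an mcp_config_example.json-style config
# using this checkout's paths and the active Python interpreter.
import json
import sys
from pathlib import Path

root = Path(__file__).resolve().parent
config = {
    "mcpServers": {
        "congressional-bioguide": {
            "command": sys.executable,
            "args": [str(root / "server.py")],
            "cwd": str(root),
        }
    }
}
print(json.dumps(config, indent=2))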
requirements-minimal.txt
ADDED
@@ -0,0 +1,3 @@
# Minimal requirements for database-only mode (no semantic search)
# Works with any Python version including 3.14+
mcp>=0.9.0
requirements.txt
ADDED
@@ -0,0 +1,7 @@
# Requires Python 3.10-3.13 (NOT 3.14+ due to FAISS incompatibility)
mcp>=1.0.0
numpy>=1.24.0
sentence-transformers>=2.2.0
torch>=2.0.0
faiss-cpu>=1.7.4
gradio>=5.0.0
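Since the version constraint above lives only in a comment, pip will happily install these on Python 3.14 and the failure surfaces later, at import time. A small pre-flight check, offered as a sketch, that enforces the same window before installation:

# Sketch of a pre-install guard matching the 3.10-3.13 window noted above.
import sys

if not ((3, 10) <= sys.version_info[:2] <= (3, 13)):
    raise SystemExit(
        f"Python {sys.version_info.major}.{sys.version_info.minor} detected; "
        "faiss-cpu requires Python 3.10-3.13 (see requirements.txt)."
    )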
server.py
ADDED
@@ -0,0 +1,1219 @@
#!/usr/bin/env python3
"""
MCP Server for Congressional Bioguide profiles.
Provides SQL queries and semantic search capabilities.
"""

import sys
import sqlite3
import json
import os
import warnings
from typing import List, Dict, Any, Optional
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import pickle
from pathlib import Path

from mcp.server import Server
from mcp.types import Tool, TextContent, ImageContent, EmbeddedResource
import mcp.server.stdio

# Suppress all warnings to prevent JSON protocol corruption
warnings.filterwarnings('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'


# Initialize global resources - use absolute paths
SCRIPT_DIR = Path(__file__).parent.absolute()
DB_PATH = str(SCRIPT_DIR / "congress.db")
FAISS_INDEX_PATH = str(SCRIPT_DIR / "congress_faiss.index")
BIO_IDS_PATH = str(SCRIPT_DIR / "congress_bio_ids.pkl")

# Load FAISS index and model
model = None
faiss_index = None
bio_id_mapping = None


def initialize_search_index():
    """Initialize the semantic search components."""
    global model, faiss_index, bio_id_mapping

    try:
        if Path(FAISS_INDEX_PATH).exists() and Path(BIO_IDS_PATH).exists():
            print(f"Loading FAISS index from: {FAISS_INDEX_PATH}", file=sys.stderr, flush=True)
            model = SentenceTransformer('all-MiniLM-L6-v2')
            faiss_index = faiss.read_index(FAISS_INDEX_PATH)
            with open(BIO_IDS_PATH, "rb") as f:
                bio_id_mapping = pickle.load(f)
            print(f"β Loaded {faiss_index.ntotal} embeddings", file=sys.stderr, flush=True)
            return True
        else:
            print(f"FAISS index not found at: {FAISS_INDEX_PATH}", file=sys.stderr, flush=True)
            print(f"Bio IDs not found at: {BIO_IDS_PATH}", file=sys.stderr, flush=True)
            return False
    except Exception as e:
        print(f"Error loading search index: {e}", file=sys.stderr, flush=True)
        return False


def get_db_connection():
    """Get a database connection."""
    return sqlite3.connect(DB_PATH)


def execute_query(query: str, params: tuple = ()) -> List[Dict[str, Any]]:
    """Execute a SQL query and return results as a list of dicts."""
    conn = get_db_connection()
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    cursor.execute(query, params)
    results = [dict(row) for row in cursor.fetchall()]
    conn.close()
    return results


def format_member_concise(member: Dict[str, Any]) -> Dict[str, Any]:
    """Format member data to concise output with only essential fields."""
    return {
        'bio_id': member.get('bio_id'),
        'name': f"{member.get('given_name', '')} {member.get('middle_name', '') + ' ' if member.get('middle_name') else ''}{member.get('family_name', '')}".strip(),
        'birth_date': member.get('birth_date'),
        'death_date': member.get('death_date'),
        'party': member.get('party'),
        'state': member.get('region_code'),
        'position': member.get('job_name'),
        'congress': member.get('congress_number')
    }


def get_member_profile(bio_id: str) -> Optional[Dict[str, Any]]:
    """Get complete profile for a member including all related data."""
    conn = get_db_connection()
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    # Get member data
    cursor.execute("SELECT * FROM members WHERE bio_id = ?", (bio_id,))
    member = cursor.fetchone()
    if not member:
        conn.close()
        return None

    profile = dict(member)

    # Get images
    cursor.execute("SELECT * FROM images WHERE bio_id = ?", (bio_id,))
    profile['images'] = [dict(row) for row in cursor.fetchall()]

    # Get job positions
    cursor.execute("SELECT * FROM job_positions WHERE bio_id = ? ORDER BY start_date", (bio_id,))
    profile['job_positions'] = [dict(row) for row in cursor.fetchall()]

    # Get relationships
    cursor.execute("SELECT * FROM relationships WHERE bio_id = ?", (bio_id,))
    profile['relationships'] = [dict(row) for row in cursor.fetchall()]

    # Get creative works
    cursor.execute("SELECT * FROM creative_works WHERE bio_id = ?", (bio_id,))
    profile['creative_works'] = [dict(row) for row in cursor.fetchall()]

    # Get assets
    cursor.execute("SELECT * FROM assets WHERE bio_id = ?", (bio_id,))
    profile['assets'] = [dict(row) for row in cursor.fetchall()]

    conn.close()
    return profile


def semantic_search(query_text: str, top_k: int = 10) -> List[Dict[str, Any]]:
    """Perform semantic search and return matching bio_ids with similarity scores."""
    if not all([model, faiss_index, bio_id_mapping]):
        raise ValueError("Search index not initialized. Run ingest_data.py first.")

    # Encode query
    query_embedding = model.encode([query_text])[0].astype('float32')
    query_embedding = query_embedding.reshape(1, -1)

    # Normalize for cosine similarity
    faiss.normalize_L2(query_embedding)

    # Search
    scores, indices = faiss_index.search(query_embedding, top_k)

    # Map indices to bio_ids
    results = []
    for idx, score in zip(indices[0], scores[0]):
        if idx < len(bio_id_mapping):
            results.append({
                'bio_id': bio_id_mapping[idx],
                'similarity_score': float(score)
            })

    return results

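# --- Illustrative usage (editorial note, not part of the committed server.py) ---
# A hedged sketch of how the helpers above compose: semantic_search() returns
# (bio_id, score) pairs, and get_member_profile() expands each hit into a full
# record -- exactly what the semantic_search_biography tool does further below.
#
#     if initialize_search_index():
#         for hit in semantic_search("Civil War veterans", top_k=3):
#             profile = get_member_profile(hit['bio_id'])
#             print(hit['similarity_score'], profile['full_name'])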
# Initialize MCP server
server = Server("congressional-bioguide")


@server.list_tools()
async def list_tools() -> List[Tool]:
    """List all available tools."""
    return [
        Tool(
            name="search_by_name",
            description="Search for Congressional members by name. Returns concise results (name, dates, party, congress) by default.",
            inputSchema={
                "type": "object",
                "properties": {
                    "family_name": {
                        "type": "string",
                        "description": "Family/last name to search for (partial match)"
                    },
                    "given_name": {
                        "type": "string",
                        "description": "Given/first name to search for (partial match)"
                    },
                    "full_name": {
                        "type": "string",
                        "description": "Full name to search for (partial match in any name field)"
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Maximum results to return (default: 50)",
                        "default": 50
                    },
                    "return_full_profile": {
                        "type": "boolean",
                        "description": "Return full profile data including biography (default: false)",
                        "default": False
                    }
                }
            }
        ),
        Tool(
            name="search_by_party",
            description="Search for Congressional members by political party affiliation.",
            inputSchema={
                "type": "object",
                "properties": {
                    "party": {
                        "type": "string",
                        "description": "Party name (e.g., 'Republican', 'Democrat', 'Whig')"
                    },
                    "congress_number": {
                        "type": "integer",
                        "description": "Optional: Filter by specific Congress number (e.g., 117)"
                    }
                },
                "required": ["party"]
            }
        ),
        Tool(
            name="search_by_state",
            description="Search for Congressional members by state or region they represented.",
            inputSchema={
                "type": "object",
                "properties": {
                    "state_code": {
                        "type": "string",
                        "description": "State code (e.g., 'CA', 'NY', 'TX')"
                    },
                    "congress_number": {
                        "type": "integer",
                        "description": "Optional: Filter by specific Congress number"
                    }
                },
                "required": ["state_code"]
            }
        ),
        Tool(
            name="search_by_congress",
            description="Get all members who served in a specific Congress.",
            inputSchema={
                "type": "object",
                "properties": {
                    "congress_number": {
                        "type": "integer",
                        "description": "Congress number (e.g., 117 for the 117th Congress)"
                    },
                    "chamber": {
                        "type": "string",
                        "description": "Optional: Filter by chamber ('Representative' or 'Senator')"
                    }
                },
                "required": ["congress_number"]
            }
        ),
        Tool(
            name="search_by_date_range",
            description="Search for members who served during a specific date range.",
            inputSchema={
                "type": "object",
                "properties": {
                    "start_date": {
                        "type": "string",
                        "description": "Start date in YYYY-MM-DD format"
                    },
                    "end_date": {
                        "type": "string",
                        "description": "End date in YYYY-MM-DD format"
                    }
                },
                "required": ["start_date", "end_date"]
            }
        ),
        Tool(
            name="semantic_search_biography",
            description="Perform semantic search on member biographies. Use natural language to find members based on career details, accomplishments, background, etc.",
            inputSchema={
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Natural language query to search biographies (e.g., 'lawyers who became judges', 'Civil War veterans')"
                    },
                    "top_k": {
                        "type": "integer",
                        "description": "Number of results to return (default: 10)",
                        "default": 10
                    }
                },
                "required": ["query"]
            }
        ),
        Tool(
            name="get_member_profile",
            description="Get complete profile information for a specific member by their Bioguide ID.",
            inputSchema={
                "type": "object",
                "properties": {
                    "bio_id": {
                        "type": "string",
                        "description": "Bioguide ID (e.g., 'W000374', 'P000144')"
                    }
                },
                "required": ["bio_id"]
            }
        ),
        Tool(
            name="execute_sql_query",
            description="Execute a custom SQL query against the Congressional database. Use for complex queries not covered by other tools. READ-ONLY access.",
            inputSchema={
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "SQL SELECT query to execute"
                    }
                },
                "required": ["query"]
            }
        ),
        Tool(
            name="get_database_schema",
            description="Get the database schema showing all tables and columns available for querying.",
            inputSchema={
                "type": "object",
                "properties": {}
            }
        ),
        Tool(
            name="search_by_relationship",
            description="Find members who have family relationships with other members (e.g., father, son, spouse).",
            inputSchema={
                "type": "object",
                "properties": {
                    "relationship_type": {
                        "type": "string",
                        "description": "Type of relationship (e.g., 'father', 'son', 'spouse', 'brother')"
                    }
                }
            }
        ),
        Tool(
            name="search_biography_regex",
            description="Search member biographies using regex patterns. Returns concise member info (name, dates, party, state) for matches. Use filters to narrow results.",
            inputSchema={
                "type": "object",
                "properties": {
                    "pattern": {
                        "type": "string",
                        "description": "Regex pattern to search for in biographies (e.g., 'Harvard', 'lawyer', 'served.*army', 'born in [0-9]{4}')"
                    },
                    "case_sensitive": {
                        "type": "boolean",
                        "description": "Whether search should be case-sensitive (default: false)",
                        "default": False
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Maximum number of results to return (default: 5)",
                        "default": 5
                    },
                    "filter_party": {
                        "type": "string",
                        "description": "Optional: Filter results by party (e.g., 'Republican', 'Democrat')"
                    },
                    "filter_state": {
                        "type": "string",
                        "description": "Optional: Filter results by state code (e.g., 'CA', 'NY')"
                    },
                    "filter_congress": {
                        "type": "integer",
                        "description": "Optional: Filter results by Congress number (e.g., 117)"
                    },
                    "return_full_profile": {
                        "type": "boolean",
                        "description": "Return full profile including biography text (default: false)",
                        "default": False
                    }
                },
                "required": ["pattern"]
            }
        ),
        Tool(
            name="count_members",
            description="Count members matching specific criteria. Returns aggregated counts by party, state, position, or custom grouping. Much more efficient than returning full member lists.",
            inputSchema={
                "type": "object",
                "properties": {
                    "group_by": {
                        "type": "string",
                        "description": "Field to group by: 'party', 'state', 'position', 'congress', or 'year'",
                        "enum": ["party", "state", "position", "congress", "year"]
                    },
                    "filter_party": {
                        "type": "string",
                        "description": "Optional: Filter by party name"
                    },
                    "filter_state": {
                        "type": "string",
                        "description": "Optional: Filter by state code"
                    },
                    "filter_congress": {
                        "type": "integer",
                        "description": "Optional: Filter by Congress number"
                    },
                    "filter_position": {
                        "type": "string",
                        "description": "Optional: Filter by position (Representative, Senator)"
                    },
                    "date_range_start": {
                        "type": "string",
                        "description": "Optional: Start date (YYYY-MM-DD)"
                    },
                    "date_range_end": {
                        "type": "string",
                        "description": "Optional: End date (YYYY-MM-DD)"
                    }
                },
                "required": ["group_by"]
            }
        ),
        Tool(
            name="temporal_analysis",
            description="Analyze member trends over time. Shows how membership changed across years, decades, or congresses. Perfect for historical analysis.",
            inputSchema={
                "type": "object",
                "properties": {
                    "analysis_type": {
                        "type": "string",
                        "description": "Type of temporal analysis",
                        "enum": ["party_over_time", "state_representation", "position_counts", "demographics"]
                    },
                    "time_unit": {
                        "type": "string",
                        "description": "Time granularity: 'congress', 'year', 'decade'",
                        "enum": ["congress", "year", "decade"],
                        "default": "congress"
                    },
                    "start_date": {
                        "type": "string",
                        "description": "Optional: Start date (YYYY-MM-DD)"
                    },
                    "end_date": {
                        "type": "string",
                        "description": "Optional: End date (YYYY-MM-DD)"
                    },
                    "filter_party": {
                        "type": "string",
                        "description": "Optional: Filter to specific party"
                    },
                    "filter_state": {
                        "type": "string",
                        "description": "Optional: Filter to specific state"
                    }
                },
                "required": ["analysis_type"]
            }
        ),
        Tool(
            name="count_by_biography_content",
            description="Count members whose biographies mention specific keywords or phrases (e.g., 'Harvard', 'lawyer', 'Civil War'). Much more efficient than searching when you only need counts.",
            inputSchema={
                "type": "object",
                "properties": {
                    "keywords": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of keywords or phrases to search for (case-insensitive)"
                    },
                    "match_all": {
                        "type": "boolean",
                        "description": "If true, count members matching ALL keywords. If false, count members matching ANY keyword (default: false)",
                        "default": False
                    },
                    "breakdown_by": {
                        "type": "string",
                        "description": "Optional: Break down counts by party, state, position, or congress",
                        "enum": ["party", "state", "position", "congress", "none"],
                        "default": "none"
                    },
                    "filter_party": {
                        "type": "string",
                        "description": "Optional: Only count members from specific party"
                    },
                    "filter_state": {
                        "type": "string",
                        "description": "Optional: Only count members from specific state"
                    }
                },
                "required": ["keywords"]
            }
        )
    ]


@server.call_tool()
async def call_tool(name: str, arguments: Any) -> List[TextContent]:
    """Handle tool calls."""

    try:
        if name == "search_by_name":
            family_name = arguments.get("family_name")
            given_name = arguments.get("given_name")
            full_name = arguments.get("full_name")
            limit = arguments.get("limit", 50)
            return_full = arguments.get("return_full_profile", False)

            conditions = []
            params = []

            if family_name:
                conditions.append("LOWER(m.unaccented_family_name) LIKE LOWER(?)")
                params.append(f"%{family_name}%")
            if given_name:
                conditions.append("LOWER(m.unaccented_given_name) LIKE LOWER(?)")
                params.append(f"%{given_name}%")
            if full_name:
                conditions.append("""(LOWER(m.unaccented_family_name) LIKE LOWER(?)
                                   OR LOWER(m.unaccented_given_name) LIKE LOWER(?)
                                   OR LOWER(m.unaccented_middle_name) LIKE LOWER(?))""")
                params.extend([f"%{full_name}%"] * 3)

            if not conditions:
                return [TextContent(type="text", text="Please provide at least one name parameter.")]

            if return_full:
                query = f"SELECT * FROM members m WHERE {' AND '.join(conditions)} ORDER BY m.family_name, m.given_name LIMIT ?"
                params.append(limit)
                results = execute_query(query, tuple(params))
            else:
                # Return concise results with job info
                query = f"""
                    SELECT DISTINCT m.bio_id, m.given_name, m.middle_name, m.family_name,
                           m.birth_date, m.death_date,
                           j.party, j.region_code, j.job_name, j.congress_number
                    FROM members m
                    LEFT JOIN job_positions j ON m.bio_id = j.bio_id
                    WHERE {' AND '.join(conditions)}
                    ORDER BY m.family_name, m.given_name
                    LIMIT ?
                """
                params.append(limit)
                results = execute_query(query, tuple(params))
                results = [format_member_concise(r) for r in results]

            response = {
                "count": len(results),
                "limit": limit,
                "results": results
            }
            return [TextContent(type="text", text=json.dumps(response, indent=2))]

        elif name == "search_by_party":
            party = arguments["party"]
            congress_number = arguments.get("congress_number")

            if congress_number:
                query = """
                    SELECT DISTINCT m.* FROM members m
                    JOIN job_positions j ON m.bio_id = j.bio_id
                    WHERE j.party = ? AND j.congress_number = ?
                    ORDER BY m.family_name, m.given_name
                """
                results = execute_query(query, (party, congress_number))
            else:
                query = """
                    SELECT DISTINCT m.* FROM members m
                    JOIN job_positions j ON m.bio_id = j.bio_id
                    WHERE j.party = ?
                    ORDER BY m.family_name, m.given_name
                """
                results = execute_query(query, (party,))

            return [TextContent(type="text", text=json.dumps(results, indent=2))]

        elif name == "search_by_state":
            state_code = arguments["state_code"].upper()
            congress_number = arguments.get("congress_number")

            if congress_number:
                query = """
                    SELECT DISTINCT m.*, j.job_name, j.party, j.congress_number
                    FROM members m
                    JOIN job_positions j ON m.bio_id = j.bio_id
                    WHERE j.region_code = ? AND j.congress_number = ?
                    ORDER BY m.family_name, m.given_name
                """
                results = execute_query(query, (state_code, congress_number))
            else:
                query = """
                    SELECT DISTINCT m.*, j.job_name, j.party, j.congress_number
                    FROM members m
                    JOIN job_positions j ON m.bio_id = j.bio_id
                    WHERE j.region_code = ?
                    ORDER BY m.family_name, m.given_name
                """
                results = execute_query(query, (state_code,))

            return [TextContent(type="text", text=json.dumps(results, indent=2))]

        elif name == "search_by_congress":
            congress_number = arguments["congress_number"]
            chamber = arguments.get("chamber")

            if chamber:
                query = """
                    SELECT DISTINCT m.*, j.job_name, j.party, j.region_code
                    FROM members m
                    JOIN job_positions j ON m.bio_id = j.bio_id
                    WHERE j.congress_number = ? AND j.job_name = ?
                    ORDER BY m.family_name, m.given_name
                """
                results = execute_query(query, (congress_number, chamber))
            else:
                query = """
                    SELECT DISTINCT m.*, j.job_name, j.party, j.region_code
                    FROM members m
                    JOIN job_positions j ON m.bio_id = j.bio_id
                    WHERE j.congress_number = ?
                    ORDER BY m.family_name, m.given_name
                """
                results = execute_query(query, (congress_number,))

            return [TextContent(type="text", text=json.dumps(results, indent=2))]

        elif name == "search_by_date_range":
            start_date = arguments["start_date"]
            end_date = arguments["end_date"]

            query = """
                SELECT DISTINCT m.*, j.job_name, j.start_date, j.end_date
                FROM members m
                JOIN job_positions j ON m.bio_id = j.bio_id
                WHERE (j.start_date <= ? AND (j.end_date >= ? OR j.end_date IS NULL))
                ORDER BY j.start_date, m.family_name, m.given_name
            """
            results = execute_query(query, (end_date, start_date))

            return [TextContent(type="text", text=json.dumps(results, indent=2))]

        elif name == "semantic_search_biography":
            query_text = arguments["query"]
            top_k = arguments.get("top_k", 10)

            # Perform semantic search
            search_results = semantic_search(query_text, top_k)

            # Get full profiles for top results
            profiles = []
            for result in search_results:
                profile = get_member_profile(result['bio_id'])
                if profile:
                    profile['similarity_score'] = result['similarity_score']
                    profiles.append(profile)

            return [TextContent(type="text", text=json.dumps(profiles, indent=2))]

        elif name == "get_member_profile":
            bio_id = arguments["bio_id"]
            profile = get_member_profile(bio_id)

            if profile:
                return [TextContent(type="text", text=json.dumps(profile, indent=2))]
            else:
                return [TextContent(type="text", text=f"No profile found for bio_id: {bio_id}")]

        elif name == "execute_sql_query":
            query = arguments["query"]

            # Basic security: only allow SELECT queries
            if not query.strip().upper().startswith("SELECT"):
                return [TextContent(type="text", text="Error: Only SELECT queries are allowed.")]

            results = execute_query(query)
            return [TextContent(type="text", text=json.dumps(results, indent=2))]

        elif name == "get_database_schema":
            schema_info = {
                "tables": {
                    "members": {
                        "description": "Main table with member biographical information",
                        "columns": [
                            "bio_id (PRIMARY KEY) - Bioguide ID",
                            "family_name - Last name",
                            "given_name - First name",
                            "middle_name - Middle name",
                            "honorific_prefix - Title (Mr., Mrs., etc.)",
                            "unaccented_family_name - Family name without accents",
                            "unaccented_given_name - Given name without accents",
                            "unaccented_middle_name - Middle name without accents",
                            "birth_date - Birth date (YYYY-MM-DD)",
                            "birth_circa - Whether birth date is approximate (0/1)",
                            "death_date - Death date (YYYY-MM-DD)",
                            "death_circa - Whether death date is approximate (0/1)",
                            "profile_text - Full biography text",
                            "full_name - Generated full name column"
                        ]
                    },
                    "job_positions": {
                        "description": "Congressional positions held by members",
                        "columns": [
                            "id (PRIMARY KEY)",
                            "bio_id (FOREIGN KEY) - References members",
                            "job_name - Position title (Representative, Senator)",
                            "job_type - Type of position",
                            "start_date - Start date of position",
                            "start_circa - Whether start date is approximate (0/1)",
                            "end_date - End date of position",
                            "end_circa - Whether end date is approximate (0/1)",
                            "congress_number - Congress number (e.g., 117)",
                            "congress_name - Full Congress name",
                            "party - Party affiliation",
                            "caucus - Caucus affiliation",
                            "region_type - Type of region represented",
                            "region_code - State/region code (e.g., 'CA', 'NY')",
                            "note - Additional notes"
                        ]
                    },
                    "images": {
                        "description": "Profile images",
                        "columns": ["id", "bio_id", "content_url", "caption"]
                    },
                    "relationships": {
                        "description": "Family relationships between members",
                        "columns": ["id", "bio_id", "related_bio_id", "relationship_type"]
                    },
                    "creative_works": {
                        "description": "Publications and creative works by members",
                        "columns": ["id", "bio_id", "citation_text"]
                    },
                    "assets": {
                        "description": "Additional assets (images, documents)",
                        "columns": ["id", "bio_id", "name", "asset_type", "content_url",
                                    "credit_line", "accession_number", "upload_date"]
                    }
                },
                "indexes": [
                    "idx_family_name - Index on unaccented_family_name",
                    "idx_given_name - Index on unaccented_given_name",
                    "idx_birth_date - Index on birth_date",
                    "idx_death_date - Index on death_date",
                    "idx_job_congress - Index on congress_number",
                    "idx_job_party - Index on party",
                    "idx_job_region - Index on region_code",
                    "idx_job_type - Index on job_name"
                ]
            }

            return [TextContent(type="text", text=json.dumps(schema_info, indent=2))]

        elif name == "search_by_relationship":
            relationship_type = arguments.get("relationship_type")

            if relationship_type:
                query = """
                    SELECT m1.bio_id, m1.family_name, m1.given_name,
                           r.relationship_type, r.related_bio_id,
                           m2.family_name as related_family_name,
                           m2.given_name as related_given_name
                    FROM members m1
                    JOIN relationships r ON m1.bio_id = r.bio_id
                    JOIN members m2 ON r.related_bio_id = m2.bio_id
                    WHERE r.relationship_type = ?
                    ORDER BY m1.family_name, m1.given_name
                """
                results = execute_query(query, (relationship_type,))
            else:
                query = """
                    SELECT m1.bio_id, m1.family_name, m1.given_name,
                           r.relationship_type, r.related_bio_id,
                           m2.family_name as related_family_name,
                           m2.given_name as related_given_name
                    FROM members m1
                    JOIN relationships r ON m1.bio_id = r.bio_id
                    JOIN members m2 ON r.related_bio_id = m2.bio_id
                    ORDER BY m1.family_name, m1.given_name
                """
                results = execute_query(query)

            return [TextContent(type="text", text=json.dumps(results, indent=2))]

        elif name == "search_biography_regex":
            import re

            pattern = arguments["pattern"]
            case_sensitive = arguments.get("case_sensitive", False)
            limit = arguments.get("limit", 5)
            filter_party = arguments.get("filter_party")
            filter_state = arguments.get("filter_state")
            filter_congress = arguments.get("filter_congress")
            return_full = arguments.get("return_full_profile", False)

            try:
                # Compile regex pattern
                flags = 0 if case_sensitive else re.IGNORECASE
                regex = re.compile(pattern, flags)
|
| 792 |
+
|
| 793 |
+
# Build query with optional filters
|
| 794 |
+
conn = get_db_connection()
|
| 795 |
+
conn.row_factory = sqlite3.Row
|
| 796 |
+
cursor = conn.cursor()
|
| 797 |
+
|
| 798 |
+
# Base query - join with job_positions for filtering
|
| 799 |
+
query = """
|
| 800 |
+
SELECT DISTINCT m.bio_id, m.family_name, m.given_name, m.middle_name,
|
| 801 |
+
m.birth_date, m.death_date, m.profile_text,
|
| 802 |
+
j.party, j.region_code, j.job_name, j.congress_number
|
| 803 |
+
FROM members m
|
| 804 |
+
LEFT JOIN job_positions j ON m.bio_id = j.bio_id
|
| 805 |
+
WHERE m.profile_text IS NOT NULL
|
| 806 |
+
"""
|
| 807 |
+
|
| 808 |
+
where_conditions = []
|
| 809 |
+
params = []
|
| 810 |
+
|
| 811 |
+
if filter_party:
|
| 812 |
+
where_conditions.append("j.party = ?")
|
| 813 |
+
params.append(filter_party)
|
| 814 |
+
if filter_state:
|
| 815 |
+
where_conditions.append("j.region_code = ?")
|
| 816 |
+
params.append(filter_state)
|
| 817 |
+
if filter_congress:
|
| 818 |
+
where_conditions.append("j.congress_number = ?")
|
| 819 |
+
params.append(filter_congress)
|
| 820 |
+
|
| 821 |
+
if where_conditions:
|
| 822 |
+
query += " AND " + " AND ".join(where_conditions)
|
| 823 |
+
|
| 824 |
+
cursor.execute(query, tuple(params))
|
| 825 |
+
|
| 826 |
+
# Filter using regex
|
| 827 |
+
matches = []
|
| 828 |
+
for row in cursor:
|
| 829 |
+
if regex.search(row['profile_text']):
|
| 830 |
+
if return_full:
|
| 831 |
+
# Return full profile
|
| 832 |
+
matches.append(dict(row))
|
| 833 |
+
else:
|
| 834 |
+
# Return concise info only
|
| 835 |
+
match_result = {
|
| 836 |
+
"bio_id": row['bio_id'],
|
| 837 |
+
"name": f"{row['given_name']} {row['middle_name'] or ''} {row['family_name']}".strip(),
|
| 838 |
+
"birth_date": row['birth_date'],
|
| 839 |
+
"death_date": row['death_date'],
|
| 840 |
+
"party": row['party'],
|
| 841 |
+
"state": row['region_code'],
|
| 842 |
+
"position": row['job_name'],
|
| 843 |
+
"congress": row['congress_number']
|
| 844 |
+
}
|
| 845 |
+
matches.append(match_result)
|
| 846 |
+
|
| 847 |
+
if len(matches) >= limit:
|
| 848 |
+
break
|
| 849 |
+
|
| 850 |
+
conn.close()
|
| 851 |
+
|
| 852 |
+
result = {
|
| 853 |
+
"pattern": pattern,
|
| 854 |
+
"case_sensitive": case_sensitive,
|
| 855 |
+
"total_members_found": len(matches),
|
| 856 |
+
"limit": limit,
|
| 857 |
+
"filters_applied": {
|
| 858 |
+
"party": filter_party,
|
| 859 |
+
"state": filter_state,
|
| 860 |
+
"congress": filter_congress
|
| 861 |
+
},
|
| 862 |
+
"results": matches
|
| 863 |
+
}
|
| 864 |
+
|
| 865 |
+
return [TextContent(type="text", text=json.dumps(result, indent=2))]
|
| 866 |
+
|
| 867 |
+
except re.error as e:
|
| 868 |
+
return [TextContent(type="text", text=f"Invalid regex pattern: {str(e)}")]
|
| 869 |
+
|
| 870 |
+
elif name == "count_members":
|
| 871 |
+
group_by = arguments["group_by"]
|
| 872 |
+
filter_party = arguments.get("filter_party")
|
| 873 |
+
filter_state = arguments.get("filter_state")
|
| 874 |
+
filter_congress = arguments.get("filter_congress")
|
| 875 |
+
filter_position = arguments.get("filter_position")
|
| 876 |
+
date_start = arguments.get("date_range_start")
|
| 877 |
+
date_end = arguments.get("date_range_end")
|
| 878 |
+
|
| 879 |
+
# Build WHERE clause
|
| 880 |
+
where_conditions = []
|
| 881 |
+
params = []
|
| 882 |
+
|
| 883 |
+
if filter_party:
|
| 884 |
+
where_conditions.append("j.party = ?")
|
| 885 |
+
params.append(filter_party)
|
| 886 |
+
if filter_state:
|
| 887 |
+
where_conditions.append("j.region_code = ?")
|
| 888 |
+
params.append(filter_state)
|
| 889 |
+
if filter_congress:
|
| 890 |
+
where_conditions.append("j.congress_number = ?")
|
| 891 |
+
params.append(filter_congress)
|
| 892 |
+
if filter_position:
|
| 893 |
+
where_conditions.append("j.job_name = ?")
|
| 894 |
+
params.append(filter_position)
|
| 895 |
+
if date_start and date_end:
|
| 896 |
+
where_conditions.append("(j.start_date <= ? AND (j.end_date >= ? OR j.end_date IS NULL))")
|
| 897 |
+
params.extend([date_end, date_start])
|
| 898 |
+
|
| 899 |
+
where_clause = "WHERE " + " AND ".join(where_conditions) if where_conditions else ""
|
| 900 |
+
|
| 901 |
+
# Build GROUP BY query
|
| 902 |
+
if group_by == "party":
|
| 903 |
+
query = f"""
|
| 904 |
+
SELECT j.party as group_key, COUNT(DISTINCT m.bio_id) as count
|
| 905 |
+
FROM members m
|
| 906 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 907 |
+
{where_clause}
|
| 908 |
+
GROUP BY j.party
|
| 909 |
+
ORDER BY count DESC
|
| 910 |
+
"""
|
| 911 |
+
elif group_by == "state":
|
| 912 |
+
query = f"""
|
| 913 |
+
SELECT j.region_code as group_key, COUNT(DISTINCT m.bio_id) as count
|
| 914 |
+
FROM members m
|
| 915 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 916 |
+
{where_clause}
|
| 917 |
+
GROUP BY j.region_code
|
| 918 |
+
ORDER BY count DESC
|
| 919 |
+
"""
|
| 920 |
+
elif group_by == "position":
|
| 921 |
+
query = f"""
|
| 922 |
+
SELECT j.job_name as group_key, COUNT(DISTINCT m.bio_id) as count
|
| 923 |
+
FROM members m
|
| 924 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 925 |
+
{where_clause}
|
| 926 |
+
GROUP BY j.job_name
|
| 927 |
+
ORDER BY count DESC
|
| 928 |
+
"""
|
| 929 |
+
elif group_by == "congress":
|
| 930 |
+
query = f"""
|
| 931 |
+
SELECT j.congress_number as group_key, COUNT(DISTINCT m.bio_id) as count
|
| 932 |
+
FROM members m
|
| 933 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 934 |
+
{where_clause}
|
| 935 |
+
GROUP BY j.congress_number
|
| 936 |
+
ORDER BY j.congress_number
|
| 937 |
+
"""
|
| 938 |
+
elif group_by == "year":
|
| 939 |
+
query = f"""
|
| 940 |
+
SELECT SUBSTR(j.start_date, 1, 4) as group_key, COUNT(DISTINCT m.bio_id) as count
|
| 941 |
+
FROM members m
|
| 942 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 943 |
+
{where_clause}
|
| 944 |
+
GROUP BY SUBSTR(j.start_date, 1, 4)
|
| 945 |
+
ORDER BY group_key
|
| 946 |
+
"""
|
| 947 |
+
|
| 948 |
+
results = execute_query(query, tuple(params))
|
| 949 |
+
total = sum(r['count'] for r in results)
|
| 950 |
+
|
| 951 |
+
response = {
|
| 952 |
+
"group_by": group_by,
|
| 953 |
+
"total_unique_members": total,
|
| 954 |
+
"groups": results,
|
| 955 |
+
"filters_applied": {
|
| 956 |
+
"party": filter_party,
|
| 957 |
+
"state": filter_state,
|
| 958 |
+
"congress": filter_congress,
|
| 959 |
+
"position": filter_position,
|
| 960 |
+
"date_range": [date_start, date_end] if date_start and date_end else None
|
| 961 |
+
}
|
| 962 |
+
}
|
| 963 |
+
|
| 964 |
+
return [TextContent(type="text", text=json.dumps(response, indent=2))]
|
| 965 |
+
|
| 966 |
+
elif name == "temporal_analysis":
|
| 967 |
+
analysis_type = arguments["analysis_type"]
|
| 968 |
+
time_unit = arguments.get("time_unit", "congress")
|
| 969 |
+
start_date = arguments.get("start_date")
|
| 970 |
+
end_date = arguments.get("end_date")
|
| 971 |
+
filter_party = arguments.get("filter_party")
|
| 972 |
+
filter_state = arguments.get("filter_state")
|
| 973 |
+
|
| 974 |
+
# Build WHERE clause
|
| 975 |
+
where_conditions = []
|
| 976 |
+
params = []
|
| 977 |
+
|
| 978 |
+
if start_date:
|
| 979 |
+
where_conditions.append("j.start_date >= ?")
|
| 980 |
+
params.append(start_date)
|
| 981 |
+
if end_date:
|
| 982 |
+
where_conditions.append("j.start_date <= ?")
|
| 983 |
+
params.append(end_date)
|
| 984 |
+
if filter_party:
|
| 985 |
+
where_conditions.append("j.party = ?")
|
| 986 |
+
params.append(filter_party)
|
| 987 |
+
if filter_state:
|
| 988 |
+
where_conditions.append("j.region_code = ?")
|
| 989 |
+
params.append(filter_state)
|
| 990 |
+
|
| 991 |
+
where_clause = "WHERE " + " AND ".join(where_conditions) if where_conditions else ""
|
| 992 |
+
|
| 993 |
+
if analysis_type == "party_over_time":
|
| 994 |
+
if time_unit == "congress":
|
| 995 |
+
query = f"""
|
| 996 |
+
SELECT j.congress_number, j.party, COUNT(DISTINCT m.bio_id) as count
|
| 997 |
+
FROM members m
|
| 998 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 999 |
+
{where_clause}
|
| 1000 |
+
GROUP BY j.congress_number, j.party
|
| 1001 |
+
ORDER BY j.congress_number, j.party
|
| 1002 |
+
"""
|
| 1003 |
+
elif time_unit == "year":
|
| 1004 |
+
query = f"""
|
| 1005 |
+
SELECT SUBSTR(j.start_date, 1, 4) as year, j.party, COUNT(DISTINCT m.bio_id) as count
|
| 1006 |
+
FROM members m
|
| 1007 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 1008 |
+
{where_clause}
|
| 1009 |
+
GROUP BY year, j.party
|
| 1010 |
+
ORDER BY year, j.party
|
| 1011 |
+
"""
|
| 1012 |
+
elif time_unit == "decade":
|
| 1013 |
+
query = f"""
|
| 1014 |
+
SELECT (CAST(SUBSTR(j.start_date, 1, 4) AS INTEGER) / 10) * 10 as decade,
|
| 1015 |
+
j.party, COUNT(DISTINCT m.bio_id) as count
|
| 1016 |
+
FROM members m
|
| 1017 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 1018 |
+
{where_clause}
|
| 1019 |
+
GROUP BY decade, j.party
|
| 1020 |
+
ORDER BY decade, j.party
|
| 1021 |
+
"""
|
| 1022 |
+
|
| 1023 |
+
elif analysis_type == "state_representation":
|
| 1024 |
+
if time_unit == "congress":
|
| 1025 |
+
query = f"""
|
| 1026 |
+
SELECT j.congress_number, j.region_code, COUNT(DISTINCT m.bio_id) as count
|
| 1027 |
+
FROM members m
|
| 1028 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 1029 |
+
{where_clause}
|
| 1030 |
+
GROUP BY j.congress_number, j.region_code
|
| 1031 |
+
ORDER BY j.congress_number, count DESC
|
| 1032 |
+
"""
|
| 1033 |
+
else:
|
| 1034 |
+
query = f"""
|
| 1035 |
+
SELECT SUBSTR(j.start_date, 1, 4) as year, j.region_code, COUNT(DISTINCT m.bio_id) as count
|
| 1036 |
+
FROM members m
|
| 1037 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 1038 |
+
{where_clause}
|
| 1039 |
+
GROUP BY year, j.region_code
|
| 1040 |
+
ORDER BY year, count DESC
|
| 1041 |
+
"""
|
| 1042 |
+
|
| 1043 |
+
elif analysis_type == "position_counts":
|
| 1044 |
+
query = f"""
|
| 1045 |
+
SELECT j.congress_number, j.job_name, COUNT(DISTINCT m.bio_id) as count
|
| 1046 |
+
FROM members m
|
| 1047 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 1048 |
+
{where_clause}
|
| 1049 |
+
GROUP BY j.congress_number, j.job_name
|
| 1050 |
+
ORDER BY j.congress_number
|
| 1051 |
+
"""
|
| 1052 |
+
|
| 1053 |
+
elif analysis_type == "demographics":
|
| 1054 |
+
# Analyze birth year distribution over time
|
| 1055 |
+
if time_unit == "congress":
|
| 1056 |
+
query = f"""
|
| 1057 |
+
SELECT j.congress_number,
|
| 1058 |
+
AVG(CAST(SUBSTR(m.birth_date, 1, 4) AS INTEGER)) as avg_birth_year,
|
| 1059 |
+
COUNT(DISTINCT m.bio_id) as count
|
| 1060 |
+
FROM members m
|
| 1061 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 1062 |
+
{where_clause}
|
| 1063 |
+
GROUP BY j.congress_number
|
| 1064 |
+
ORDER BY j.congress_number
|
| 1065 |
+
"""
|
| 1066 |
+
else:
|
| 1067 |
+
query = f"""
|
| 1068 |
+
SELECT SUBSTR(j.start_date, 1, 4) as year,
|
| 1069 |
+
AVG(CAST(SUBSTR(m.birth_date, 1, 4) AS INTEGER)) as avg_birth_year,
|
| 1070 |
+
COUNT(DISTINCT m.bio_id) as count
|
| 1071 |
+
FROM members m
|
| 1072 |
+
JOIN job_positions j ON m.bio_id = j.bio_id
|
| 1073 |
+
{where_clause}
|
| 1074 |
+
GROUP BY year
|
| 1075 |
+
ORDER BY year
|
| 1076 |
+
"""
|
| 1077 |
+
|
| 1078 |
+
results = execute_query(query, tuple(params))
|
| 1079 |
+
|
| 1080 |
+
response = {
|
| 1081 |
+
"analysis_type": analysis_type,
|
| 1082 |
+
"time_unit": time_unit,
|
| 1083 |
+
"data_points": len(results),
|
| 1084 |
+
"results": results,
|
| 1085 |
+
"filters_applied": {
|
| 1086 |
+
"start_date": start_date,
|
| 1087 |
+
"end_date": end_date,
|
| 1088 |
+
"party": filter_party,
|
| 1089 |
+
"state": filter_state
|
| 1090 |
+
}
|
| 1091 |
+
}
|
| 1092 |
+
|
| 1093 |
+
return [TextContent(type="text", text=json.dumps(response, indent=2))]
|
| 1094 |
+
|
| 1095 |
+
elif name == "count_by_biography_content":
|
| 1096 |
+
keywords = arguments["keywords"]
|
| 1097 |
+
match_all = arguments.get("match_all", False)
|
| 1098 |
+
breakdown_by = arguments.get("breakdown_by", "none")
|
| 1099 |
+
filter_party = arguments.get("filter_party")
|
| 1100 |
+
filter_state = arguments.get("filter_state")
|
| 1101 |
+
|
| 1102 |
+
# Build the query to find matching members
|
| 1103 |
+
conn = get_db_connection()
|
| 1104 |
+
conn.row_factory = sqlite3.Row
|
| 1105 |
+
cursor = conn.cursor()
|
| 1106 |
+
|
| 1107 |
+
# Get all members with their job info
|
| 1108 |
+
base_query = """
|
| 1109 |
+
SELECT DISTINCT m.bio_id, m.profile_text,
|
| 1110 |
+
j.party, j.region_code, j.job_name, j.congress_number
|
| 1111 |
+
FROM members m
|
| 1112 |
+
LEFT JOIN job_positions j ON m.bio_id = j.bio_id
|
| 1113 |
+
WHERE m.profile_text IS NOT NULL
|
| 1114 |
+
"""
|
| 1115 |
+
|
| 1116 |
+
where_conditions = []
|
| 1117 |
+
params = []
|
| 1118 |
+
|
| 1119 |
+
if filter_party:
|
| 1120 |
+
where_conditions.append("j.party = ?")
|
| 1121 |
+
params.append(filter_party)
|
| 1122 |
+
if filter_state:
|
| 1123 |
+
where_conditions.append("j.region_code = ?")
|
| 1124 |
+
params.append(filter_state)
|
| 1125 |
+
|
| 1126 |
+
if where_conditions:
|
| 1127 |
+
base_query += " AND " + " AND ".join(where_conditions)
|
| 1128 |
+
|
| 1129 |
+
cursor.execute(base_query, tuple(params))
|
| 1130 |
+
all_members = cursor.fetchall()
|
| 1131 |
+
|
| 1132 |
+
# Filter members by keywords
|
| 1133 |
+
matching_members = []
|
| 1134 |
+
for member in all_members:
|
| 1135 |
+
profile_text_lower = member['profile_text'].lower() if member['profile_text'] else ""
|
| 1136 |
+
|
| 1137 |
+
if match_all:
|
| 1138 |
+
# ALL keywords must be present
|
| 1139 |
+
if all(keyword.lower() in profile_text_lower for keyword in keywords):
|
| 1140 |
+
matching_members.append(dict(member))
|
| 1141 |
+
else:
|
| 1142 |
+
# ANY keyword must be present
|
| 1143 |
+
if any(keyword.lower() in profile_text_lower for keyword in keywords):
|
| 1144 |
+
matching_members.append(dict(member))
|
| 1145 |
+
|
| 1146 |
+
conn.close()
|
| 1147 |
+
|
| 1148 |
+
# Count total unique members
|
| 1149 |
+
unique_bio_ids = set(m['bio_id'] for m in matching_members)
|
| 1150 |
+
total_count = len(unique_bio_ids)
|
| 1151 |
+
|
| 1152 |
+
# Breakdown if requested
|
| 1153 |
+
breakdown = None
|
| 1154 |
+
if breakdown_by != "none" and matching_members:
|
| 1155 |
+
breakdown_counts = {}
|
| 1156 |
+
|
| 1157 |
+
for member in matching_members:
|
| 1158 |
+
if breakdown_by == "party":
|
| 1159 |
+
key = member.get('party', 'Unknown')
|
| 1160 |
+
elif breakdown_by == "state":
|
| 1161 |
+
key = member.get('region_code', 'Unknown')
|
| 1162 |
+
elif breakdown_by == "position":
|
| 1163 |
+
key = member.get('job_name', 'Unknown')
|
| 1164 |
+
elif breakdown_by == "congress":
|
| 1165 |
+
key = member.get('congress_number', 'Unknown')
|
| 1166 |
+
else:
|
| 1167 |
+
key = 'Unknown'
|
| 1168 |
+
|
| 1169 |
+
if key not in breakdown_counts:
|
| 1170 |
+
breakdown_counts[key] = set()
|
| 1171 |
+
breakdown_counts[key].add(member['bio_id'])
|
| 1172 |
+
|
| 1173 |
+
# Convert sets to counts
|
| 1174 |
+
breakdown = [
|
| 1175 |
+
{"group": k, "count": len(v)}
|
| 1176 |
+
for k, v in sorted(breakdown_counts.items(), key=lambda x: len(x[1]), reverse=True)
|
| 1177 |
+
]
|
| 1178 |
+
|
| 1179 |
+
response = {
|
| 1180 |
+
"keywords": keywords,
|
| 1181 |
+
"match_all": match_all,
|
| 1182 |
+
"total_members_matching": total_count,
|
| 1183 |
+
"breakdown_by": breakdown_by,
|
| 1184 |
+
"breakdown": breakdown,
|
| 1185 |
+
"filters_applied": {
|
| 1186 |
+
"party": filter_party,
|
| 1187 |
+
"state": filter_state
|
| 1188 |
+
}
|
| 1189 |
+
}
|
| 1190 |
+
|
| 1191 |
+
return [TextContent(type="text", text=json.dumps(response, indent=2))]
|
| 1192 |
+
|
| 1193 |
+
else:
|
| 1194 |
+
return [TextContent(type="text", text=f"Unknown tool: {name}")]
|
| 1195 |
+
|
| 1196 |
+
except Exception as e:
|
| 1197 |
+
return [TextContent(type="text", text=f"Error executing tool {name}: {str(e)}")]
|
| 1198 |
+
|
| 1199 |
+
|
| 1200 |
+
async def main():
|
| 1201 |
+
"""Main entry point for the MCP server."""
|
| 1202 |
+
# Initialize search index (log to stderr to not interfere with stdio JSON protocol)
|
| 1203 |
+
if initialize_search_index():
|
| 1204 |
+
print("Search index loaded successfully", file=sys.stderr, flush=True)
|
| 1205 |
+
else:
|
| 1206 |
+
print("Warning: Search index not found. Run ingest_data.py to create it.", file=sys.stderr, flush=True)
|
| 1207 |
+
|
| 1208 |
+
# Run the server
|
| 1209 |
+
async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
|
| 1210 |
+
await server.run(
|
| 1211 |
+
read_stream,
|
| 1212 |
+
write_stream,
|
| 1213 |
+
server.create_initialization_options()
|
| 1214 |
+
)
|
| 1215 |
+
|
| 1216 |
+
|
| 1217 |
+
if __name__ == "__main__":
|
| 1218 |
+
import asyncio
|
| 1219 |
+
asyncio.run(main())
|
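With the handler chain above in place, the server can be smoke-tested end to end over stdio. The following is a minimal sketch, not part of this commit: it assumes the `ClientSession`/`stdio_client` API of the `mcp` Python package (1.x), that `server.py` is in the working directory, and that result attributes such as `result.content[0].text` match the SDK version installed; adjust names if your SDK differs.

    # Hypothetical smoke test; assumes the mcp 1.x client API.
    import asyncio
    from mcp import ClientSession, StdioServerParameters
    from mcp.client.stdio import stdio_client

    async def smoke_test():
        params = StdioServerParameters(command="python3", args=["server.py"])
        async with stdio_client(params) as (read, write):
            async with ClientSession(read, write) as session:
                await session.initialize()
                # Exercise one of the handlers defined above
                result = await session.call_tool(
                    "count_members",
                    arguments={"group_by": "party", "filter_state": "IL"},
                )
                print(result.content[0].text)

    asyncio.run(smoke_test())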
setup.sh
ADDED
@@ -0,0 +1,86 @@
#!/bin/bash
# Setup script for Congressional Bioguide MCP Server

set -e

echo "Setting up Congressional Bioguide MCP Server..."
echo "=============================================="

# Check for compatible Python versions
PYTHON_CMD=""

# Try to find a compatible Python version (3.10-3.13)
for version in python3.13 python3.12 python3.11 python3.10; do
    if command -v $version &> /dev/null; then
        PYTHON_CMD=$version
        echo "✓ Found compatible Python: $($PYTHON_CMD --version)"
        break
    fi
done

# Fall back to python3 if no specific version found
if [ -z "$PYTHON_CMD" ]; then
    if command -v python3 &> /dev/null; then
        PYTHON_CMD=python3
        PYTHON_VERSION=$($PYTHON_CMD --version 2>&1 | awk '{print $2}')
        MAJOR=$(echo $PYTHON_VERSION | cut -d. -f1)
        MINOR=$(echo $PYTHON_VERSION | cut -d. -f2)

        echo "⚠️  Found Python $PYTHON_VERSION"

        if [ "$MAJOR" -eq 3 ] && [ "$MINOR" -ge 14 ]; then
            echo ""
            echo "ERROR: Python 3.14+ is not compatible with the FAISS library"
            echo ""
            echo "Please install Python 3.13 or 3.12 using pyenv:"
            echo "  brew install pyenv"
            echo "  pyenv install 3.13"
            echo "  pyenv local 3.13"
            echo "  ./setup.sh"
            echo ""
            exit 1
        elif [ "$MAJOR" -eq 3 ] && [ "$MINOR" -lt 10 ]; then
            echo "ERROR: Python 3.10 or higher required (found $PYTHON_VERSION)"
            exit 1
        fi
    else
        echo "ERROR: Python 3 not found"
        exit 1
    fi
fi

# Create virtual environment if it doesn't exist
if [ ! -d "venv" ]; then
    echo "Creating virtual environment with $PYTHON_CMD..."
    $PYTHON_CMD -m venv venv
    echo "✓ Virtual environment created"
else
    echo "✓ Virtual environment already exists"
fi

# Activate virtual environment
source venv/bin/activate

# Verify we're using the venv python
echo "Using Python: $(which python3)"
echo "Version: $(python3 --version)"

# Install dependencies
echo ""
echo "Installing dependencies..."
pip install --upgrade pip
pip install -r requirements.txt
echo "✓ Dependencies installed"

# Run ingestion
echo ""
echo "Running data ingestion..."
python3 ingest_data.py

echo ""
echo "=============================================="
echo "✓ Setup complete!"
echo ""
echo "To run the server:"
echo "  source venv/bin/activate"
echo "  python3 server.py"
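The 3.14 gate above exists because FAISS wheels trail new CPython releases. A script that imports faiss can enforce the same constraint at runtime; the guard below is a minimal sketch (the check and its message are illustrative, not from this repo):

    import sys

    # Mirror setup.sh's version gate: FAISS wheels target CPython 3.10-3.13
    # at the time of this commit. (Illustrative guard, not part of the repo.)
    if not ((3, 10) <= sys.version_info[:2] <= (3, 13)):
        raise SystemExit(
            f"Unsupported Python {sys.version.split()[0]}; use 3.10-3.13 (see setup.sh)."
        )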
test_embeddings_data.py
ADDED
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""
Test the embeddings data to check for issues before FAISS operations.
"""

import sys
import os
import sqlite3
import numpy as np

print("=" * 60)
print("EMBEDDINGS DATA VALIDATION TEST")
print("=" * 60)
print(f"Python version: {sys.version}")
print()

# Load model
print("Loading sentence transformer...")
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ Model loaded\n")

# Load ALL biographies
print("Loading ALL biographies from database...")
conn = sqlite3.connect("congress.db")
cursor = conn.cursor()
cursor.execute("""
    SELECT bio_id, profile_text
    FROM members
    WHERE profile_text IS NOT NULL AND profile_text != ''
""")
rows = cursor.fetchall()
conn.close()

bio_ids = [r[0] for r in rows]
texts = [r[1] for r in rows]
print(f"✓ Loaded {len(texts)} biographies\n")

# Encode ALL
print("Encoding all biographies...")
print("(This will take a few minutes...)")
embeddings = []
batch_size = 32

for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    batch_embeddings = model.encode(
        batch,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu'
    )
    embeddings.extend(batch_embeddings)

    if (i // batch_size + 1) % 100 == 0:
        print(f"  Encoded {i + len(batch)}/{len(texts)}...")

embeddings = np.array(embeddings, dtype=np.float32)
print(f"✓ Encoded all, shape: {embeddings.shape}\n")

# Validate embeddings
print("Validating embeddings data...")
print(f"  Shape: {embeddings.shape}")
print(f"  Dtype: {embeddings.dtype}")
print(f"  Min value: {np.min(embeddings)}")
print(f"  Max value: {np.max(embeddings)}")
print(f"  Mean: {np.mean(embeddings)}")
print(f"  Has NaN: {np.any(np.isnan(embeddings))}")
print(f"  Has Inf: {np.any(np.isinf(embeddings))}")
print(f"  Is C-contiguous: {embeddings.flags['C_CONTIGUOUS']}")
print(f"  Memory usage: {embeddings.nbytes / (1024**2):.2f} MB")

if np.any(np.isnan(embeddings)):
    print("\n❌ ERROR: Embeddings contain NaN values!")
    sys.exit(1)

if np.any(np.isinf(embeddings)):
    print("\n❌ ERROR: Embeddings contain Inf values!")
    sys.exit(1)

print("\n✓ Embeddings data looks good")

# Now test FAISS operations one by one
print("\n" + "=" * 60)
print("Testing FAISS operations...")
print("=" * 60)

import faiss

dimension = embeddings.shape[1]
print(f"\n1. Creating IndexFlatIP with dimension {dimension}...")
try:
    index = faiss.IndexFlatIP(dimension)
    print("  ✓ Index created")
except Exception as e:
    print(f"  ❌ FAILED at index creation: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

print(f"\n2. Normalizing {len(embeddings)} embeddings...")
try:
    # Make a copy to preserve original
    embeddings_norm = embeddings.copy()
    print(f"  Before normalize - sample norm: {np.linalg.norm(embeddings_norm[0]):.4f}")

    faiss.normalize_L2(embeddings_norm)

    print(f"  After normalize - sample norm: {np.linalg.norm(embeddings_norm[0]):.4f}")
    print(f"  ✓ Normalized")
except Exception as e:
    print(f"  ❌ FAILED at normalize: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

print(f"\n3. Adding {len(embeddings_norm)} vectors to index...")
try:
    index.add(embeddings_norm)
    print(f"  ✓ Added {index.ntotal} vectors")
except Exception as e:
    print(f"  ❌ FAILED at add: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

print(f"\n4. Writing index to disk...")
try:
    faiss.write_index(index, "test_full.faiss")
    print(f"  ✓ Index written")
except Exception as e:
    print(f"  ❌ FAILED at write: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

print("\n" + "=" * 60)
print("✅ SUCCESS! Full pipeline works!")
print("=" * 60)
print(f"\nProcessed {len(embeddings)} embeddings successfully")
print("The index has been created: test_full.faiss")
test_faiss_minimal.py
ADDED
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Minimal FAISS test to isolate the segfault issue.
Tests each step individually to find the exact failure point.
"""

import sys
import numpy as np

print("=" * 60)
print("MINIMAL FAISS TEST - Step by step debugging")
print("=" * 60)
print(f"Python version: {sys.version}")
print()

# Test 1: Import numpy
print("Test 1: Import numpy...")
try:
    import numpy as np
    print(f"  ✓ numpy imported successfully (version {np.__version__})")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    sys.exit(1)

# Test 2: Import faiss
print("\nTest 2: Import faiss...")
try:
    import faiss
    print(f"  ✓ faiss imported successfully")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    sys.exit(1)

# Test 3: Create simple numpy array
print("\nTest 3: Create numpy array...")
try:
    test_data = np.random.rand(10, 128).astype('float32')
    print(f"  ✓ Created array with shape {test_data.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    sys.exit(1)

# Test 4: Create FAISS index
print("\nTest 4: Create FAISS index...")
try:
    dimension = 128
    index = faiss.IndexFlatL2(dimension)
    print(f"  ✓ Created IndexFlatL2 with dimension {dimension}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    sys.exit(1)

# Test 5: Add vectors to index
print("\nTest 5: Add vectors to FAISS index...")
try:
    index.add(test_data)
    print(f"  ✓ Added {index.ntotal} vectors to index")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    sys.exit(1)

# Test 6: Search index
print("\nTest 6: Search FAISS index...")
try:
    query = np.random.rand(1, 128).astype('float32')
    distances, indices = index.search(query, 5)
    print(f"  ✓ Search completed, found {len(indices[0])} results")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    sys.exit(1)

# Test 7: Test with IndexFlatIP (what we actually use)
print("\nTest 7: Create IndexFlatIP...")
try:
    index_ip = faiss.IndexFlatIP(dimension)
    print(f"  ✓ Created IndexFlatIP")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    sys.exit(1)

# Test 8: Normalize vectors (critical step)
print("\nTest 8: Normalize vectors with faiss.normalize_L2...")
try:
    test_data_copy = test_data.copy()
    faiss.normalize_L2(test_data_copy)
    print(f"  ✓ Normalized vectors")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test 9: Add normalized vectors to IndexFlatIP
print("\nTest 9: Add normalized vectors to IndexFlatIP...")
try:
    index_ip.add(test_data_copy)
    print(f"  ✓ Added {index_ip.ntotal} normalized vectors")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test 10: Write index to disk
print("\nTest 10: Write index to disk...")
try:
    faiss.write_index(index_ip, "test_index.faiss")
    print(f"  ✓ Index written to test_index.faiss")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test 11: Read index from disk
print("\nTest 11: Read index from disk...")
try:
    loaded_index = faiss.read_index("test_index.faiss")
    print(f"  ✓ Index loaded, contains {loaded_index.ntotal} vectors")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Clean up
print("\nTest 12: Clean up test file...")
try:
    import os
    os.remove("test_index.faiss")
    print(f"  ✓ Test file removed")
except Exception as e:
    print(f"  ⚠️  Could not remove test file: {e}")

print("\n" + "=" * 60)
print("✅ ALL TESTS PASSED!")
print("=" * 60)
print("\nFAISS is working correctly on your system.")
print("The issue may be with:")
print("  - Specific data from the database")
print("  - Memory/size of actual embeddings")
print("  - Sentence transformers interaction")
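Because a segfault kills the interpreter before any `except` block can run, the per-step prints above are the main diagnostic. Python's built-in `faulthandler` module adds a stack dump at the crash point; a two-line sketch of wiring it in (equivalently, run `python3 -X faulthandler test_faiss_minimal.py`):

    # Enable before the FAISS calls: on a fatal signal such as SIGSEGV,
    # the interpreter dumps the Python stack of every thread to stderr.
    import faulthandler
    faulthandler.enable()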
test_queries.py
ADDED
@@ -0,0 +1,252 @@
#!/usr/bin/env python3
"""
Test script to validate the Congressional Bioguide database and search functionality.
"""

import sqlite3
import json
from pathlib import Path


def test_database():
    """Test database structure and basic queries."""
    print("Testing Database...")
    print("=" * 60)

    if not Path("congress.db").exists():
        print("❌ Database not found. Run ingest_data.py first.")
        return False

    conn = sqlite3.connect("congress.db")
    cursor = conn.cursor()

    # Test 1: Count members
    cursor.execute("SELECT COUNT(*) FROM members")
    member_count = cursor.fetchone()[0]
    print(f"✓ Members in database: {member_count}")

    # Test 2: Count job positions
    cursor.execute("SELECT COUNT(*) FROM job_positions")
    job_count = cursor.fetchone()[0]
    print(f"✓ Job positions recorded: {job_count}")

    # Test 3: Search by name
    cursor.execute("""
        SELECT bio_id, family_name, given_name, birth_date, death_date
        FROM members
        WHERE unaccented_family_name = 'Lincoln'
        ORDER BY birth_date
    """)
    lincolns = cursor.fetchall()
    print(f"\n✓ Found {len(lincolns)} member(s) with family name 'Lincoln':")
    for bio_id, family, given, birth, death in lincolns:
        print(f"  - {given} {family} ({bio_id}): {birth} - {death or 'present'}")

    # Test 4: Party breakdown
    cursor.execute("""
        SELECT party, COUNT(DISTINCT bio_id) as count
        FROM job_positions
        WHERE party IS NOT NULL
        GROUP BY party
        ORDER BY count DESC
        LIMIT 10
    """)
    parties = cursor.fetchall()
    print(f"\n✓ Top parties by member count:")
    for party, count in parties:
        print(f"  - {party}: {count} members")

    # Test 5: State representation
    cursor.execute("""
        SELECT region_code, COUNT(DISTINCT bio_id) as count
        FROM job_positions
        WHERE region_code IS NOT NULL AND region_type = 'StateRegion'
        GROUP BY region_code
        ORDER BY count DESC
        LIMIT 10
    """)
    states = cursor.fetchall()
    print(f"\n✓ Top states by member count:")
    for state, count in states:
        print(f"  - {state}: {count} members")

    # Test 6: Relationships
    cursor.execute("SELECT COUNT(*) FROM relationships")
    rel_count = cursor.fetchone()[0]
    print(f"\n✓ Family relationships recorded: {rel_count}")

    if rel_count > 0:
        cursor.execute("""
            SELECT m1.given_name, m1.family_name, r.relationship_type,
                   m2.given_name, m2.family_name
            FROM relationships r
            JOIN members m1 ON r.bio_id = m1.bio_id
            JOIN members m2 ON r.related_bio_id = m2.bio_id
            LIMIT 5
        """)
        relationships = cursor.fetchall()
        print("  Sample relationships:")
        for given1, family1, rel_type, given2, family2 in relationships:
            print(f"  - {given1} {family1} is {rel_type} of {given2} {family2}")

    # Test 7: Profile text
    cursor.execute("""
        SELECT bio_id, given_name, family_name, LENGTH(profile_text) as text_len
        FROM members
        WHERE profile_text IS NOT NULL
        ORDER BY text_len DESC
        LIMIT 5
    """)
    longest_profiles = cursor.fetchall()
    print(f"\n✓ Longest biography profiles:")
    for bio_id, given, family, length in longest_profiles:
        print(f"  - {given} {family} ({bio_id}): {length} characters")

    conn.close()
    return True


def test_faiss_index():
    """Test FAISS index."""
    print("\n\nTesting FAISS Index...")
    print("=" * 60)

    if not Path("congress_faiss.index").exists():
        print("❌ FAISS index not found. Run ingest_data.py first.")
        return False

    if not Path("congress_bio_ids.pkl").exists():
        print("❌ Bio ID mapping not found. Run ingest_data.py first.")
        return False

    try:
        import faiss
        import pickle
        from sentence_transformers import SentenceTransformer

        # Load index
        index = faiss.read_index("congress_faiss.index")
        with open("congress_bio_ids.pkl", "rb") as f:
            bio_ids = pickle.load(f)

        print(f"✓ FAISS index loaded: {index.ntotal} vectors")
        print(f"✓ Dimension: {index.d}")

        # Load model
        model = SentenceTransformer('all-MiniLM-L6-v2')
        print("✓ Sentence transformer model loaded")

        # Test search
        test_queries = [
            "lawyers who became judges",
            "Civil War veterans",
            "served in the military",
            "teachers and educators"
        ]

        for query in test_queries:
            print(f"\n✓ Testing query: '{query}'")
            query_embedding = model.encode([query])[0].reshape(1, -1).astype('float32')
            faiss.normalize_L2(query_embedding)

            scores, indices = index.search(query_embedding, 3)

            # Load database to get names
            conn = sqlite3.connect("congress.db")
            cursor = conn.cursor()

            print("  Top 3 results:")
            for idx, score in zip(indices[0], scores[0]):
                if idx < len(bio_ids):
                    bio_id = bio_ids[idx]
                    cursor.execute(
                        "SELECT given_name, family_name FROM members WHERE bio_id = ?",
                        (bio_id,)
                    )
                    result = cursor.fetchone()
                    if result:
                        given, family = result
                        print(f"  - {given} {family} ({bio_id}): score={score:.4f}")

            conn.close()

        return True

    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        print("  Run: pip install -r requirements.txt")
        return False
    except Exception as e:
        print(f"❌ Error testing FAISS: {e}")
        return False


def test_sample_profile():
    """Display a sample profile."""
    print("\n\nSample Profile...")
    print("=" * 60)

    conn = sqlite3.connect("congress.db")
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    # Get a well-known member
    cursor.execute("""
        SELECT * FROM members
        WHERE unaccented_family_name = 'Lincoln' AND unaccented_given_name = 'Abraham'
        LIMIT 1
    """)
    member = cursor.fetchone()

    if member:
        bio_id = member['bio_id']
        print(f"Profile: {member['given_name']} {member['family_name']} ({bio_id})")
        print(f"Birth: {member['birth_date']}")
        print(f"Death: {member['death_date']}")
        print(f"\nBiography excerpt:")
        profile_text = member['profile_text'] or ""
        print(f"  {profile_text[:300]}...")

        # Get positions
        cursor.execute("""
            SELECT job_name, party, congress_number, region_code, start_date, end_date
            FROM job_positions
            WHERE bio_id = ?
            ORDER BY start_date
        """, (bio_id,))
        positions = cursor.fetchall()

        if positions:
            print(f"\nPositions held ({len(positions)}):")
            for pos in positions:
                print(f"  - {pos['job_name']} ({pos['party']}), {pos['region_code']}")
                print(f"    Congress {pos['congress_number']}: {pos['start_date']} - {pos['end_date']}")

    conn.close()


def main():
    """Run all tests."""
    print("Congressional Bioguide Database Test Suite")
    print("=" * 60)
    print()

    db_ok = test_database()
    faiss_ok = test_faiss_index()

    if db_ok:
        test_sample_profile()

    print("\n" + "=" * 60)
    if db_ok and faiss_ok:
        print("✓ All tests passed!")
        print("\nThe system is ready to use. Start the MCP server with:")
        print("  python3 server.py")
    else:
        print("❌ Some tests failed. Please check the errors above.")
        if not db_ok:
            print("  Run: python3 ingest_data.py")


if __name__ == "__main__":
    main()
test_sentence_transformers.py
ADDED
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
Test sentence-transformers to isolate the segfault.
"""

import sys
import os

print("=" * 60)
print("SENTENCE TRANSFORMERS TEST")
print("=" * 60)
print(f"Python version: {sys.version}")
print()

# Test 1: Import sentence_transformers
print("Test 1: Import sentence_transformers...")
try:
    from sentence_transformers import SentenceTransformer
    print(f"  ✓ sentence_transformers imported")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    sys.exit(1)

# Test 2: Load model
print("\nTest 2: Load model (this downloads ~90MB on first run)...")
try:
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print(f"  ✓ Model loaded")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test 3: Encode simple text
print("\nTest 3: Encode simple text...")
try:
    text = "This is a test sentence."
    embedding = model.encode([text])
    print(f"  ✓ Encoded text, embedding shape: {embedding.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test 4: Encode batch
print("\nTest 4: Encode batch of texts...")
try:
    texts = ["First sentence", "Second sentence", "Third sentence"]
    embeddings = model.encode(texts, show_progress_bar=False)
    print(f"  ✓ Encoded {len(texts)} texts, shape: {embeddings.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test 5: Encode with explicit parameters
print("\nTest 5: Encode with explicit parameters (like in our script)...")
try:
    embeddings = model.encode(
        texts,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu'
    )
    print(f"  ✓ Encoded with explicit params, shape: {embeddings.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test 6: Encode larger batch
print("\nTest 6: Encode larger batch (100 texts)...")
try:
    large_texts = [f"This is test sentence number {i}" for i in range(100)]
    embeddings = model.encode(
        large_texts,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu'
    )
    print(f"  ✓ Encoded {len(large_texts)} texts, shape: {embeddings.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test 7: Test with actual biography-like text
print("\nTest 7: Encode biography-like text...")
try:
    bio = """A Representative from Illinois and 16th President of the United States;
    born in Hardin County, Ky., February 12, 1809; moved with his parents to a tract
    on Little Pigeon Creek, Ind., in 1816; attended a log-cabin school at short intervals
    and was self-instructed in elementary branches."""

    embedding = model.encode([bio], show_progress_bar=False, device='cpu')
    print(f"  ✓ Encoded biography, shape: {embedding.shape}")
except Exception as e:
    print(f"  ❌ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

print("\n" + "=" * 60)
print("✅ ALL TESTS PASSED!")
print("=" * 60)
print("\nSentence transformers is working correctly.")
print("The issue may be with the combination of:")
print("  - Very large batch processing")
print("  - Integration with FAISS normalize")
print("  - Memory management with 13k+ texts")
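If the memory hypothesis at the end of this file holds, one mitigation worth trying is to avoid the append-then-`np.array` pattern used in test_embeddings_data.py, which briefly holds two copies of all 13k+ vectors. The sketch below is under that assumption (384 is the output dimension of all-MiniLM-L6-v2; the helper name is illustrative):

    # Hypothetical lower-memory alternative: preallocate one float32 array
    # and fill it batch by batch, so no second copy is made at np.array() time.
    import numpy as np

    def encode_preallocated(model, texts, dim=384, batch_size=32):
        out = np.empty((len(texts), dim), dtype=np.float32)
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            out[i:i + len(batch)] = model.encode(
                batch, convert_to_numpy=True, device="cpu"
            )
        return out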