Spaces:
Sleeping
Sleeping
Commit
·
6b0f230
1
Parent(s):
2794165
Improve RAG init & docs
Browse files
- QUICKSTART.md +9 -2
- app.py +6 -0
- backend/main.py +49 -31
QUICKSTART.md
CHANGED
|
@@ -75,9 +75,11 @@ Create `.env` in the project root:
|
|
| 75 |
OPENAI_API_KEY=sk-your-actual-api-key-here
|
| 76 |
```
|
| 77 |
|
| 78 |
-
### 5. Add Documents
|
| 79 |
|
| 80 |
-
|
|
|
|
|
|
|
| 81 |
|
| 82 |
### 6. Run the Application
|
| 83 |
|
|
@@ -117,6 +119,11 @@ npm start
|
|
| 117 |
**"No documents found"**
|
| 118 |
- Check that files are in the `documents/` folder
|
| 119 |
- Supported formats: PDF, TXT, DOCX, DOC
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
**Frontend can't connect to backend**
|
| 122 |
- Ensure backend is running on port 8000
|
|
|
|
| 75 |
OPENAI_API_KEY=sk-your-actual-api-key-here
|
| 76 |
```
|
| 77 |
|
| 78 |
+
### 5. Add Documents / Processed Data
|
| 79 |
|
| 80 |
+
- **Local development:** copy your PDF/TXT/DOC/DOCX files into the `documents/` folder before running `uv run python backend/main.py`.
|
| 81 |
+
- **Deploying to Hugging Face Spaces:** large PDFs should be uploaded via the Space UI (**Files and versions** → Upload), because plain Git pushes can’t include big binaries.
|
| 82 |
+
- If you have a pre-generated `processed_documents.json`, keep it in the project root (it’s copied by the Dockerfile). The backend logs will print whether this file and the `documents/` folder exist at startup.
|
| 83 |
|
| 84 |
### 6. Run the Application
|
| 85 |
|
|
|
|
| 119 |
**"No documents found"**
|
| 120 |
- Check that files are in the `documents/` folder
|
| 121 |
- Supported formats: PDF, TXT, DOCX, DOC
|
| 122 |
+
- On Hugging Face Spaces, make sure you uploaded the PDFs (or a `processed_documents.json`) via the **Files and versions** tab. Watch the build/startup logs for messages such as `[RAG Init] processed_documents.json exists? True`.
|
| 123 |
+
|
| 124 |
+
**"RAG system not initialized" (on Spaces)**
|
| 125 |
+
- Ensure `processed_documents.json` is present in the repo **and** not excluded by `.dockerignore`.
|
| 126 |
+
- Upload your source PDFs (or processed data) in the Space UI, then restart the Space so the startup hook can detect them.
|
| 127 |
|
| 128 |
**Frontend can't connect to backend**
|
| 129 |
- Ensure backend is running on port 8000
|
app.py
CHANGED
|
@@ -23,8 +23,14 @@ app.add_middleware(
|
|
| 23 |
)
|
| 24 |
|
| 25 |
# Mount the backend API
|
|
|
|
| 26 |
app.mount("/api", backend_app)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# Serve React frontend
|
| 29 |
frontend_path = Path(__file__).parent / "frontend" / "build"
|
| 30 |
|
|
|
|
| 23 |
)
|
| 24 |
|
| 25 |
# Mount the backend API
|
| 26 |
+
print("[root_app] Mounting backend application at /api")
|
| 27 |
app.mount("/api", backend_app)
|
| 28 |
|
| 29 |
+
|
| 30 |
+
@app.on_event("startup")
|
| 31 |
+
async def root_startup_event():
|
| 32 |
+
print("[root_app] Startup event triggered")
|
| 33 |
+
|
| 34 |
# Serve React frontend
|
| 35 |
frontend_path = Path(__file__).parent / "frontend" / "build"
|
| 36 |
|
backend/main.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from fastapi import FastAPI, HTTPException, Query
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
from fastapi.responses import FileResponse, JSONResponse
|
|
@@ -30,47 +31,62 @@ app.add_middleware(
|
|
| 30 |
# Initialize RAG system
|
| 31 |
rag_system = None
|
| 32 |
rag_ready = False
|
|
|
|
| 33 |
|
| 34 |
class QuestionRequest(BaseModel):
|
| 35 |
question: str
|
| 36 |
use_history: Optional[bool] = True
|
| 37 |
|
|
|
|
| 38 |
class QuestionResponse(BaseModel):
|
| 39 |
answer: str
|
| 40 |
sources: List[str]
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
@app.on_event("startup")
|
| 43 |
async def startup_event():
|
| 44 |
"""Process and index all PDFs on startup"""
|
| 45 |
-
global rag_system
|
| 46 |
-
global rag_ready
|
| 47 |
try:
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
rag_system = RAGSystem()
|
| 51 |
-
docs_folder = Path("documents")
|
| 52 |
-
processed_json = Path("processed_documents.json")
|
| 53 |
-
|
| 54 |
-
print(f"[startup_event] processed_documents.json exists? {processed_json.exists()}")
|
| 55 |
-
print(f"[startup_event] documents folder exists? {docs_folder.exists()}")
|
| 56 |
-
|
| 57 |
-
if docs_folder.exists() and any(docs_folder.glob("*.pdf")):
|
| 58 |
-
print("[startup_event] PDFs detected, processing...")
|
| 59 |
-
num_docs = rag_system.process_and_index_documents(str(docs_folder))
|
| 60 |
-
print(f"[startup_event] ✓ Processed and indexed {num_docs} documents")
|
| 61 |
-
rag_ready = True
|
| 62 |
print("[startup_event] RAG system ready.")
|
| 63 |
-
else:
|
| 64 |
-
print("[startup_event] No PDF files found in documents folder")
|
| 65 |
-
if processed_json.exists():
|
| 66 |
-
print(
|
| 67 |
-
"[startup_event] processed_documents.json found, but no PDFs to process. "
|
| 68 |
-
"Ensure processed data is uploaded if you expect answers."
|
| 69 |
-
)
|
| 70 |
-
rag_ready = True
|
| 71 |
-
else:
|
| 72 |
-
print("[startup_event] No processed_documents.json available; RAG system may remain uninitialized.")
|
| 73 |
-
rag_ready = False
|
| 74 |
except Exception as e:
|
| 75 |
print(f"[startup_event] Warning: Could not initialize RAG system: {e}")
|
| 76 |
import traceback
|
|
@@ -89,10 +105,12 @@ async def ask_question(request: QuestionRequest):
|
|
| 89 |
"""Answer a question using RAG with multi-turn chat history"""
|
| 90 |
global rag_system, rag_ready
|
| 91 |
if rag_system is None or not rag_ready:
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
| 96 |
|
| 97 |
if not request.question.strip():
|
| 98 |
raise HTTPException(status_code=400, detail="Question cannot be empty")
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
from fastapi import FastAPI, HTTPException, Query
|
| 3 |
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
from fastapi.responses import FileResponse, JSONResponse
|
|
|
|
| 31 |
# Initialize RAG system
|
| 32 |
rag_system = None
|
| 33 |
rag_ready = False
|
| 34 |
+
_rag_init_lock = asyncio.Lock()
|
| 35 |
|
| 36 |
class QuestionRequest(BaseModel):
|
| 37 |
question: str
|
| 38 |
use_history: Optional[bool] = True
|
| 39 |
|
| 40 |
+
|
| 41 |
class QuestionResponse(BaseModel):
|
| 42 |
answer: str
|
| 43 |
sources: List[str]
|
| 44 |
|
| 45 |
+
|
| 46 |
+
async def initialize_rag_system(force: bool = False) -> None:
|
| 47 |
+
"""Initialize the RAG system if data is available."""
|
| 48 |
+
global rag_system, rag_ready
|
| 49 |
+
|
| 50 |
+
async with _rag_init_lock:
|
| 51 |
+
if not force and rag_system is not None and rag_ready:
|
| 52 |
+
return
|
| 53 |
+
|
| 54 |
+
try:
|
| 55 |
+
rag_ready = False
|
| 56 |
+
print(f"[RAG Init] Starting initialization (force={force})")
|
| 57 |
+
rag_system = RAGSystem()
|
| 58 |
+
docs_folder = Path("documents")
|
| 59 |
+
processed_json = Path("processed_documents.json")
|
| 60 |
+
|
| 61 |
+
print(f"[RAG Init] processed_documents.json exists? {processed_json.exists()}")
|
| 62 |
+
print(f"[RAG Init] documents folder exists? {docs_folder.exists()}")
|
| 63 |
+
|
| 64 |
+
if docs_folder.exists() and any(docs_folder.glob("*.pdf")):
|
| 65 |
+
print("[RAG Init] PDFs detected, processing...")
|
| 66 |
+
num_docs = rag_system.process_and_index_documents(str(docs_folder))
|
| 67 |
+
print(f"[RAG Init] ✓ Processed and indexed {num_docs} documents")
|
| 68 |
+
rag_ready = True
|
| 69 |
+
elif processed_json.exists():
|
| 70 |
+
print("[RAG Init] processed_documents.json found. Using existing summaries.")
|
| 71 |
+
rag_ready = True
|
| 72 |
+
else:
|
| 73 |
+
print("[RAG Init] No PDFs or processed_documents.json found. RAG remains uninitialized.")
|
| 74 |
+
rag_ready = False
|
| 75 |
+
except Exception as exc:
|
| 76 |
+
print(f"[RAG Init] Initialization failed: {exc}")
|
| 77 |
+
import traceback
|
| 78 |
+
traceback.print_exc()
|
| 79 |
+
rag_ready = False
|
| 80 |
+
raise
|
| 81 |
+
|
| 82 |
+
|
| 83 |
@app.on_event("startup")
|
| 84 |
async def startup_event():
|
| 85 |
"""Process and index all PDFs on startup"""
|
|
|
|
|
|
|
| 86 |
try:
|
| 87 |
+
await initialize_rag_system(force=True)
|
| 88 |
+
if rag_ready:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
print("[startup_event] RAG system ready.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
except Exception as e:
|
| 91 |
print(f"[startup_event] Warning: Could not initialize RAG system: {e}")
|
| 92 |
import traceback
|
|
|
|
| 105 |
"""Answer a question using RAG with multi-turn chat history"""
|
| 106 |
global rag_system, rag_ready
|
| 107 |
if rag_system is None or not rag_ready:
|
| 108 |
+
await initialize_rag_system()
|
| 109 |
+
if rag_system is None or not rag_ready:
|
| 110 |
+
raise HTTPException(
|
| 111 |
+
status_code=503,
|
| 112 |
+
detail="RAG system not initialized. Upload PDFs or processed_documents.json, then restart the Space."
|
| 113 |
+
)
|
| 114 |
|
| 115 |
if not request.question.strip():
|
| 116 |
raise HTTPException(status_code=400, detail="Question cannot be empty")
|