AldawsariNLP committed on
Commit
6b0f230
·
1 Parent(s): 2794165

Improve RAG init & docs

Browse files
Files changed (3) hide show
  1. QUICKSTART.md +9 -2
  2. app.py +6 -0
  3. backend/main.py +49 -31
QUICKSTART.md CHANGED
@@ -75,9 +75,11 @@ Create `.env` in the project root:
75
  OPENAI_API_KEY=sk-your-actual-api-key-here
76
  ```
77
 
78
- ### 5. Add Documents
79
 
80
- Place your PDF, TXT, DOCX, or DOC files in the `documents/` folder.
 
 
81
 
82
  ### 6. Run the Application
83
 
@@ -117,6 +119,11 @@ npm start
117
  **"No documents found"**
118
  - Check that files are in the `documents/` folder
119
  - Supported formats: PDF, TXT, DOCX, DOC
 
 
 
 
 
120
 
121
  **Frontend can't connect to backend**
122
  - Ensure backend is running on port 8000
 
75
  OPENAI_API_KEY=sk-your-actual-api-key-here
76
  ```
77
 
78
+ ### 5. Add Documents / Processed Data
79
 
80
+ - **Local development:** copy your PDF/TXT/DOC/DOCX files into the `documents/` folder before running `uv run python backend/main.py`.
81
+ - **Deploying to Hugging Face Spaces:** large PDFs should be uploaded via the Space UI (**Files and versions** tab → Upload), because Git pushes can’t include big binaries.
82
+ - If you have a pre-generated `processed_documents.json`, keep it in the project root (it’s copied by the Dockerfile). The backend logs will print whether this file and the `documents/` folder exist at startup.
83
 
84
  ### 6. Run the Application
85
 
 
119
  **"No documents found"**
120
  - Check that files are in the `documents/` folder
121
  - Supported formats: PDF, TXT, DOCX, DOC
122
+ - On Hugging Face Spaces, make sure you uploaded the PDFs (or a `processed_documents.json`) via the **Files and versions** tab. Watch the build/startup logs for messages such as `[RAG Init] processed_documents.json exists? True`.
123
+
124
+ **"RAG system not initialized" (on Spaces)**
125
+ - Ensure `processed_documents.json` is present in the repo **and** not excluded by `.dockerignore`.
126
+ - Upload your source PDFs (or processed data) in the Space UI, then restart the Space so the startup hook can detect them.
127
 
128
  **Frontend can't connect to backend**
129
  - Ensure backend is running on port 8000
app.py CHANGED
@@ -23,8 +23,14 @@ app.add_middleware(
23
  )
24
 
25
  # Mount the backend API
 
26
  app.mount("/api", backend_app)
27
 
 
 
 
 
 
28
  # Serve React frontend
29
  frontend_path = Path(__file__).parent / "frontend" / "build"
30
 
 
23
  )
24
 
25
  # Mount the backend API
26
+ print("[root_app] Mounting backend application at /api")
27
  app.mount("/api", backend_app)
28
 
29
+
30
+ @app.on_event("startup")
31
+ async def root_startup_event():
32
+ print("[root_app] Startup event triggered")
33
+
34
  # Serve React frontend
35
  frontend_path = Path(__file__).parent / "frontend" / "build"
36
 
backend/main.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from fastapi import FastAPI, HTTPException, Query
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from fastapi.responses import FileResponse, JSONResponse
@@ -30,47 +31,62 @@ app.add_middleware(
30
  # Initialize RAG system
31
  rag_system = None
32
  rag_ready = False
 
33
 
34
  class QuestionRequest(BaseModel):
35
  question: str
36
  use_history: Optional[bool] = True
37
 
 
38
  class QuestionResponse(BaseModel):
39
  answer: str
40
  sources: List[str]
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  @app.on_event("startup")
43
  async def startup_event():
44
  """Process and index all PDFs on startup"""
45
- global rag_system
46
- global rag_ready
47
  try:
48
- rag_ready = False
49
- print("[startup_event] Initializing RAG system...")
50
- rag_system = RAGSystem()
51
- docs_folder = Path("documents")
52
- processed_json = Path("processed_documents.json")
53
-
54
- print(f"[startup_event] processed_documents.json exists? {processed_json.exists()}")
55
- print(f"[startup_event] documents folder exists? {docs_folder.exists()}")
56
-
57
- if docs_folder.exists() and any(docs_folder.glob("*.pdf")):
58
- print("[startup_event] PDFs detected, processing...")
59
- num_docs = rag_system.process_and_index_documents(str(docs_folder))
60
- print(f"[startup_event] ✓ Processed and indexed {num_docs} documents")
61
- rag_ready = True
62
  print("[startup_event] RAG system ready.")
63
- else:
64
- print("[startup_event] No PDF files found in documents folder")
65
- if processed_json.exists():
66
- print(
67
- "[startup_event] processed_documents.json found, but no PDFs to process. "
68
- "Ensure processed data is uploaded if you expect answers."
69
- )
70
- rag_ready = True
71
- else:
72
- print("[startup_event] No processed_documents.json available; RAG system may remain uninitialized.")
73
- rag_ready = False
74
  except Exception as e:
75
  print(f"[startup_event] Warning: Could not initialize RAG system: {e}")
76
  import traceback
@@ -89,10 +105,12 @@ async def ask_question(request: QuestionRequest):
89
  """Answer a question using RAG with multi-turn chat history"""
90
  global rag_system, rag_ready
91
  if rag_system is None or not rag_ready:
92
- raise HTTPException(
93
- status_code=503,
94
- detail="RAG system not initialized. Upload PDFs or processed_documents.json, then restart the Space."
95
- )
 
 
96
 
97
  if not request.question.strip():
98
  raise HTTPException(status_code=400, detail="Question cannot be empty")
 
1
+ import asyncio
2
  from fastapi import FastAPI, HTTPException, Query
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.responses import FileResponse, JSONResponse
 
31
  # Initialize RAG system
32
  rag_system = None
33
  rag_ready = False
34
+ _rag_init_lock = asyncio.Lock()
35
 
36
  class QuestionRequest(BaseModel):
37
  question: str
38
  use_history: Optional[bool] = True
39
 
40
+
41
  class QuestionResponse(BaseModel):
42
  answer: str
43
  sources: List[str]
44
 
45
+
46
+ async def initialize_rag_system(force: bool = False) -> None:
47
+ """Initialize the RAG system if data is available."""
48
+ global rag_system, rag_ready
49
+
50
+ async with _rag_init_lock:
51
+ if not force and rag_system is not None and rag_ready:
52
+ return
53
+
54
+ try:
55
+ rag_ready = False
56
+ print(f"[RAG Init] Starting initialization (force={force})")
57
+ rag_system = RAGSystem()
58
+ docs_folder = Path("documents")
59
+ processed_json = Path("processed_documents.json")
60
+
61
+ print(f"[RAG Init] processed_documents.json exists? {processed_json.exists()}")
62
+ print(f"[RAG Init] documents folder exists? {docs_folder.exists()}")
63
+
64
+ if docs_folder.exists() and any(docs_folder.glob("*.pdf")):
65
+ print("[RAG Init] PDFs detected, processing...")
66
+ num_docs = rag_system.process_and_index_documents(str(docs_folder))
67
+ print(f"[RAG Init] ✓ Processed and indexed {num_docs} documents")
68
+ rag_ready = True
69
+ elif processed_json.exists():
70
+ print("[RAG Init] processed_documents.json found. Using existing summaries.")
71
+ rag_ready = True
72
+ else:
73
+ print("[RAG Init] No PDFs or processed_documents.json found. RAG remains uninitialized.")
74
+ rag_ready = False
75
+ except Exception as exc:
76
+ print(f"[RAG Init] Initialization failed: {exc}")
77
+ import traceback
78
+ traceback.print_exc()
79
+ rag_ready = False
80
+ raise
81
+
82
+
83
  @app.on_event("startup")
84
  async def startup_event():
85
  """Process and index all PDFs on startup"""
 
 
86
  try:
87
+ await initialize_rag_system(force=True)
88
+ if rag_ready:
 
 
 
 
 
 
 
 
 
 
 
 
89
  print("[startup_event] RAG system ready.")
 
 
 
 
 
 
 
 
 
 
 
90
  except Exception as e:
91
  print(f"[startup_event] Warning: Could not initialize RAG system: {e}")
92
  import traceback
 
105
  """Answer a question using RAG with multi-turn chat history"""
106
  global rag_system, rag_ready
107
  if rag_system is None or not rag_ready:
108
+ await initialize_rag_system()
109
+ if rag_system is None or not rag_ready:
110
+ raise HTTPException(
111
+ status_code=503,
112
+ detail="RAG system not initialized. Upload PDFs or processed_documents.json, then restart the Space."
113
+ )
114
 
115
  if not request.question.strip():
116
  raise HTTPException(status_code=400, detail="Question cannot be empty")