AldawsariNLP committed on
Commit
6f548c5
·
1 Parent(s): 99f014c

uploading pdfs...10

Browse files
Files changed (6) hide show
  1. .gitignore +0 -5
  2. Dockerfile +2 -1
  3. app.py +34 -1
  4. backend/main.py +3 -3
  5. files_upload.py +42 -0
  6. pyproject.toml +1 -0
.gitignore CHANGED
@@ -50,9 +50,4 @@ Thumbs.db
50
 
51
  # Documents (optional - you may want to track these)
52
  documents/
53
- # documents1/
54
-
55
- # documents/*.docx
56
- # documents/*.txt
57
-
58
  documents/*.pdf
 
50
 
51
  # Documents (optional - you may want to track these)
52
  documents/
 
 
 
 
 
53
  documents/*.pdf
Dockerfile CHANGED
@@ -22,7 +22,7 @@ COPY backend/ ./backend/
22
 
23
  # Copy processed documents and vector data
24
  # Ensure documents and vectorstore folders exist (even if empty)
25
- RUN mkdir -p vectorstore
26
 
27
  # Copy vectorstore folder if it exists in the build context
28
  # Note: If vectorstore/ doesn't exist in repo, ensure an empty vectorstore/ folder exists
@@ -39,6 +39,7 @@ COPY frontend/build/ ./frontend/build/
39
 
40
  # Copy main app entry point
41
  COPY app.py .
 
42
 
43
 
44
  # Expose port (Hugging Face Spaces uses 7860)
 
22
 
23
  # Copy processed documents and vector data
24
  # Ensure documents and vectorstore folders exist (even if empty)
25
+ RUN mkdir -p vectorstore documents
26
 
27
  # Copy vectorstore folder if it exists in the build context
28
  # Note: If vectorstore/ doesn't exist in repo, ensure an empty vectorstore/ folder exists
 
39
 
40
  # Copy main app entry point
41
  COPY app.py .
42
+ COPY files_upload.py ./files_upload.py
43
 
44
 
45
  # Expose port (Hugging Face Spaces uses 7860)
app.py CHANGED
@@ -3,6 +3,8 @@ Hugging Face Spaces entry point
3
  This file serves both the FastAPI backend and React frontend
4
  """
5
  import os
 
 
6
  from contextlib import asynccontextmanager
7
  from pathlib import Path
8
  from fastapi import FastAPI, Request
@@ -12,6 +14,33 @@ from fastapi.middleware.cors import CORSMiddleware
12
  from dotenv import load_dotenv
13
  from backend.main import app as backend_app, initialize_rag_system
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  @asynccontextmanager
16
  async def lifespan(app: FastAPI):
17
  """Lifespan context manager for FastAPI startup and shutdown"""
@@ -28,6 +57,7 @@ async def lifespan(app: FastAPI):
28
  project_root = Path(__file__).parent
29
  documents_dir = project_root / "documents"
30
  processed_json = project_root / "processed_documents.json"
 
31
 
32
  # Load environment variables from .env file (if it exists)
33
  env_path = project_root / ".env"
@@ -63,7 +93,10 @@ async def lifespan(app: FastAPI):
63
  print(f"[root_app] PROJECT_ROOT: {project_root}")
64
  print(f"[root_app] DOCUMENTS_DIR: {documents_dir} (exists: {documents_dir.exists()})")
65
  print(f"[root_app] PROCESSED_JSON: {processed_json} (exists: {processed_json.exists()})")
66
-
 
 
 
67
  initialize_rag_system()
68
  print("[root_app] RAG system initialization completed")
69
  except Exception as e:
 
3
  This file serves both the FastAPI backend and React frontend
4
  """
5
  import os
6
+ import subprocess
7
+ import sys
8
  from contextlib import asynccontextmanager
9
  from pathlib import Path
10
  from fastapi import FastAPI, Request
 
14
  from dotenv import load_dotenv
15
  from backend.main import app as backend_app, initialize_rag_system
16
 
17
+
18
+ def run_files_upload_script(script_path: Path) -> None:
19
+ """Run the files_upload.py helper to ensure documents are available."""
20
+ if not script_path.exists():
21
+ print(f"[root_app] Upload script not found at {script_path}")
22
+ return
23
+
24
+ print(f"[root_app] Running upload script: {script_path}")
25
+ try:
26
+ completed = subprocess.run(
27
+ [sys.executable, str(script_path)],
28
+ capture_output=True,
29
+ text=True,
30
+ check=True,
31
+ )
32
+ if completed.stdout:
33
+ print(f"[root_app] upload script output:\n{completed.stdout}")
34
+ if completed.stderr:
35
+ print(f"[root_app] upload script warnings:\n{completed.stderr}")
36
+ except subprocess.CalledProcessError as exc:
37
+ print(f"[root_app] WARNING: upload script failed with code {exc.returncode}")
38
+ if exc.stdout:
39
+ print(f"[root_app] upload script stdout:\n{exc.stdout}")
40
+ if exc.stderr:
41
+ print(f"[root_app] upload script stderr:\n{exc.stderr}")
42
+ raise
43
+
44
  @asynccontextmanager
45
  async def lifespan(app: FastAPI):
46
  """Lifespan context manager for FastAPI startup and shutdown"""
 
57
  project_root = Path(__file__).parent
58
  documents_dir = project_root / "documents"
59
  processed_json = project_root / "processed_documents.json"
60
+ upload_script = project_root / "files_upload.py"
61
 
62
  # Load environment variables from .env file (if it exists)
63
  env_path = project_root / ".env"
 
93
  print(f"[root_app] PROJECT_ROOT: {project_root}")
94
  print(f"[root_app] DOCUMENTS_DIR: {documents_dir} (exists: {documents_dir.exists()})")
95
  print(f"[root_app] PROCESSED_JSON: {processed_json} (exists: {processed_json.exists()})")
96
+
97
+ # Run the upload script before initializing the RAG system
98
+ run_files_upload_script(upload_script)
99
+
100
  initialize_rag_system()
101
  print("[root_app] RAG system initialization completed")
102
  except Exception as e:
backend/main.py CHANGED
@@ -107,9 +107,9 @@ def initialize_rag_system():
107
  print("[RAG Init] Starting initialization (import-time)")
108
 
109
  # Ensure documents folder exists
110
- if not DOCUMENTS_DIR.exists():
111
- DOCUMENTS_DIR.mkdir(parents=True, exist_ok=True)
112
- print(f"[RAG Init] Created documents folder at {DOCUMENTS_DIR}")
113
 
114
  rag_system = RAGSystem()
115
 
 
107
  print("[RAG Init] Starting initialization (import-time)")
108
 
109
  # Ensure documents folder exists
110
+ # if not DOCUMENTS_DIR.exists():
111
+ # DOCUMENTS_DIR.mkdir(parents=True, exist_ok=True)
112
+ # print(f"[RAG Init] Created documents folder at {DOCUMENTS_DIR}")
113
 
114
  rag_system = RAGSystem()
115
 
files_upload.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import HfApi
2
+ from pathlib import Path
3
+ import os
4
+ from dotenv import load_dotenv
5
+
6
+ # Load environment variables from .env if present
7
+ load_dotenv()
8
+
9
+ # Get token from environment variable (more secure)
10
+ token = os.getenv("HF_TOKEN")
11
+ if not token:
12
+ raise ValueError("HF_TOKEN environment variable not set. Set it with: export HF_TOKEN='your_token_here'")
13
+
14
+ # Initialize API with token
15
+ api = HfApi(token=token)
16
+ repo_id = "AldawsariNLP/Saudi-Law-AI-Assistant"
17
+
18
+ # Upload all PDFs from local documents folder
19
+ local_docs = Path("documents")
20
+ pdf_files = list(local_docs.glob("*.pdf"))
21
+
22
+ if not pdf_files:
23
+ print("No PDF files found in documents/ folder; skipping upload.")
24
+ exit(0)
25
+
26
+ print(f"Found {len(pdf_files)} PDF file(s) to upload")
27
+ for pdf_file in pdf_files:
28
+ print(f"Uploading {pdf_file.name}...")
29
+ try:
30
+ api.upload_file(
31
+ path_or_fileobj=str(pdf_file),
32
+ path_in_repo=f"documents/{pdf_file.name}",
33
+ repo_id=repo_id,
34
+ repo_type="space",
35
+ token=token, # Also pass token here for safety
36
+ )
37
+ print(f"✓ Successfully uploaded {pdf_file.name}")
38
+ except Exception as e:
39
+ print(f"✗ Failed to upload {pdf_file.name}: {e}")
40
+ raise
41
+
42
+ print("Upload complete!")
pyproject.toml CHANGED
@@ -10,6 +10,7 @@ dependencies = [
10
  "langchain==0.1.16",
11
  "langchain-community==0.0.36",
12
  "openai>=1.50.0",
 
13
  "httpx==0.27.2",
14
  "requests==2.31.0",
15
  "faiss-cpu==1.7.4",
 
10
  "langchain==0.1.16",
11
  "langchain-community==0.0.36",
12
  "openai>=1.50.0",
13
+ "huggingface_hub>=0.25.0",
14
  "httpx==0.27.2",
15
  "requests==2.31.0",
16
  "faiss-cpu==1.7.4",