AldawsariNLP committed on
Commit
1ab35b7
·
1 Parent(s): 9e2ed70

uploading pdfs...13

Browse files
Files changed (2) hide show
  1. Dockerfile +3 -1
  2. backend/main.py +54 -6
Dockerfile CHANGED
@@ -2,12 +2,14 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install system dependencies and uv
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
  curl \
9
  git \
 
10
  && rm -rf /var/lib/apt/lists/* \
 
11
  && pip install uv
12
 
13
  # Copy pyproject.toml and uv.lock for dependency management
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies, git-lfs, and uv
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
  curl \
9
  git \
10
+ git-lfs \
11
  && rm -rf /var/lib/apt/lists/* \
12
+ && git lfs install \
13
  && pip install uv
14
 
15
  # Copy pyproject.toml and uv.lock for dependency management
backend/main.py CHANGED
@@ -1,4 +1,5 @@
1
  import sys
 
2
  from contextlib import asynccontextmanager
3
  from fastapi import FastAPI, HTTPException, Query
4
  from fastapi.middleware.cors import CORSMiddleware
@@ -7,7 +8,7 @@ from pydantic import BaseModel
7
  from typing import List, Optional
8
  import os
9
  from pathlib import Path
10
- from urllib.parse import quote
11
  from dotenv import load_dotenv # load env vars
12
  try:
13
  from backend.rag_system import RAGSystem
@@ -215,15 +216,62 @@ async def clear_history():
215
  async def get_document(filename: str, mode: str = Query("download", enum=["download", "preview"])):
216
  """Serve processed document files for preview or download"""
217
  documents_dir = DOCUMENTS_DIR.resolve()
218
- file_path = (documents_dir / filename).resolve()
219
-
 
 
 
 
 
220
  # Prevent directory traversal
221
  if documents_dir not in file_path.parents and file_path != documents_dir:
222
  raise HTTPException(status_code=403, detail="Access denied")
223
-
 
224
  if not file_path.exists():
225
- print(f"[get_document] Document not found: {file_path}")
226
- raise HTTPException(status_code=404, detail="Document not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
  file_extension = file_path.suffix.lower()
229
 
 
1
  import sys
2
+ import unicodedata
3
  from contextlib import asynccontextmanager
4
  from fastapi import FastAPI, HTTPException, Query
5
  from fastapi.middleware.cors import CORSMiddleware
 
8
  from typing import List, Optional
9
  import os
10
  from pathlib import Path
11
+ from urllib.parse import quote, unquote
12
  from dotenv import load_dotenv # load env vars
13
  try:
14
  from backend.rag_system import RAGSystem
 
216
  async def get_document(filename: str, mode: str = Query("download", enum=["download", "preview"])):
217
  """Serve processed document files for preview or download"""
218
  documents_dir = DOCUMENTS_DIR.resolve()
219
+
220
+ # Decode URL-encoded filename
221
+ decoded_filename = unquote(filename)
222
+
223
+ # Try direct path first
224
+ file_path = (documents_dir / decoded_filename).resolve()
225
+
226
  # Prevent directory traversal
227
  if documents_dir not in file_path.parents and file_path != documents_dir:
228
  raise HTTPException(status_code=403, detail="Access denied")
229
+
230
+ # If file doesn't exist, try to find it by matching actual files in directory
231
  if not file_path.exists():
232
+ print(f"[get_document] Document not found at direct path: {file_path}")
233
+ print(f"[get_document] Searching for filename: {decoded_filename}")
234
+
235
+ # List all PDF files in documents directory
236
+ actual_files = list(documents_dir.glob("*.pdf"))
237
+ print(f"[get_document] Found {len(actual_files)} PDF files in directory")
238
+
239
+ # Normalize the requested filename for comparison
240
+ def normalize_name(name: str) -> str:
241
+ """Normalize filename for comparison (handle encoding variations)"""
242
+ # Remove .pdf extension for comparison
243
+ base_name = name.replace(".pdf", "").lower()
244
+ # Normalize unicode (NFD -> NFC to handle composed vs decomposed)
245
+ normalized = unicodedata.normalize("NFC", base_name)
246
+ return normalized.strip()
247
+
248
+ requested_normalized = normalize_name(decoded_filename)
249
+
250
+ # Try to find matching file
251
+ matched_file = None
252
+ for actual_file in actual_files:
253
+ actual_name = actual_file.name
254
+ actual_normalized = normalize_name(actual_name)
255
+
256
+ print(f"[get_document] Comparing: '{requested_normalized}' with '{actual_normalized}'")
257
+
258
+ if requested_normalized == actual_normalized:
259
+ matched_file = actual_file
260
+ print(f"[get_document] Found match: {actual_file.name}")
261
+ break
262
+
263
+ if matched_file:
264
+ file_path = matched_file.resolve()
265
+ else:
266
+ # Log all available files for debugging
267
+ print(f"[get_document] Available files in directory:")
268
+ for f in actual_files:
269
+ print(f"[get_document] - {f.name}")
270
+ print(f"[get_document] Requested filename (normalized): {requested_normalized}")
271
+ raise HTTPException(
272
+ status_code=404,
273
+ detail=f"Document not found: {decoded_filename}. Available files: {[f.name for f in actual_files]}"
274
+ )
275
 
276
  file_extension = file_path.suffix.lower()
277