Commit ·
1ab35b7
1
Parent(s): 9e2ed70
uploading pdfs...13
Browse files
- Dockerfile +3 -1
- backend/main.py +54 -6
Dockerfile
CHANGED
|
@@ -2,12 +2,14 @@ FROM python:3.10-slim
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
-
# Install system dependencies and uv
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
build-essential \
|
| 8 |
curl \
|
| 9 |
git \
|
|
|
|
| 10 |
&& rm -rf /var/lib/apt/lists/* \
|
|
|
|
| 11 |
&& pip install uv
|
| 12 |
|
| 13 |
# Copy pyproject.toml and uv.lock for dependency management
|
|
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
+
# Install system dependencies, git-lfs, and uv
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
build-essential \
|
| 8 |
curl \
|
| 9 |
git \
|
| 10 |
+
git-lfs \
|
| 11 |
&& rm -rf /var/lib/apt/lists/* \
|
| 12 |
+
&& git lfs install \
|
| 13 |
&& pip install uv
|
| 14 |
|
| 15 |
# Copy pyproject.toml and uv.lock for dependency management
|
backend/main.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import sys
|
|
|
|
| 2 |
from contextlib import asynccontextmanager
|
| 3 |
from fastapi import FastAPI, HTTPException, Query
|
| 4 |
from fastapi.middleware.cors import CORSMiddleware
|
|
@@ -7,7 +8,7 @@ from pydantic import BaseModel
|
|
| 7 |
from typing import List, Optional
|
| 8 |
import os
|
| 9 |
from pathlib import Path
|
| 10 |
-
from urllib.parse import quote
|
| 11 |
from dotenv import load_dotenv # load env vars
|
| 12 |
try:
|
| 13 |
from backend.rag_system import RAGSystem
|
|
@@ -215,15 +216,62 @@ async def clear_history():
|
|
| 215 |
async def get_document(filename: str, mode: str = Query("download", enum=["download", "preview"])):
|
| 216 |
"""Serve processed document files for preview or download"""
|
| 217 |
documents_dir = DOCUMENTS_DIR.resolve()
|
| 218 |
-
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
# Prevent directory traversal
|
| 221 |
if documents_dir not in file_path.parents and file_path != documents_dir:
|
| 222 |
raise HTTPException(status_code=403, detail="Access denied")
|
| 223 |
-
|
|
|
|
| 224 |
if not file_path.exists():
|
| 225 |
-
print(f"[get_document] Document not found: {file_path}")
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
file_extension = file_path.suffix.lower()
|
| 229 |
|
|
|
|
| 1 |
import sys
|
| 2 |
+
import unicodedata
|
| 3 |
from contextlib import asynccontextmanager
|
| 4 |
from fastapi import FastAPI, HTTPException, Query
|
| 5 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 8 |
from typing import List, Optional
|
| 9 |
import os
|
| 10 |
from pathlib import Path
|
| 11 |
+
from urllib.parse import quote, unquote
|
| 12 |
from dotenv import load_dotenv # load env vars
|
| 13 |
try:
|
| 14 |
from backend.rag_system import RAGSystem
|
|
|
|
| 216 |
async def get_document(filename: str, mode: str = Query("download", enum=["download", "preview"])):
|
| 217 |
"""Serve processed document files for preview or download"""
|
| 218 |
documents_dir = DOCUMENTS_DIR.resolve()
|
| 219 |
+
|
| 220 |
+
# Decode URL-encoded filename
|
| 221 |
+
decoded_filename = unquote(filename)
|
| 222 |
+
|
| 223 |
+
# Try direct path first
|
| 224 |
+
file_path = (documents_dir / decoded_filename).resolve()
|
| 225 |
+
|
| 226 |
# Prevent directory traversal
|
| 227 |
if documents_dir not in file_path.parents and file_path != documents_dir:
|
| 228 |
raise HTTPException(status_code=403, detail="Access denied")
|
| 229 |
+
|
| 230 |
+
# If file doesn't exist, try to find it by matching actual files in directory
|
| 231 |
if not file_path.exists():
|
| 232 |
+
print(f"[get_document] Document not found at direct path: {file_path}")
|
| 233 |
+
print(f"[get_document] Searching for filename: {decoded_filename}")
|
| 234 |
+
|
| 235 |
+
# List all PDF files in documents directory
|
| 236 |
+
actual_files = list(documents_dir.glob("*.pdf"))
|
| 237 |
+
print(f"[get_document] Found {len(actual_files)} PDF files in directory")
|
| 238 |
+
|
| 239 |
+
# Normalize the requested filename for comparison
|
| 240 |
+
def normalize_name(name: str) -> str:
    """Return a canonical form of a filename for equality comparison.

    The result is lower-cased, has surrounding whitespace removed, has a
    single trailing ``.pdf`` extension removed (matched case-insensitively),
    and is Unicode-normalized to NFC so that composed and decomposed
    spellings of the same character compare equal.

    Args:
        name: The filename to normalize, with or without a ``.pdf`` suffix.

    Returns:
        The normalized base name.
    """
    extension = ".pdf"
    # Strip and lower-case first so that "Report.PDF " is still recognized
    # as carrying the extension.
    base_name = name.strip().lower()
    # Remove only the trailing extension; a ".pdf" that appears in the
    # middle of the name is part of the name and must be preserved,
    # otherwise distinct files could normalize to the same key.
    if base_name.endswith(extension):
        base_name = base_name[: -len(extension)]
    # Normalize unicode (NFD -> NFC to handle composed vs decomposed)
    normalized = unicodedata.normalize("NFC", base_name)
    # A name such as "report .pdf" leaves trailing whitespace behind once
    # the extension is gone.
    return normalized.strip()
|
| 247 |
+
|
| 248 |
+
requested_normalized = normalize_name(decoded_filename)
|
| 249 |
+
|
| 250 |
+
# Try to find matching file
|
| 251 |
+
matched_file = None
|
| 252 |
+
for actual_file in actual_files:
|
| 253 |
+
actual_name = actual_file.name
|
| 254 |
+
actual_normalized = normalize_name(actual_name)
|
| 255 |
+
|
| 256 |
+
print(f"[get_document] Comparing: '{requested_normalized}' with '{actual_normalized}'")
|
| 257 |
+
|
| 258 |
+
if requested_normalized == actual_normalized:
|
| 259 |
+
matched_file = actual_file
|
| 260 |
+
print(f"[get_document] Found match: {actual_file.name}")
|
| 261 |
+
break
|
| 262 |
+
|
| 263 |
+
if matched_file:
|
| 264 |
+
file_path = matched_file.resolve()
|
| 265 |
+
else:
|
| 266 |
+
# Log all available files for debugging
|
| 267 |
+
print(f"[get_document] Available files in directory:")
|
| 268 |
+
for f in actual_files:
|
| 269 |
+
print(f"[get_document] - {f.name}")
|
| 270 |
+
print(f"[get_document] Requested filename (normalized): {requested_normalized}")
|
| 271 |
+
raise HTTPException(
|
| 272 |
+
status_code=404,
|
| 273 |
+
detail=f"Document not found: {decoded_filename}. Available files: {[f.name for f in actual_files]}"
|
| 274 |
+
)
|
| 275 |
|
| 276 |
file_extension = file_path.suffix.lower()
|
| 277 |
|