AldawsariNLP committed
Commit 97cdc7c · 1 Parent(s): 81872f1

pushing last changes ...
.gitignore CHANGED
@@ -25,6 +25,9 @@ vectorstore/
 *.pkl
 *.faiss
 
+ # Chunk vectorstores (binary files, use HF Xet if needed)
+ chunk_vectorstores/
+
 # Processed documents JSON - included in repository (as requested)
 # processed_documents.json
 
backend/document_processor.py CHANGED
@@ -209,9 +209,7 @@ Return ONLY a single JSON object, with EXACTLY these two fields:
 
         if existing_filenames:
             print(f"Found {len(existing_filenames)} already processed documents")
-             print(f"Existing filenames (original): {list(existing_filenames)}")
-             print(f"Existing filenames (normalized): {list(existing_filenames_normalized)}")
-
+
         pdf_files = list(folder.glob("*.pdf"))
         new_processed_docs = []
         skipped_count = 0
@@ -221,7 +219,6 @@ Return ONLY a single JSON object, with EXACTLY these two fields:
             filename_normalized = self._normalize_filename(filename)
 
             # Debug: Print comparison attempt
-             print(f"[Filename Check] Checking: '{filename}' (normalized: '{filename_normalized}')")
 
             # Skip if already processed (using normalized comparison)
             if skip_existing and filename_normalized in existing_filenames_normalized:
backend/main.py CHANGED
@@ -60,16 +60,16 @@ async def lifespan(app: FastAPI):
 
     # Load environment variables from .env file with debug output
     env_path = PROJECT_ROOT / ".env"
-     print(f"[Lifespan] .env file path: {env_path}")
-     print(f"[Lifespan] .env file exists? {env_path.exists()}")
+     # print(f"[Lifespan] .env file path: {env_path}")
+     # print(f"[Lifespan] .env file exists? {env_path.exists()}")
 
     if env_path.exists():
         load_dotenv(env_path, override=True)
         api_key = os.getenv("OPENAI_API_KEY")
-         if api_key:
-             print(f"[Lifespan] OPENAI_API_KEY found (length: {len(api_key)} characters)")
-         else:
-             print("[Lifespan] WARNING: OPENAI_API_KEY not found in .env file")
+         # if api_key:
+         #     print(f"[Lifespan] OPENAI_API_KEY found (length: {len(api_key)} characters)")
+         # else:
+         #     print("[Lifespan] WARNING: OPENAI_API_KEY not found in .env file")
     else:
         print(f"[Lifespan] WARNING: .env file not found at {env_path}")
         # Try loading anyway in case it's in a different location
@@ -131,10 +131,10 @@ def initialize_rag_system():
 
     rag_system = RAGSystem()
 
-     print(f"[RAG Init] processed_documents.json path: {PROCESSED_JSON}")
-     print(f"[RAG Init] processed_documents.json exists? {PROCESSED_JSON.exists()}")
-     print(f"[RAG Init] documents folder path: {DOCUMENTS_DIR}")
-     print(f"[RAG Init] documents folder exists? {DOCUMENTS_DIR.exists()}")
+     # print(f"[RAG Init] processed_documents.json path: {PROCESSED_JSON}")
+     # print(f"[RAG Init] processed_documents.json exists? {PROCESSED_JSON.exists()}")
+     # print(f"[RAG Init] documents folder path: {DOCUMENTS_DIR}")
+     # print(f"[RAG Init] documents folder exists? {DOCUMENTS_DIR.exists()}")
 
     if DOCUMENTS_DIR.exists() and any(DOCUMENTS_DIR.glob("*.pdf")):
         print("[RAG Init] PDFs detected, processing...")
@@ -197,8 +197,7 @@ async def ask_question(request: QuestionRequest):
             model_provider=request.model_provider ,
             context_mode=request.context_mode or "full",
         )
-         request_time = (time.perf_counter() - request_start) * 1000
-         print(f"[Timing] Total /ask endpoint time: {request_time:.2f}ms")
+
         return QuestionResponse(answer=answer, sources=sources)
     except Exception as e:
         raise HTTPException(
backend/rag_system.py CHANGED
@@ -1,6 +1,9 @@
 import os
 import json
 import time
+ import pickle
+ import hashlib
+ import re
 from pathlib import Path
 from typing import List, Tuple, Optional, Dict
 from langchain_community.vectorstores import FAISS
@@ -38,6 +41,16 @@ class RAGSystem:
         self.json_path = json_path
         self.vectorstore = None
 
+         # Chunk vectorstores directory path
+         if json_path is None:
+             project_root = Path(__file__).resolve().parents[1]
+             self.chunk_vectorstores_path = str(project_root / "chunk_vectorstores")
+         else:
+             project_root = Path(json_path).parent
+             self.chunk_vectorstores_path = str(project_root / "chunk_vectorstores")
+         # Create directory if it doesn't exist
+         os.makedirs(self.chunk_vectorstores_path, exist_ok=True)
+
         # Initialize embeddings (supports OpenAI or HuggingFace based on EMBEDDINGS_PROVIDER env var)
         provider = os.getenv("EMBEDDINGS_PROVIDER", "openai").lower()
         if provider in ["huggingface", "hf", "nebius"]:
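For orientation, a small sketch of where the two branches above place the new directory (the example paths are invented):

```python
from pathlib import Path

# json_path given: the store sits next to processed_documents.json
json_path = "/app/processed_documents.json"           # illustrative path
print(Path(json_path).parent / "chunk_vectorstores")  # -> /app/chunk_vectorstores

# json_path is None: parents[1] of backend/rag_system.py is the project root,
# so the fallback is <repo root>/chunk_vectorstores
print(Path("/app/backend/rag_system.py").resolve().parents[1] / "chunk_vectorstores")
```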
@@ -51,6 +64,7 @@ class RAGSystem:
         if not embeddings_api_key:
             raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.")
 
+         print(f"[RAGSystem] Using embeddings provider: {provider}")
         self.embeddings = get_embeddings_wrapper(api_key=embeddings_api_key)
 
         # Initialize document processor (always uses OpenAI for LLM processing)
@@ -87,22 +101,13 @@ class RAGSystem:
                 embeddings=self.embeddings,
                 allow_dangerous_deserialization=True
             )
-             # Ensure embedding function is properly set
-             # FAISS may use either embedding_function attribute or call embeddings directly
-             # Set embedding_function to the embed_query method for compatibility
-             if not hasattr(self.vectorstore, 'embedding_function') or self.vectorstore.embedding_function is None:
-                 self.vectorstore.embedding_function = self.embeddings.embed_query
-             elif not callable(self.vectorstore.embedding_function):
-                 self.vectorstore.embedding_function = self.embeddings.embed_query
-
-             # Also ensure the embeddings object itself is accessible and callable
-             # This handles cases where FAISS tries to call the embeddings object directly
-             if hasattr(self.vectorstore, 'embeddings'):
-                 self.vectorstore.embeddings = self.embeddings
-
-             # Verify embedding function is working
-             if not callable(self.vectorstore.embedding_function):
-                 raise ValueError("Embedding function is not callable after initialization")
+             # Ensure embedding function is properly set.
+             # LangChain now expects an Embeddings *object* here, not a raw function.
+             if not hasattr(self.vectorstore, "embedding_function") or self.vectorstore.embedding_function is None:
+                 self.vectorstore.embedding_function = self.embeddings
+             # Some versions may set a non-callable placeholder; override with our wrapper.
+             elif not callable(getattr(self.vectorstore.embedding_function, "embed_query", None)):
+                 self.vectorstore.embedding_function = self.embeddings
             print(f"Loaded existing vectorstore from {self.vectorstore_path}")
         except Exception as e:
             print(f"Could not load existing vectorstore: {e}")
@@ -286,6 +291,119 @@ class RAGSystem:
 
         return None
 
+     def _sanitize_filename(self, filename: str) -> str:
+         """
+         Sanitize filename to create a safe directory name.
+         Handles Arabic filenames and special characters.
+
+         Args:
+             filename: Original filename
+
+         Returns:
+             Sanitized directory name safe for filesystem
+         """
+         # Remove extension
+         name_without_ext = Path(filename).stem
+
+         # Create a hash of the original filename for uniqueness
+         # This ensures Arabic and special characters are handled
+         filename_hash = hashlib.md5(filename.encode('utf-8')).hexdigest()[:8]
+
+         # Sanitize: keep alphanumeric, Arabic characters, spaces, hyphens, underscores
+         # Replace other special chars with underscore
+         sanitized = re.sub(r'[^\w\s\u0600-\u06FF\-]', '_', name_without_ext)
+         # Replace multiple spaces/underscores with single underscore
+         sanitized = re.sub(r'[\s_]+', '_', sanitized)
+         # Remove leading/trailing underscores
+         sanitized = sanitized.strip('_')
+
+         # Combine sanitized name with hash for uniqueness
+         if sanitized:
+             return f"{sanitized}_{filename_hash}"
+         else:
+             return filename_hash
+
+     def _get_chunk_vectorstore_path(self, filename: str) -> str:
+         """
+         Get the directory path for a document's chunk vectorstore.
+
+         Args:
+             filename: Document filename
+
+         Returns:
+             Path to the directory containing the chunk vectorstore
+         """
+         sanitized_name = self._sanitize_filename(filename)
+         return str(Path(self.chunk_vectorstores_path) / sanitized_name)
+
+     def _save_chunk_vectorstore(self, filename: str, vectorstore: FAISS, chunks: List[Document]) -> None:
+         """
+         Save chunk vectorstore and chunks metadata to disk.
+
+         Args:
+             filename: Document filename
+             vectorstore: FAISS vectorstore to save
+             chunks: List of Document objects (chunks metadata)
+         """
+         chunk_vs_path = self._get_chunk_vectorstore_path(filename)
+         os.makedirs(chunk_vs_path, exist_ok=True)
+
+         # Save FAISS vectorstore (saves index.faiss and index.pkl)
+         vectorstore.save_local(chunk_vs_path)
+
+         # Save chunks metadata as pickle
+         chunks_path = Path(chunk_vs_path) / "chunks.pkl"
+         with open(chunks_path, 'wb') as f:
+             pickle.dump(chunks, f)
+
+         print(f"[Chunk Vectorstore] Saved chunk vectorstore for '{filename}'")
+
+     def _load_chunk_vectorstore(self, filename: str) -> Optional[Tuple[FAISS, List[Document]]]:
+         """
+         Load chunk vectorstore and chunks metadata from disk if exists.
+
+         Args:
+             filename: Document filename
+
+         Returns:
+             Tuple of (FAISS vectorstore, List[Document]) if found, None otherwise
+         """
+         chunk_vs_path = self._get_chunk_vectorstore_path(filename)
+         chunk_vs_path_obj = Path(chunk_vs_path)
+
+         # Check if vectorstore files exist
+         faiss_index = chunk_vs_path_obj / "index.faiss"
+         faiss_pkl = chunk_vs_path_obj / "index.pkl"
+         chunks_pkl = chunk_vs_path_obj / "chunks.pkl"
+
+         if not (faiss_index.exists() and faiss_pkl.exists() and chunks_pkl.exists()):
+             return None
+
+         try:
+             # Load FAISS vectorstore
+             vectorstore = FAISS.load_local(
+                 chunk_vs_path,
+                 embeddings=self.embeddings,
+                 allow_dangerous_deserialization=True
+             )
+
+             # Ensure embedding function is properly set to the embeddings wrapper object.
+             if not hasattr(vectorstore, "embedding_function") or vectorstore.embedding_function is None:
+                 vectorstore.embedding_function = self.embeddings
+             elif not callable(getattr(vectorstore.embedding_function, "embed_query", None)):
+                 vectorstore.embedding_function = self.embeddings
+
+             # Load chunks metadata
+             with open(chunks_pkl, 'rb') as f:
+                 chunks = pickle.load(f)
+
+             print(f"[Chunk Vectorstore] Loaded chunk vectorstore for '{filename}'")
+             return vectorstore, chunks
+
+         except Exception as e:
+             print(f"[Chunk Vectorstore] Error loading chunk vectorstore for '{filename}': {e}")
+             return None
+
     def _get_or_build_chunk_vectorstore(
         self,
         filename: str,
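A standalone mirror of the sanitization scheme, handy for eyeballing the directory names it produces (the sample filename is invented):

```python
import hashlib
import re
from pathlib import Path

def sanitize_filename(filename: str) -> str:
    # Same steps as RAGSystem._sanitize_filename above.
    name_without_ext = Path(filename).stem
    filename_hash = hashlib.md5(filename.encode('utf-8')).hexdigest()[:8]
    sanitized = re.sub(r'[^\w\s\u0600-\u06FF\-]', '_', name_without_ext)
    sanitized = re.sub(r'[\s_]+', '_', sanitized)
    sanitized = sanitized.strip('_')
    return f"{sanitized}_{filename_hash}" if sanitized else filename_hash

# Arabic letters survive, parentheses become underscores, and the md5 suffix
# keeps lookalike filenames in separate directories:
print(sanitize_filename("نظام الأحوال الشخصية (2022).pdf"))
# -> نظام_الأحوال_الشخصية_2022_<8 hex chars>
```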
@@ -294,7 +412,8 @@ class RAGSystem:
         chunk_overlap: int = 300
     ) -> Tuple[FAISS, List[Document]]:
         """
-         Build or retrieve an in-memory FAISS vectorstore of semantic chunks for a single document.
+         Build or retrieve a FAISS vectorstore of semantic chunks for a single document.
+         Checks the in-memory cache first, then disk, then builds if needed.
 
         Args:
             filename: Document filename used as key in cache/metadata
@@ -305,11 +424,25 @@ class RAGSystem:
         Returns:
             Tuple of (FAISS vectorstore over chunks, list of chunk Documents)
         """
-         # Return from cache if available
+         # Step 1: Return from memory cache if available (fastest)
         if filename in self._chunk_cache:
             entry = self._chunk_cache[filename]
             return entry["vectorstore"], entry["chunks"]  # type: ignore[return-value]
 
+         # Step 2: Try to load from disk
+         loaded = self._load_chunk_vectorstore(filename)
+         if loaded is not None:
+             vectorstore, chunks = loaded
+             # Cache in memory for faster access
+             self._chunk_cache[filename] = {
+                 "vectorstore": vectorstore,
+                 "chunks": chunks,
+             }
+             return vectorstore, chunks
+
+         # Step 3: Build new vectorstore (not found in cache or disk)
+         print(f"[Chunk Vectorstore] Building new chunk vectorstore for '{filename}'")
+
         # Create text splitter tuned for Arabic legal text
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
@@ -351,10 +484,16 @@ class RAGSystem:
         ]
 
         chunk_vectorstore = FAISS.from_documents(chunk_docs, embedding=self.embeddings)
+
+         # Step 4: Save to disk for future use
+         self._save_chunk_vectorstore(filename, chunk_vectorstore, chunk_docs)
+
+         # Step 5: Cache in memory for current session
         self._chunk_cache[filename] = {
             "vectorstore": chunk_vectorstore,
             "chunks": chunk_docs,
         }
+
         return chunk_vectorstore, chunk_docs
 
     def _classify_question(self, question: str, use_history: bool = True, model_provider: str = "openai") -> Tuple[str, Optional[str], Optional[List[str]], Optional[List[str]]]:
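A hedged smoke test of the new memory → disk → build flow (RAGSystem and the private methods come from this diff; the document filename is an assumption):

```python
rag = RAGSystem()

# First call: nothing cached, so the store is built, saved under
# chunk_vectorstores/<sanitized name>/, and cached in memory.
vs, chunks = rag._get_or_build_chunk_vectorstore("some_law.pdf")  # hypothetical file

# Drop the in-memory cache; the second call should hit the disk copy
# instead of re-embedding the whole document.
rag._chunk_cache.clear()
vs2, chunks2 = rag._get_or_build_chunk_vectorstore("some_law.pdf")
assert len(chunks) == len(chunks2)
```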
@@ -531,9 +670,9 @@ Respond with ONLY one of: "law-new", "law-followup", or provide an answer if it'
         if use_history:
             previous_document = self.chat_history.get_last_document()
 
-         # Build search query with last chat turn context if history is enabled
+         # Build search query with last chat turn context only for follow-up questions
         search_query = question
-         if use_history:
+         if use_history and label == "law-followup":
             last_turn = self.chat_history.get_last_turn()
             if last_turn:
                 # Format last turn as text
@@ -550,39 +689,44 @@ Respond with ONLY one of: "law-new", "law-followup", or provide an answer if it'
         # Perform similarity search with scores for relevance checking
         # Use k=3 to get multiple candidates for comparison
         similar_docs_with_scores = self.vectorstore.similarity_search_with_score(search_query, k=3)
-         search_time = (time.perf_counter() - search_start) * 1000
-         print(f"[Timing] Similarity search: {search_time:.2f}ms")
 
         if not similar_docs_with_scores:
             return "I couldn't find any relevant information to answer your question.", [], None
 
         # Extract best matching document and score
+         best_doc, best_score = similar_docs_with_scores[0]
+         print(f"[RAG] All document scores:")
+         for idx, (doc, score) in enumerate(similar_docs_with_scores, 1):
+             filename = doc.metadata.get("filename", "unknown")
+             print(f" {idx}. {filename}: {score:.4f}")
+         print(f"[RAG] Best document: {best_score:.4f}")
+
         best_doc, best_score = similar_docs_with_scores[0]
         best_filename = best_doc.metadata.get("filename", "")
 
         # Step 2: Check if we should reuse previous document
+         matched_doc = best_doc
         matched_filename = best_filename
         if previous_document and use_history:
-             # Check if previous document is in the search results
-             previous_doc_found = False
+             # Check if previous document is in the search results and capture doc object
             previous_doc_score = None
+             previous_doc_obj = None
 
             for doc, score in similar_docs_with_scores:
                 filename = doc.metadata.get("filename", "")
                 if filename == previous_document:
-                     previous_doc_found = True
                     previous_doc_score = score
+                     previous_doc_obj = doc
                     break
 
-             if previous_doc_found and previous_doc_score is not None:
+             if previous_doc_score is not None:
                 # Check if previous document score is close to best score
                 # FAISS returns distance scores (lower is better), so we compare the difference
                 score_difference = abs(best_score - previous_doc_score)
-                 # If difference is small (within 0.15), reuse previous document
-                 # This threshold can be adjusted based on testing
                 relevance_threshold = 0.15
 
                 if score_difference <= relevance_threshold:
+                     matched_doc = previous_doc_obj
                     matched_filename = previous_document
                     print(f"[RAG] Reusing previous document: {matched_filename} (score diff: {score_difference:.4f})")
                 else:
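The reuse rule above in isolation, with made-up scores (similarity_search_with_score returns FAISS distances, lower is more similar):

```python
best_score = 0.42          # distance of the best-matching document
previous_doc_score = 0.51  # distance of the conversation's previous document
relevance_threshold = 0.15

# 0.09 <= 0.15, so the previous document wins and the conversation stays
# anchored to the same law instead of hopping between documents.
score_difference = abs(best_score - previous_doc_score)
print(score_difference <= relevance_threshold)  # True
```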
@@ -590,41 +734,16 @@ Respond with ONLY one of: "law-new", "law-followup", or provide an answer if it'
             else:
                 print(f"[RAG] Previous document not in top results, using best match: {best_filename}")
 
-         # Get the matched document object
-         matched_doc = None
-         for doc, _ in similar_docs_with_scores:
-             if doc.metadata.get("filename", "") == matched_filename:
-                 matched_doc = doc
-                 break
-
-         # If matched document not found in results (shouldn't happen), use best match
-         if matched_doc is None:
-             matched_doc = best_doc
-             matched_filename = best_filename
-
         # Print the filename and most similar summary
         print(f"[RAG] Matched filename: {matched_filename}")
 
-
-         if not matched_filename:
-             return "Error: No filename found in matched document metadata.", [], None
-
         # Step 3: Retrieve full text from JSON (with caching)
         retrieval_start = time.perf_counter()
         full_text = self._get_text_by_filename_cached(matched_filename)
-         retrieval_time = (time.perf_counter() - retrieval_start) * 1000
-         print(f"[Timing] Text retrieval from JSON: {retrieval_time:.2f}ms")
+
 
         if not full_text:
-             # Load JSON to get available filenames for error message
-             docs = self._load_json_cached()
-             available_filenames = [doc.get("filename", "unknown") for doc in docs] if isinstance(docs, list) else []
-
-             error_msg = f"Could not retrieve text for document: '{matched_filename}'. "
-             if available_filenames:
-                 error_msg += f"Available filenames in JSON: {', '.join(available_filenames)}"
-             else:
-                 error_msg += "JSON file is empty or invalid."
+             error_msg = f"Could not retrieve text for document: '{matched_filename}'. Please ensure the document is properly processed."
             return error_msg, [matched_filename], None
 
 
@@ -655,14 +774,13 @@ Respond with ONLY one of: "law-new", "law-followup", or provide an answer if it'
             previous_doc = self.chat_history.get_last_document()
             if previous_chunks and previous_doc == matched_filename:
                 print(f"[RAG] Reusing previous chunks for law-followup question ({len(previous_chunks)} chunks)")
-                 selected_chunks = previous_chunks  # Reuse previous chunks
+                 selected_chunks = previous_chunks
                 document_context_label = "Selected Document Excerpts"
                 chunk_texts: List[str] = []
                 for idx, chunk_text in enumerate(previous_chunks, start=1):
                     chunk_texts.append(f"[مقطع {idx}]\n{chunk_text}")
                 document_context = "\n\n".join(chunk_texts)[:25000]
             else:
-                 previous_chunks = None  # Can't reuse, do new search
                 print(f"[RAG] Cannot reuse chunks: law-followup but different document or no previous chunks")
 
         # If not reusing previous chunks, do normal chunk search (for law-new or when reuse not possible)
@@ -748,8 +866,9 @@ MUST Answer the Question in Arabic."""
 
         # Add chat history (excluding the last user message if it's the current question)
         if history_messages:
-             # Add history but skip if last message is the same question
-             for msg in history_messages[:-1] if len(history_messages) > 0 and history_messages[-1].get("content") == question else history_messages:
+             # Skip last message if it's the same as current question
+             skip_last = len(history_messages) > 0 and history_messages[-1].get("content") == question
+             for msg in history_messages[:-1] if skip_last else history_messages:
                 messages.append(msg)
 
         messages.append({"role": "user", "content": user_prompt})
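The extracted skip_last logic, run on a toy history (the messages are invented) to show the duplicate current question being dropped:

```python
question = "ما هي شروط الزواج؟"
history_messages = [
    {"role": "user", "content": "ما هي المادة المتعلقة بالنفقة؟"},
    {"role": "assistant", "content": "(answer about alimony articles)"},
    {"role": "user", "content": "ما هي شروط الزواج؟"},  # same as current question
]

skip_last = len(history_messages) > 0 and history_messages[-1].get("content") == question
kept = history_messages[:-1] if skip_last else history_messages
print(len(kept))  # 2 -- the current question is not sent to the model twice
```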
@@ -796,7 +915,6 @@ MUST Answer the Question in Arabic."""
         parse_start = time.perf_counter()
         answer = self._parse_llm_response(raw_response)
         parse_time = (time.perf_counter() - parse_start) * 1000
-         print(f"[Timing] Response parsing: {parse_time:.2f}ms")
 
         # Step 7: Update chat history with document source and chunks
         self.chat_history.add_message("user", question)
 
frontend/build/asset-manifest.json CHANGED
@@ -1,13 +1,13 @@
 {
   "files": {
     "main.css": "/static/css/main.3a0c885e.css",
-     "main.js": "/static/js/main.882def61.js",
+     "main.js": "/static/js/main.2713a5e5.js",
     "index.html": "/index.html",
     "main.3a0c885e.css.map": "/static/css/main.3a0c885e.css.map",
-     "main.882def61.js.map": "/static/js/main.882def61.js.map"
+     "main.2713a5e5.js.map": "/static/js/main.2713a5e5.js.map"
   },
   "entrypoints": [
     "static/css/main.3a0c885e.css",
-     "static/js/main.882def61.js"
+     "static/js/main.2713a5e5.js"
   ]
 }
frontend/build/index.html CHANGED
@@ -1 +1 @@
- <!doctype html><html lang="en"><head><meta charset="utf-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Law Document RAG Chat Application"/><title>Law Document Assistant</title><script defer="defer" src="/static/js/main.882def61.js"></script><link href="/static/css/main.3a0c885e.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
 
+ <!doctype html><html lang="en"><head><meta charset="utf-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Law Document RAG Chat Application"/><title>Law Document Assistant</title><script defer="defer" src="/static/js/main.2713a5e5.js"></script><link href="/static/css/main.3a0c885e.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
frontend/build/static/css/main.3a0c885e.css.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"static/css/main.3a0c885e.css","mappings":"AAAA,EAGE,qBAAsB,CADtB,SAEF,CAEA,OALE,QAcF,CATA,KAKE,kCAAmC,CACnC,iCAAkC,CAClC,kDAA6D,CAL7D,mIAEY,CAIZ,gBACF,CAEA,KACE,uEAEF,CCpBA,KAGE,kBAAmB,CAGnB,aAAc,CALd,YAAa,CAMb,mEAAmF,CALnF,sBAAuB,CAEvB,gBAAiB,CACjB,YAGF,CAEA,gBAIE,eAAiB,CACjB,kBAAmB,CACnB,gCAA0C,CAC1C,YAAa,CACb,qBAAsB,CALtB,WAAY,CADZ,eAAgB,CAOhB,eAAgB,CARhB,UASF,CAEA,aACE,kDAA6D,CAI7D,+BAAyC,CAHzC,UAAY,CACZ,YAAa,CACb,iBAEF,CAEA,gBAEE,cAAe,CACf,eAAgB,CAFhB,eAGF,CAEA,eAGE,cAAe,CAFf,eAAkB,CAClB,UAEF,CAEA,oBAIE,kBAAmB,CAHnB,QAAO,CACP,eAAgB,CAChB,YAEF,CAEA,iBAGE,UAAW,CADX,iBAAkB,CADlB,iBAGF,CAEA,oBACE,UAAW,CAEX,cAAe,CADf,kBAEF,CAEA,mBAEE,cAAe,CADf,aAEF,CAEA,uBAEE,UAAW,CADX,cAAe,CAEf,iBACF,CAEA,SAGE,yBAA2B,CAD3B,YAAa,CADb,kBAGF,CAEA,kBACE,GACE,SAAU,CACV,0BACF,CACA,GACE,SAAU,CACV,uBACF,CACF,CAMA,iCACE,0BACF,CAEA,iBAGE,kBAAmB,CACnB,8BAAwC,CAHxC,aAAc,CACd,iBAGF,CAEA,+BACE,kDAA6D,CAE7D,6BAA8B,CAD9B,UAEF,CAEA,oCACE,eAAiB,CAEjB,8BAA+B,CAD/B,UAEF,CAEA,gBACE,cAAe,CACf,eAAgB,CAChB,iBAAkB,CAClB,UACF,CAEA,cAGE,oBAAqB,CAErB,aAAc,CAJd,cAAe,CACf,eAAgB,CAEhB,gBAEF,CAEA,oBACE,aACF,CAEA,gBACE,eACF,CAEA,qBACE,eACF,CAEA,kCAGE,aAAc,CADd,kBAEF,CAEA,iBACE,iBACF,CAEA,kBACE,oBACF,CAEA,wBAEE,qCAA2C,CAD3C,aAEF,CAEA,gBACE,OACE,WACF,CACA,IACE,YACF,CACA,OACE,aACF,CACF,CAEA,SAGE,8BAAwC,CACxC,cAAe,CAHf,eAAgB,CAChB,gBAGF,CAEA,uBACE,0BACF,CAEA,gBACE,aAAc,CACd,iBAAkB,CAClB,UACF,CAEA,YACE,eAAgB,CAEhB,QAAS,CADT,SAEF,CAEA,YAGE,YAAa,CACb,qBAAsB,CACtB,OAAQ,CAHR,UAAY,CADZ,aAKF,CAEA,aAEE,UAAW,CADX,eAEF,CAEA,gBACE,YAAa,CACb,QACF,CAEA,aACE,eAAgB,CAChB,WAAY,CACZ,aAAc,CACd,cAAe,CAKf,aAAc,CAHd,cAAe,CACf,eAAgB,CAChB,SAAU,CAEV,gBAAiB,CALjB,yBAMF,CAEA,mBACE,aACF,CAEA,sBACE,aACF,CAEA,4BACE,aACF,CAEA,eAKE,kBAAmB,CAFnB,wBAAyB,CACzB,kBAAmB,CAInB,aAAc,CAPd,WAAY,CAKZ,gBAAiB,CACjB,aAAc,CALd,YAOF,CAEA,gBAGE,kBAAmB,CAFnB,YAAa,CACb,6BAA8B,CAE9B,kBACF,CAEA,mBACE,QACF,CAEA,kBAEE,UAAW,CADX,cAEF,CAEA,eACE,eAAgB,CAChB,WAAY,CAGZ,aAAc,CADd,cAAe,CADf,cAAe,CAGf,eACF,CAEA,qBACE,aACF,CAEA,qBAME,kBAAmB,CAEnB,iBAAkB,CANlB,aAAc,CAGd,cAAe,CADf,QAAS,CAGT,YAAa,CAJb,gBAAiB,CAFjB,oBAQF,CAEA,iBAIE,kBAAmB,CAFnB,YAAa,CACb,sBAAuB,CAFvB,gBAIF,CAEA,WAKE,eAAgB,CAFhB,WAAY,CACZ,iBAAkB,CAElB,mCAA8C,CAJ9C,YAAa,CADb,UAMF,CAEA,iBAGE,eAAiB,CACjB,4BAA6B,CAH7B,YAAa,CAIb,QAAS,CAHT,YAIF,CAEA,eAGE,wBAAyB,CACzB,kBAAmB,CAKnB,aAAc,CARd,QAAO,CAIP,cAAe,CACf,YAAa,CAJb,iBAAkB,CAMlB,gBAAiB,CADjB,gCAGF,CAEA,qBACE,oBACF,CAEA,wBACE,kBAAmB,CACnB,kBACF,CAEA,aAEE,kDAA6D,CAE7D,WAAY,CACZ,kBAAmB,CAFnB,UAAY,CAKZ,cAAe,CAFf,cAAe,CACf,eAAgB,CANhB,iBAAkB,CAQlB,iDACF,CAEA,kCAEE,+BAA+C,CAD/C,0BAEF,CAEA,sBAEE,kBAAmB,CADnB,UAAY,CAEZ,cACF,CAGA,uCACE,SACF,CAEA,6CACE,kBAAmB,CACnB,kBACF,CAEA,6CACE,eAAgB,CAChB,kBACF,CAEA,mDACE,eACF,CAGA,yBACE,gBAEE,eAAgB,CADhB,YAEF,CAEA,iBACE,aACF,CAEA,gBACE,cACF,CACF","sources":["index.css","App.css"],"sourcesContent":["* {\r\n margin: 0;\r\n padding: 0;\r\n box-sizing: border-box;\r\n}\r\n\r\nbody {\r\n margin: 0;\r\n font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',\r\n 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',\r\n sans-serif;\r\n -webkit-font-smoothing: antialiased;\r\n -moz-osx-font-smoothing: grayscale;\r\n background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);\r\n min-height: 100vh;\r\n}\r\n\r\ncode {\r\n font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',\r\n monospace;\r\n}\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n",".App {\n display: flex;\n justify-content: center;\n align-items: center;\n min-height: 100vh;\n padding: 20px;\n direction: rtl;\n font-family: 'Segoe UI', 
'Arial', 'Tahoma', 'Cairo', 'Noto Sans Arabic', sans-serif;\n}\n\n.chat-container {\n width: 100%;\n max-width: 900px;\n height: 90vh;\n background: white;\n border-radius: 20px;\n box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);\n display: flex;\n flex-direction: column;\n overflow: hidden;\n}\n\n.chat-header {\n background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);\n color: white;\n padding: 25px;\n text-align: center;\n box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);\n}\n\n.chat-header h1 {\n margin: 0 0 10px 0;\n font-size: 28px;\n font-weight: 600;\n}\n\n.chat-header p {\n margin: 0 0 15px 0;\n opacity: 0.9;\n font-size: 14px;\n}\n\n.messages-container {\n flex: 1;\n overflow-y: auto;\n padding: 20px;\n background: #f8f9fa;\n}\n\n.welcome-message {\n text-align: center;\n padding: 60px 20px;\n color: #666;\n}\n\n.welcome-message h2 {\n color: #333;\n margin-bottom: 15px;\n font-size: 24px;\n}\n\n.welcome-message p {\n margin: 10px 0;\n font-size: 16px;\n}\n\n.welcome-message .hint {\n font-size: 14px;\n color: #999;\n font-style: italic;\n}\n\n.message {\n margin-bottom: 20px;\n display: flex;\n animation: fadeIn 0.3s ease;\n}\n\n@keyframes fadeIn {\n from {\n opacity: 0;\n transform: translateY(10px);\n }\n to {\n opacity: 1;\n transform: translateY(0);\n }\n}\n\n.message.user {\n justify-content: flex-start;\n}\n\n.message.assistant {\n justify-content: flex-start;\n}\n\n.message-content {\n max-width: 70%;\n padding: 15px 20px;\n border-radius: 18px;\n box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);\n}\n\n.message.user .message-content {\n background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);\n color: white;\n border-bottom-left-radius: 4px;\n}\n\n.message.assistant .message-content {\n background: white;\n color: #333;\n border-bottom-right-radius: 4px;\n}\n\n.message-header {\n font-size: 12px;\n font-weight: 600;\n margin-bottom: 8px;\n opacity: 0.8;\n}\n\n.message-text {\n font-size: 15px;\n line-height: 1.8;\n word-wrap: break-word;\n text-align: right;\n direction: rtl;\n}\n\n.message-text.error {\n color: #d32f2f;\n}\n\n.message-text p {\n margin: 0 0 10px 0;\n}\n\n.message-text strong {\n font-weight: 700;\n}\n\n.message-text ul,\n.message-text ol {\n padding-right: 20px;\n margin: 10px 0;\n}\n\n.message-text li {\n margin-bottom: 6px;\n}\n\n.typing-indicator {\n display: inline-block;\n}\n\n.typing-indicator::after {\n content: '...';\n animation: dots 1.5s steps(4, end) infinite;\n}\n\n@keyframes dots {\n 0%, 20% {\n content: '.';\n }\n 40% {\n content: '..';\n }\n 60%, 100% {\n content: '...';\n }\n}\n\n.sources {\n margin-top: 12px;\n padding-top: 12px;\n border-top: 1px solid rgba(0, 0, 0, 0.1);\n font-size: 12px;\n}\n\n.message.user .sources {\n border-top-color: rgba(255, 255, 255, 0.3);\n}\n\n.sources strong {\n display: block;\n margin-bottom: 6px;\n opacity: 0.8;\n}\n\n.sources ul {\n list-style: none;\n padding: 0;\n margin: 0;\n}\n\n.sources li {\n padding: 4px 0;\n opacity: 0.9;\n display: flex;\n flex-direction: column;\n gap: 6px;\n}\n\n.source-name {\n font-weight: 600;\n color: #333;\n}\n\n.source-actions {\n display: flex;\n gap: 10px;\n}\n\n.source-link {\n background: none;\n border: none;\n color: #4c6ef5;\n cursor: pointer;\n text-decoration: underline;\n font-size: 14px;\n font-weight: 600;\n padding: 0;\n direction: rtl;\n text-align: right;\n}\n\n.source-link:hover {\n color: #2a48c5;\n}\n\n.source-link.download {\n color: #2f9e44;\n}\n\n.source-link.download:hover {\n color: #1b6d2f;\n}\n\n.preview-panel {\n margin: 20px;\n padding: 15px;\n 
border: 1px solid #e0e0e0;\n border-radius: 12px;\n background: #fdfdfd;\n max-height: 300px;\n overflow: auto;\n direction: rtl;\n}\n\n.preview-header {\n display: flex;\n justify-content: space-between;\n align-items: center;\n margin-bottom: 10px;\n}\n\n.preview-header h3 {\n margin: 0;\n}\n\n.preview-filename {\n font-size: 14px;\n color: #555;\n}\n\n.close-preview {\n background: none;\n border: none;\n font-size: 22px;\n cursor: pointer;\n color: #ff4d4f;\n font-weight: bold;\n}\n\n.close-preview:hover {\n color: #d9363e;\n}\n\n.preview-content pre {\n white-space: pre-wrap;\n direction: rtl;\n text-align: right;\n margin: 0;\n font-size: 14px;\n background: #f8f9fa;\n padding: 10px;\n border-radius: 8px;\n}\n\n.preview-content {\n min-height: 200px;\n display: flex;\n justify-content: center;\n align-items: center;\n}\n\n.pdf-frame {\n width: 100%;\n height: 400px;\n border: none;\n border-radius: 8px;\n background: #fff;\n box-shadow: inset 0 0 10px rgba(0, 0, 0, 0.05);\n}\n\n.input-container {\n display: flex;\n padding: 20px;\n background: white;\n border-top: 1px solid #e0e0e0;\n gap: 10px;\n}\n\n.message-input {\n flex: 1;\n padding: 15px 20px;\n border: 2px solid #e0e0e0;\n border-radius: 25px;\n font-size: 15px;\n outline: none;\n transition: border-color 0.3s ease;\n text-align: right;\n direction: rtl;\n}\n\n.message-input:focus {\n border-color: #667eea;\n}\n\n.message-input:disabled {\n background: #f5f5f5;\n cursor: not-allowed;\n}\n\n.send-button {\n padding: 15px 30px;\n background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);\n color: white;\n border: none;\n border-radius: 25px;\n font-size: 15px;\n font-weight: 600;\n cursor: pointer;\n transition: transform 0.2s ease, box-shadow 0.2s ease;\n}\n\n.send-button:hover:not(:disabled) {\n transform: translateY(-2px);\n box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);\n}\n\n.send-button:disabled {\n opacity: 0.6;\n cursor: not-allowed;\n transform: none;\n}\n\n/* Scrollbar styling */\n.messages-container::-webkit-scrollbar {\n width: 8px;\n}\n\n.messages-container::-webkit-scrollbar-track {\n background: #f1f1f1;\n border-radius: 10px;\n}\n\n.messages-container::-webkit-scrollbar-thumb {\n background: #888;\n border-radius: 10px;\n}\n\n.messages-container::-webkit-scrollbar-thumb:hover {\n background: #555;\n}\n\n/* Responsive design */\n@media (max-width: 768px) {\n .chat-container {\n height: 100vh;\n border-radius: 0;\n }\n\n .message-content {\n max-width: 85%;\n }\n\n .chat-header h1 {\n font-size: 24px;\n }\n}\n\n\n"],"names":[],"sourceRoot":""}
 
+ {"version":3,"file":"static/css/main.3a0c885e.css","mappings":"AAAA,EAGE,qBAAsB,CADtB,SAEF,CAEA,OALE,QAcF,CATA,KAKE,kCAAmC,CACnC,iCAAkC,CAClC,kDAA6D,CAL7D,mIAEY,CAIZ,gBACF,CAEA,KACE,uEAEF,CCpBA,KAGE,kBAAmB,CAGnB,aAAc,CALd,YAAa,CAMb,mEAAmF,CALnF,sBAAuB,CAEvB,gBAAiB,CACjB,YAGF,CAEA,gBAIE,eAAiB,CACjB,kBAAmB,CACnB,gCAA0C,CAC1C,YAAa,CACb,qBAAsB,CALtB,WAAY,CADZ,eAAgB,CAOhB,eAAgB,CARhB,UASF,CAEA,aACE,kDAA6D,CAI7D,+BAAyC,CAHzC,UAAY,CACZ,YAAa,CACb,iBAEF,CAEA,gBAEE,cAAe,CACf,eAAgB,CAFhB,eAGF,CAEA,eAGE,cAAe,CAFf,eAAkB,CAClB,UAEF,CAEA,oBAIE,kBAAmB,CAHnB,QAAO,CACP,eAAgB,CAChB,YAEF,CAEA,iBAGE,UAAW,CADX,iBAAkB,CADlB,iBAGF,CAEA,oBACE,UAAW,CAEX,cAAe,CADf,kBAEF,CAEA,mBAEE,cAAe,CADf,aAEF,CAEA,uBAEE,UAAW,CADX,cAAe,CAEf,iBACF,CAEA,SAGE,yBAA2B,CAD3B,YAAa,CADb,kBAGF,CAEA,kBACE,GACE,SAAU,CACV,0BACF,CACA,GACE,SAAU,CACV,uBACF,CACF,CAMA,iCACE,0BACF,CAEA,iBAGE,kBAAmB,CACnB,8BAAwC,CAHxC,aAAc,CACd,iBAGF,CAEA,+BACE,kDAA6D,CAE7D,6BAA8B,CAD9B,UAEF,CAEA,oCACE,eAAiB,CAEjB,8BAA+B,CAD/B,UAEF,CAEA,gBACE,cAAe,CACf,eAAgB,CAChB,iBAAkB,CAClB,UACF,CAEA,cAGE,oBAAqB,CAErB,aAAc,CAJd,cAAe,CACf,eAAgB,CAEhB,gBAEF,CAEA,oBACE,aACF,CAEA,gBACE,eACF,CAEA,qBACE,eACF,CAEA,kCAGE,aAAc,CADd,kBAEF,CAEA,iBACE,iBACF,CAEA,kBACE,oBACF,CAEA,wBAEE,qCAA2C,CAD3C,aAEF,CAEA,gBACE,OACE,WACF,CACA,IACE,YACF,CACA,OACE,aACF,CACF,CAEA,SAGE,8BAAwC,CACxC,cAAe,CAHf,eAAgB,CAChB,gBAGF,CAEA,uBACE,0BACF,CAEA,gBACE,aAAc,CACd,iBAAkB,CAClB,UACF,CAEA,YACE,eAAgB,CAEhB,QAAS,CADT,SAEF,CAEA,YAGE,YAAa,CACb,qBAAsB,CACtB,OAAQ,CAHR,UAAY,CADZ,aAKF,CAEA,aAEE,UAAW,CADX,eAEF,CAEA,gBACE,YAAa,CACb,QACF,CAEA,aACE,eAAgB,CAChB,WAAY,CACZ,aAAc,CACd,cAAe,CAKf,aAAc,CAHd,cAAe,CACf,eAAgB,CAChB,SAAU,CAEV,gBAAiB,CALjB,yBAMF,CAEA,mBACE,aACF,CAEA,sBACE,aACF,CAEA,4BACE,aACF,CAEA,eAKE,kBAAmB,CAFnB,wBAAyB,CACzB,kBAAmB,CAInB,aAAc,CAPd,WAAY,CAKZ,gBAAiB,CACjB,aAAc,CALd,YAOF,CAEA,gBAGE,kBAAmB,CAFnB,YAAa,CACb,6BAA8B,CAE9B,kBACF,CAEA,mBACE,QACF,CAEA,kBAEE,UAAW,CADX,cAEF,CAEA,eACE,eAAgB,CAChB,WAAY,CAGZ,aAAc,CADd,cAAe,CADf,cAAe,CAGf,eACF,CAEA,qBACE,aACF,CAEA,qBAME,kBAAmB,CAEnB,iBAAkB,CANlB,aAAc,CAGd,cAAe,CADf,QAAS,CAGT,YAAa,CAJb,gBAAiB,CAFjB,oBAQF,CAEA,iBAIE,kBAAmB,CAFnB,YAAa,CACb,sBAAuB,CAFvB,gBAIF,CAEA,WAKE,eAAgB,CAFhB,WAAY,CACZ,iBAAkB,CAElB,mCAA8C,CAJ9C,YAAa,CADb,UAMF,CAEA,iBAGE,eAAiB,CACjB,4BAA6B,CAH7B,YAAa,CAIb,QAAS,CAHT,YAIF,CAEA,eAGE,wBAAyB,CACzB,kBAAmB,CAKnB,aAAc,CARd,QAAO,CAIP,cAAe,CACf,YAAa,CAJb,iBAAkB,CAMlB,gBAAiB,CADjB,gCAGF,CAEA,qBACE,oBACF,CAEA,wBACE,kBAAmB,CACnB,kBACF,CAEA,aAEE,kDAA6D,CAE7D,WAAY,CACZ,kBAAmB,CAFnB,UAAY,CAKZ,cAAe,CAFf,cAAe,CACf,eAAgB,CANhB,iBAAkB,CAQlB,iDACF,CAEA,kCAEE,+BAA+C,CAD/C,0BAEF,CAEA,sBAEE,kBAAmB,CADnB,UAAY,CAEZ,cACF,CAGA,uCACE,SACF,CAEA,6CACE,kBAAmB,CACnB,kBACF,CAEA,6CACE,eAAgB,CAChB,kBACF,CAEA,mDACE,eACF,CAGA,yBACE,gBAEE,eAAgB,CADhB,YAEF,CAEA,iBACE,aACF,CAEA,gBACE,cACF,CACF","sources":["index.css","App.css"],"sourcesContent":["* {\n margin: 0;\n padding: 0;\n box-sizing: border-box;\n}\n\nbody {\n margin: 0;\n font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',\n 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',\n sans-serif;\n -webkit-font-smoothing: antialiased;\n -moz-osx-font-smoothing: grayscale;\n background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);\n min-height: 100vh;\n}\n\ncode {\n font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',\n monospace;\n}\n\n\n\n\n\n\n\n\n\n\n",".App {\n display: flex;\n justify-content: center;\n align-items: center;\n min-height: 100vh;\n padding: 20px;\n direction: rtl;\n font-family: 'Segoe UI', 'Arial', 'Tahoma', 'Cairo', 'Noto Sans Arabic', 
sans-serif;\n}\n\n.chat-container {\n width: 100%;\n max-width: 900px;\n height: 90vh;\n background: white;\n border-radius: 20px;\n box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);\n display: flex;\n flex-direction: column;\n overflow: hidden;\n}\n\n.chat-header {\n background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);\n color: white;\n padding: 25px;\n text-align: center;\n box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);\n}\n\n.chat-header h1 {\n margin: 0 0 10px 0;\n font-size: 28px;\n font-weight: 600;\n}\n\n.chat-header p {\n margin: 0 0 15px 0;\n opacity: 0.9;\n font-size: 14px;\n}\n\n.messages-container {\n flex: 1;\n overflow-y: auto;\n padding: 20px;\n background: #f8f9fa;\n}\n\n.welcome-message {\n text-align: center;\n padding: 60px 20px;\n color: #666;\n}\n\n.welcome-message h2 {\n color: #333;\n margin-bottom: 15px;\n font-size: 24px;\n}\n\n.welcome-message p {\n margin: 10px 0;\n font-size: 16px;\n}\n\n.welcome-message .hint {\n font-size: 14px;\n color: #999;\n font-style: italic;\n}\n\n.message {\n margin-bottom: 20px;\n display: flex;\n animation: fadeIn 0.3s ease;\n}\n\n@keyframes fadeIn {\n from {\n opacity: 0;\n transform: translateY(10px);\n }\n to {\n opacity: 1;\n transform: translateY(0);\n }\n}\n\n.message.user {\n justify-content: flex-start;\n}\n\n.message.assistant {\n justify-content: flex-start;\n}\n\n.message-content {\n max-width: 70%;\n padding: 15px 20px;\n border-radius: 18px;\n box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);\n}\n\n.message.user .message-content {\n background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);\n color: white;\n border-bottom-left-radius: 4px;\n}\n\n.message.assistant .message-content {\n background: white;\n color: #333;\n border-bottom-right-radius: 4px;\n}\n\n.message-header {\n font-size: 12px;\n font-weight: 600;\n margin-bottom: 8px;\n opacity: 0.8;\n}\n\n.message-text {\n font-size: 15px;\n line-height: 1.8;\n word-wrap: break-word;\n text-align: right;\n direction: rtl;\n}\n\n.message-text.error {\n color: #d32f2f;\n}\n\n.message-text p {\n margin: 0 0 10px 0;\n}\n\n.message-text strong {\n font-weight: 700;\n}\n\n.message-text ul,\n.message-text ol {\n padding-right: 20px;\n margin: 10px 0;\n}\n\n.message-text li {\n margin-bottom: 6px;\n}\n\n.typing-indicator {\n display: inline-block;\n}\n\n.typing-indicator::after {\n content: '...';\n animation: dots 1.5s steps(4, end) infinite;\n}\n\n@keyframes dots {\n 0%, 20% {\n content: '.';\n }\n 40% {\n content: '..';\n }\n 60%, 100% {\n content: '...';\n }\n}\n\n.sources {\n margin-top: 12px;\n padding-top: 12px;\n border-top: 1px solid rgba(0, 0, 0, 0.1);\n font-size: 12px;\n}\n\n.message.user .sources {\n border-top-color: rgba(255, 255, 255, 0.3);\n}\n\n.sources strong {\n display: block;\n margin-bottom: 6px;\n opacity: 0.8;\n}\n\n.sources ul {\n list-style: none;\n padding: 0;\n margin: 0;\n}\n\n.sources li {\n padding: 4px 0;\n opacity: 0.9;\n display: flex;\n flex-direction: column;\n gap: 6px;\n}\n\n.source-name {\n font-weight: 600;\n color: #333;\n}\n\n.source-actions {\n display: flex;\n gap: 10px;\n}\n\n.source-link {\n background: none;\n border: none;\n color: #4c6ef5;\n cursor: pointer;\n text-decoration: underline;\n font-size: 14px;\n font-weight: 600;\n padding: 0;\n direction: rtl;\n text-align: right;\n}\n\n.source-link:hover {\n color: #2a48c5;\n}\n\n.source-link.download {\n color: #2f9e44;\n}\n\n.source-link.download:hover {\n color: #1b6d2f;\n}\n\n.preview-panel {\n margin: 20px;\n padding: 15px;\n border: 1px solid #e0e0e0;\n border-radius: 12px;\n 
background: #fdfdfd;\n max-height: 300px;\n overflow: auto;\n direction: rtl;\n}\n\n.preview-header {\n display: flex;\n justify-content: space-between;\n align-items: center;\n margin-bottom: 10px;\n}\n\n.preview-header h3 {\n margin: 0;\n}\n\n.preview-filename {\n font-size: 14px;\n color: #555;\n}\n\n.close-preview {\n background: none;\n border: none;\n font-size: 22px;\n cursor: pointer;\n color: #ff4d4f;\n font-weight: bold;\n}\n\n.close-preview:hover {\n color: #d9363e;\n}\n\n.preview-content pre {\n white-space: pre-wrap;\n direction: rtl;\n text-align: right;\n margin: 0;\n font-size: 14px;\n background: #f8f9fa;\n padding: 10px;\n border-radius: 8px;\n}\n\n.preview-content {\n min-height: 200px;\n display: flex;\n justify-content: center;\n align-items: center;\n}\n\n.pdf-frame {\n width: 100%;\n height: 400px;\n border: none;\n border-radius: 8px;\n background: #fff;\n box-shadow: inset 0 0 10px rgba(0, 0, 0, 0.05);\n}\n\n.input-container {\n display: flex;\n padding: 20px;\n background: white;\n border-top: 1px solid #e0e0e0;\n gap: 10px;\n}\n\n.message-input {\n flex: 1;\n padding: 15px 20px;\n border: 2px solid #e0e0e0;\n border-radius: 25px;\n font-size: 15px;\n outline: none;\n transition: border-color 0.3s ease;\n text-align: right;\n direction: rtl;\n}\n\n.message-input:focus {\n border-color: #667eea;\n}\n\n.message-input:disabled {\n background: #f5f5f5;\n cursor: not-allowed;\n}\n\n.send-button {\n padding: 15px 30px;\n background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);\n color: white;\n border: none;\n border-radius: 25px;\n font-size: 15px;\n font-weight: 600;\n cursor: pointer;\n transition: transform 0.2s ease, box-shadow 0.2s ease;\n}\n\n.send-button:hover:not(:disabled) {\n transform: translateY(-2px);\n box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);\n}\n\n.send-button:disabled {\n opacity: 0.6;\n cursor: not-allowed;\n transform: none;\n}\n\n/* Scrollbar styling */\n.messages-container::-webkit-scrollbar {\n width: 8px;\n}\n\n.messages-container::-webkit-scrollbar-track {\n background: #f1f1f1;\n border-radius: 10px;\n}\n\n.messages-container::-webkit-scrollbar-thumb {\n background: #888;\n border-radius: 10px;\n}\n\n.messages-container::-webkit-scrollbar-thumb:hover {\n background: #555;\n}\n\n/* Responsive design */\n@media (max-width: 768px) {\n .chat-container {\n height: 100vh;\n border-radius: 0;\n }\n\n .message-content {\n max-width: 85%;\n }\n\n .chat-header h1 {\n font-size: 24px;\n }\n}\n\n\n"],"names":[],"sourceRoot":""}
frontend/build/static/js/{main.882def61.js → main.2713a5e5.js} RENAMED
The diff for this file is too large to render.
 
frontend/build/static/js/{main.882def61.js.LICENSE.txt → main.2713a5e5.js.LICENSE.txt} RENAMED
File without changes
frontend/build/static/js/{main.882def61.js.map → main.2713a5e5.js.map} RENAMED
The diff for this file is too large to render.
 
test_nebius_embeddings.py DELETED
@@ -1,292 +0,0 @@
- #!/usr/bin/env python3
- """
- Test script for Nebius Embeddings API via HuggingFace Router
- Tests direct API calls to verify authentication and functionality
- """
-
- import os
- import sys
- import requests
- from pathlib import Path
- from dotenv import load_dotenv
-
- try:
-     from huggingface_hub import InferenceClient
-     HF_HUB_AVAILABLE = True
- except ImportError:
-     HF_HUB_AVAILABLE = False
-     print("WARNING: huggingface_hub not available. InferenceClient test will be skipped.")
-
- # Load .env from project root
- project_root = Path(__file__).resolve().parent
- load_dotenv(project_root / ".env")
-
- API_URL = "https://router.huggingface.co/nebius/v1/embeddings"
- MODEL = os.getenv("HF_EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-8B")
-
- def get_headers():
-     """Get authorization headers"""
-     hf_token = os.getenv("HF_TOKEN")
-     if not hf_token:
-         print("ERROR: HF_TOKEN environment variable is not set!")
-         print("Please set HF_TOKEN in your .env file or environment variables.")
-         sys.exit(1)
-
-     return {
-         "Authorization": f"Bearer {hf_token}",
-         "Content-Type": "application/json"
-     }
-
- def query(payload):
-     """Make API request to Nebius embeddings endpoint"""
-     headers = get_headers()
-     try:
-         response = requests.post(API_URL, headers=headers, json=payload, timeout=60.0)
-         return response
-     except requests.exceptions.RequestException as e:
-         print(f"ERROR: Request failed: {e}")
-         return None
-
- def test_single_text():
-     """Test embedding a single text"""
-     print("\n" + "="*60)
-     print("TEST 1: Single Text Embedding")
-     print("="*60)
-
-     test_text = "ما هي المادة المتعلقة بالنفقة في نظام الأحوال الشخصية؟"
-     print(f"Input text: {test_text}")
-     print(f"Model: {MODEL}")
-
-     payload = {
-         "model": MODEL,
-         "input": test_text
-     }
-
-     response = query(payload)
-     if response is None:
-         return False
-
-     print(f"\nStatus Code: {response.status_code}")
-
-     if response.status_code == 200:
-         data = response.json()
-         print(f"Response keys: {list(data.keys())}")
-
-         if "data" in data and len(data["data"]) > 0:
-             embedding = data["data"][0]["embedding"]
-             print(f"Embedding dimensions: {len(embedding)}")
-             print(f"First 10 values: {embedding[:10]}")
-             print(f"Last 10 values: {embedding[-10:]}")
-             print("✓ Single text embedding successful!")
-             return True
-         else:
-             print(f"Unexpected response format: {data}")
-             return False
-     else:
-         print(f"ERROR: Request failed with status {response.status_code}")
-         print(f"Response: {response.text}")
-         if response.status_code == 401:
-             print("\nAuthentication failed. Please check:")
-             print("1. HF_TOKEN is correct and valid")
-             print("2. Token has proper permissions for Nebius provider")
-             print("3. Token is not expired")
-         return False
-
- def test_batch_texts():
-     """Test embedding multiple texts"""
-     print("\n" + "="*60)
-     print("TEST 2: Batch Text Embedding")
-     print("="*60)
-
-     test_texts = [
-         "ما هي المادة المتعلقة بالنفقة؟",
-         "ما هي شروط الزواج؟",
-         "كيف يتم الطلاق؟"
-     ]
-     print(f"Input texts ({len(test_texts)}):")
-     for i, text in enumerate(test_texts, 1):
-         print(f" {i}. {text}")
-     print(f"Model: {MODEL}")
-
-     payload = {
-         "model": MODEL,
-         "input": test_texts
-     }
-
-     response = query(payload)
-     if response is None:
-         return False
-
-     print(f"\nStatus Code: {response.status_code}")
-
-     if response.status_code == 200:
-         data = response.json()
-         print(f"Response keys: {list(data.keys())}")
-
-         if "data" in data:
-             print(f"Number of embeddings returned: {len(data['data'])}")
-             for i, item in enumerate(data["data"]):
-                 embedding = item["embedding"]
-                 print(f" Embedding {i+1}: {len(embedding)} dimensions")
-             print("✓ Batch text embedding successful!")
-             return True
-         else:
-             print(f"Unexpected response format: {data}")
-             return False
-     else:
-         print(f"ERROR: Request failed with status {response.status_code}")
-         print(f"Response: {response.text}")
-         return False
-
- def test_huggingface_hub_client():
-     """Test using HuggingFace Hub InferenceClient (same approach as HuggingFaceEmbeddingsWrapper)"""
-     print("\n" + "="*60)
-     print("TEST 3: HuggingFace Hub InferenceClient")
-     print("="*60)
-
-     if not HF_HUB_AVAILABLE:
-         print("SKIPPED: huggingface_hub package not installed")
-         return None
-
-     hf_token = os.getenv("HF_TOKEN")
-     if not hf_token:
-         print("ERROR: HF_TOKEN not set")
-         return False
-
-     test_text = "ما هي المادة المتعلقة بالنفقة في نظام الأحوال الشخصية؟"
-     print(f"Input text: {test_text}")
-     print(f"Model: {MODEL}")
-     print(f"Provider: nebius")
-
-     try:
-         # Initialize client (same as HuggingFaceEmbeddingsWrapper)
-         client = InferenceClient(
-             provider="nebius",
-             api_key=hf_token
-         )
-         print("✓ InferenceClient initialized successfully")
-
-         # Test feature_extraction (same as HuggingFaceEmbeddingsWrapper)
-         print("Calling client.feature_extraction()...")
-         result = client.feature_extraction(
-             test_text,
-             model=MODEL
-         )
-
-         # Check result format - InferenceClient returns numpy.ndarray
-         import numpy as np
-
-         # Convert numpy array to list if needed
-         if isinstance(result, np.ndarray):
-             # Handle 2D array (batch) or 1D array (single)
-             if result.ndim == 2:
-                 # Batch result - convert to list of lists
-                 result = result.tolist()
-             else:
-                 # Single result - convert to list
-                 result = result.tolist()
-
-         if isinstance(result, list):
-             # Handle nested list (batch) or flat list (single)
-             if len(result) > 0 and isinstance(result[0], list):
-                 # Batch result
-                 print(f"✓ Feature extraction successful! (batch format)")
-                 print(f"Number of embeddings: {len(result)}")
-                 for i, emb in enumerate(result):
-                     print(f" Embedding {i+1}: {len(emb)} dimensions")
-             else:
-                 # Single result
-                 print(f"✓ Feature extraction successful!")
-                 print(f"Embedding dimensions: {len(result)}")
-                 print(f"First 10 values: {result[:10]}")
-                 print(f"Last 10 values: {result[-10:]}")
-
-             # Test batch processing
-             print("\nTesting batch processing...")
-             test_texts = [
-                 "ما هي المادة المتعلقة بالنفقة؟",
-                 "ما هي شروط الزواج؟"
-             ]
-             results = []
-             for text in test_texts:
-                 embedding = client.feature_extraction(text, model=MODEL)
-                 # Convert numpy array to list if needed
-                 if isinstance(embedding, np.ndarray):
-                     if embedding.ndim == 2:
-                         embedding = embedding.tolist()[0]  # Extract first row if 2D
-                     else:
-                         embedding = embedding.tolist()
-                 results.append(embedding)
-             print(f"✓ Batch processing successful! Processed {len(results)} texts")
-             print(f" Embedding 1: {len(results[0])} dimensions")
-             print(f" Embedding 2: {len(results[1])} dimensions")
-
-             return True
-         else:
-             print(f"Unexpected result format: {type(result)}")
-             print(f"Result: {result}")
-             return False
-
-     except Exception as e:
-         print(f"ERROR: InferenceClient test failed")
-         print(f"Error type: {type(e).__name__}")
-         print(f"Error message: {str(e)}")
-
-         # Provide helpful error messages
-         if "401" in str(e) or "Unauthorized" in str(e):
-             print("\nAuthentication failed. Please check:")
-             print("1. HF_TOKEN is correct and valid")
-             print("2. Token has proper permissions for Nebius provider")
-             print("3. Token is not expired")
-         elif "404" in str(e) or "Not Found" in str(e):
-             print("\nModel or endpoint not found. Please check:")
-             print(f"1. Model '{MODEL}' is available on Nebius")
-             print("2. Provider 'nebius' is correctly configured")
-
-         return False
-
- def main():
-     """Run all tests"""
-     print("Nebius Embeddings API Test")
-     print("="*60)
-     print(f"API URL: {API_URL}")
-     print(f"Model: {MODEL}")
-     print(f"HF_TOKEN: {'*' * 20 if os.getenv('HF_TOKEN') else 'NOT SET'}")
-
-     # Check if token is set
-     if not os.getenv("HF_TOKEN"):
-         print("\nERROR: HF_TOKEN not found!")
-         print("Please set it in your .env file:")
-         print(" HF_TOKEN=your_token_here")
-         sys.exit(1)
-
-     # Run tests
-     results = []
-     results.append(("Single Text (Direct API)", test_single_text()))
-     results.append(("Batch Texts (Direct API)", test_batch_texts()))
-
-     # Test HuggingFace Hub InferenceClient if available
-     if HF_HUB_AVAILABLE:
-         hf_result = test_huggingface_hub_client()
-         if hf_result is not None:
-             results.append(("HuggingFace Hub InferenceClient", hf_result))
-
-     # Summary
-     print("\n" + "="*60)
-     print("TEST SUMMARY")
-     print("="*60)
-     for test_name, success in results:
-         status = "✓ PASSED" if success else "✗ FAILED"
-         print(f"{test_name}: {status}")
-
-     all_passed = all(result[1] for result in results)
-     if all_passed:
-         print("\n✓ All tests passed! API is working correctly.")
-         sys.exit(0)
-     else:
-         print("\n✗ Some tests failed. Check the errors above.")
-         sys.exit(1)
-
- if __name__ == "__main__":
-     main()