|
|
import logging
|
|
|
import json
|
|
|
import re
|
|
|
from src.doc_qa import AgenticQA
|
|
|
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
|
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def load_rag_system(collection_name, domain):
    """
    Loads an existing RAG system by connecting to the persistent vector store.

    This is fast and does not re-process any documents.

    Args:
        collection_name: Name of the persisted Chroma collection to attach to.
        domain: Domain label forwarded to the agent config (e.g. "insurance").

    Returns:
        A ready AgenticQA instance, or None if loading failed (the error is
        logged rather than raised so callers can degrade gracefully).
    """
    logger.info(f"Loading RAG system for collection: '{collection_name}' (Domain: {domain})...")
    try:
        agent = AgenticQA(
            config={
                "retriever": {
                    "collection_name": collection_name,
                    # Must match the directory used by the ingestion script.
                    "persist_directory": "chroma_db"
                },
                "domain": domain
            }
        )

        # AgenticQA swallows construction errors internally; surface them here.
        if not agent.agent_executor:
            raise Exception("Agent Executor was not created. Check logs for errors.")

        # Fixed: this message was previously a broken multi-line f-string with a
        # mojibake'd status glyph ("β"); keep it on a single line.
        logger.info(f"✅ System for '{collection_name}' loaded successfully.")
        return agent
    except Exception as e:
        logger.error(f"❌ Failed to load RAG system for '{collection_name}': {e}")
        logger.warning("Did you run the ingest.py script first?")
        return None
|
|
|
|
|
|
def markdown_bold_to_html(text: str):
    """Replace each markdown **bold** span in *text* with HTML <strong> tags."""
    bold_span = re.compile(r"\*\*(.*?)\*\*")
    return bold_span.sub(r"<strong>\1</strong>", text)
|
|
|
|
|
|
def standardize_query(query):
    """Normalize a user query: trim whitespace and lowercase it.

    Falsy input (None or empty string) maps to None.
    """
    if query:
        return query.strip().lower()
    return None
|
|
|
|
|
|
def get_standalone_question(input_question, chat_history, llm):
    """Uses LLM to rewrite the latest question so it stands alone.

    With no history there is nothing to contextualize, so the question
    passes through unchanged and no LLM call is made.
    """
    if not chat_history:
        return input_question

    system_text = (
        "Given a chat history and the latest user question which might reference context in the chat history, "
        "formulate a standalone question which can be understood without the chat history. "
        "IMPORTANT: DO NOT PROVIDE ANY ANSWERS. ONLY REPHRASE THE QUESTION IF NEEDED. "
        "If the question is already clear and standalone, return it exactly as is. "
        "Output ONLY the reformulated question, nothing else."
    )
    rewrite_prompt = ChatPromptTemplate.from_messages([
        ("system", system_text),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])

    rewriter = rewrite_prompt | llm
    result = rewriter.invoke({"chat_history": chat_history, "input": input_question})
    return result.content
|
|
|
|
|
|
def parse_agent_response(response_dict):
    """A robust helper to parse the dictionary from an AgenticQA agent.

    Returns a (answer, thoughts, validation, source) tuple, substituting
    safe fallbacks for any missing keys.
    """
    raw_answer = response_dict.get('answer', 'Error: No answer found.')
    answer = markdown_bold_to_html(raw_answer)
    thoughts = response_dict.get('thoughts', 'No thought process available.')
    validation = response_dict.get('validation', (False, 'Validation failed.'))
    source = response_dict.get('source', 'Unknown')

    # The insurance domain skips validation; present that as a positive result.
    if validation and validation[1] == "Validation skipped for insurance domain.":
        validation = (True, "Factual Answer")

    return answer, thoughts, validation, source
|
|
|
|
|
|
def extract_json_from_string(text: str) -> dict:
    """
    Finds and parses the first valid JSON object within a string.

    Returns a dictionary, or an error-describing dict when no JSON object
    is present or the extracted span fails to parse.
    """
    # Greedy match from the first '{' to the last '}', spanning newlines.
    candidate = re.search(r'\{.*\}', text, re.DOTALL)
    if candidate is None:
        return {"error": "No JSON object found in the string", "raw_text": text}

    snippet = candidate.group(0)
    try:
        return json.loads(snippet)
    except json.JSONDecodeError:
        return {"error": "Failed to parse extracted JSON", "raw_text": snippet}