Atlas / src /utils.py
WanIrfan's picture
Upload 5 files
8ada0c0 verified
raw
history blame
4.05 kB
import logging
import json
import re
from src.doc_qa import AgenticQA
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_google_genai import ChatGoogleGenerativeAI
logger = logging.getLogger(__name__)
def load_rag_system(collection_name,domain):
"""
Loads an existing RAG system by connecting to the persistent vector store.
This is fast and does not re-process any documents.
"""
logger.info(f"Loading RAG system for collection: '{collection_name}' (Domain: {domain})...")
try:
agent = AgenticQA(
config={
"retriever": {
"collection_name": collection_name,
"persist_directory": "chroma_db"
},
"domain": domain
}
)
# Check if the agent was actually created
if not agent.agent_executor:
raise Exception("Agent Executor was not created. Check logs for errors.")
logger.info(f"βœ… System for '{collection_name}' loaded successfully.")
return agent
except Exception as e:
logger.error(f"❌ Failed to load RAG system for '{collection_name}': {e}")
logger.warning("Did you run the ingest.py script first?")
return None
def markdown_bold_to_html(text: str):
"""Converts markdown bold syntax to HTML <strong> tags."""
return re.sub(r"\*\*(.*?)\*\*", r"<strong>\1</strong>", text)
def standardize_query(query):
if not query:
return None
return query.strip().lower()
def get_standalone_question(input_question, chat_history,llm):
"""Uses LLM to create a standalone question from the chat history."""
if not chat_history:
return input_question
contextualize_q_prompt = ChatPromptTemplate.from_messages([
("system", "Given a chat history and the latest user question which might reference context in the chat history, "
"formulate a standalone question which can be understood without the chat history. "
"IMPORTANT: DO NOT PROVIDE ANY ANSWERS. ONLY REPHRASE THE QUESTION IF NEEDED. "
"If the question is already clear and standalone, return it exactly as is. "
"Output ONLY the reformulated question, nothing else."),
MessagesPlaceholder("chat_history"),
("human", "{input}"),
])
history_aware_retriever_chain = contextualize_q_prompt | llm
response = history_aware_retriever_chain.invoke(
{"chat_history": chat_history, "input": input_question}
)
return response.content
def parse_agent_response(response_dict):
"""A robust helper to parse the dictionary from an AgenticQA agent."""
answer = markdown_bold_to_html(response_dict.get('answer', 'Error: No answer found.'))
thoughts = response_dict.get('thoughts', 'No thought process available.')
validation = response_dict.get('validation', (False, 'Validation failed.'))
source = response_dict.get('source', 'Unknown')
if validation and validation[1] == "Validation skipped for insurance domain.":
validation = (True, "Factual Answer")
return answer, thoughts,validation, source
def extract_json_from_string(text: str) -> dict:
"""
Finds and parses the first valid JSON object within a string.
Returns a dictionary, or an empty dict if no JSON is found.
"""
# This regex finds the first occurrence of a string starting with { and ending with }
json_match = re.search(r'\{.*\}', text, re.DOTALL)
if json_match:
json_string = json_match.group(0)
try:
return json.loads(json_string)
except json.JSONDecodeError:
# The extracted string is not valid JSON
return {"error": "Failed to parse extracted JSON", "raw_text": json_string}
else:
# No JSON object found in the string
return {"error": "No JSON object found in the string", "raw_text": text}