hiteshwar21 committed on
Commit
f8b36e1
·
verified ·
1 Parent(s): b5856c4

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +147 -198
  2. build_vector_store.py +24 -7
  3. evaluate.py +1 -1
app.py CHANGED
@@ -1,198 +1,147 @@
1
- #!/usr/bin/env python3
2
- """Main application for MANIT RAG Chatbot"""
3
- from typing import List, Dict
4
- import gradio as gr
5
- import numpy as np
6
- import faiss
7
- import pickle
8
- import os
9
- import time
10
- from sentence_transformers import SentenceTransformer
11
- from src.retrieval.semantic_retriever import SemanticRetriever
12
- from src.generation.response_generator import ResponseGenerator
13
- from config.settings import config
14
-
15
- print(f"--- SYSTEM INFO: This machine has {os.cpu_count()} CPU cores. ---")
16
-
17
-
18
- class MANITChatbot:
19
- """Main chatbot class"""
20
-
21
- def __init__(self):
22
- # Load vector store
23
- self.embeddings = np.load(os.path.join(config.VECTOR_STORE_PATH, "embeddings.npy"))
24
- self.faiss_index = faiss.read_index(os.path.join(config.VECTOR_STORE_PATH, "faiss_index.bin"))
25
-
26
- with open(os.path.join(config.VECTOR_STORE_PATH, "chunks.pkl"), "rb") as f:
27
- self.chunks = pickle.load(f)
28
-
29
- with open(os.path.join(config.VECTOR_STORE_PATH, "bm25.pkl"), "rb") as f:
30
- self.bm25 = pickle.load(f)
31
-
32
- with open(os.path.join(config.VECTOR_STORE_PATH, "relationships.pkl"), "rb") as f:
33
- self.relationships = pickle.load(f)
34
-
35
- # Initialize models
36
- self.embedding_model = SentenceTransformer(config.EMBEDDING_MODEL, device='cpu')
37
-
38
- # Initialize components
39
- self.retriever = SemanticRetriever(
40
- embedding_model=self.embedding_model,
41
- faiss_index=self.faiss_index,
42
- chunks=self.chunks,
43
- bm25_index=self.bm25,
44
- relationships=self.relationships
45
- )
46
-
47
- self.generator = ResponseGenerator()
48
- print("MANIT Chatbot initialized successfully!")
49
-
50
- def process_query(self, query: str) -> str:
51
- """Process user query through full RAG pipeline"""
52
- if not query.strip():
53
- return "Please enter a question about MANIT Bhopal."
54
-
55
- start_time = time.time()
56
-
57
- try:
58
- print(f"Processing query: {query}")
59
-
60
- # Retrieve relevant documents
61
- retrieval_start = time.time()
62
- retrieved_chunks = self.retriever.retrieve(query)
63
- retrieval_time = time.time() - retrieval_start
64
-
65
- if not retrieved_chunks:
66
- return "I couldn't find relevant information about this topic. Please try another question."
67
-
68
- print(f"Retrieved {len(retrieved_chunks)} chunks in {retrieval_time:.2f}s")
69
-
70
- # Format context
71
- context = self._format_context(retrieved_chunks)
72
-
73
- # Check if web search is needed
74
- web_context = ""
75
- if self.generator.needs_web_search(query, context):
76
- web_results = self.generator.web_search(query)
77
- if web_results:
78
- web_context = "\n\n".join(web_results)
79
-
80
- # Generate response
81
- generation_start = time.time()
82
- response = self.generator.generate_response(query, context, web_context)
83
- generation_time = time.time() - generation_start
84
-
85
- total_time = time.time() - start_time
86
- print(f"Total processing time: {total_time:.2f}s (Retrieval: {retrieval_time:.2f}s, Generation: {generation_time:.2f}s)")
87
-
88
- return response
89
-
90
- except Exception as e:
91
- print(f"Error processing query: {e}")
92
- return "I encountered an error processing your question. Please try again."
93
- def process_query_stream(self, query: str):
94
- """Processes a user query and yields the response as a stream."""
95
- if not query.strip():
96
- yield "Please enter a question about MANIT Bhopal."
97
- return
98
-
99
- try:
100
- print(f"Processing query: {query}")
101
-
102
- # 1. Retrieve documents (this part is not streamed)
103
- retrieved_chunks = self.retriever.retrieve(query)
104
- if not retrieved_chunks:
105
- yield "I couldn't find relevant information about this topic. Please try another question."
106
- return
107
-
108
- context = self._format_context(retrieved_chunks)
109
-
110
- web_context = ""
111
- if self.generator.needs_web_search(query, context):
112
- web_results = self.generator.web_search(query)
113
- if web_results:
114
- web_context = "\n\n".join(web_results)
115
-
116
- # 2. Yield the response from the streaming generator
117
- yield from self.generator.generate_response_stream(query, context, web_context)
118
-
119
- except Exception as e:
120
- print(f"Error processing query: {e}")
121
- yield "I encountered an error processing your question. Please try again."
122
- def _format_context(self, chunks: List[Dict]) -> str:
123
- """Format context for the prompt"""
124
- context_parts = []
125
-
126
- for chunk in chunks:
127
- source = chunk['metadata']['source']
128
- content = chunk['content']
129
- context_parts.append(f"Source: {source}\nContent: {content}")
130
-
131
- return "\n\n---\n\n".join(context_parts)
132
-
133
- def create_interface():
134
- """Create Gradio interface"""
135
- chatbot_instance = MANITChatbot()
136
-
137
- def chat_fn(message, history):
138
- """Function to handle chat interaction and stream the response."""
139
- # Add the user's message to the history
140
- history.append([message, ""])
141
-
142
- # Stream the response from the bot
143
- response_stream = chatbot_instance.process_query_stream(message)
144
-
145
- # Loop through the stream and update the chatbot history
146
- for chunk in response_stream:
147
- history[-1][1] += chunk
148
- yield history, "" # Update the chatbot UI and keep the textbox clear
149
-
150
- # --- Your existing Gradio UI code ---
151
- with gr.Blocks(
152
- title="MANIT Bhopal Expert Assistant",
153
- theme=gr.themes.Soft(),
154
- css=""".gradio-container {max-width: 900px; margin: 0 auto;}"""
155
- ) as demo:
156
-
157
- gr.Markdown("""
158
- # 🎓 MANIT Bhopal Expert Assistant
159
- *Powered by Advanced RAG Technology*
160
-
161
- Ask questions about programs, admissions, faculty, facilities, research, and more.
162
- """)
163
-
164
- chatbot_ui = gr.Chatbot(
165
- height=500,
166
- show_label=False,
167
- avatar_images=[None, "👨‍🎓"],
168
- show_copy_button=True
169
- )
170
-
171
- with gr.Row():
172
- msg = gr.Textbox(
173
- label="Your Question",
174
- placeholder="Ask about MANIT Bhopal...",
175
- scale=8,
176
- lines=2
177
- )
178
- submit = gr.Button("Send", scale=1, variant="primary")
179
-
180
- gr.Examples(
181
- examples=[
182
- "Who is the director of MANIT?",
183
- "Tell me about history of MANIT",
184
- "What research facilities are available at MANIT?"
185
- ],
186
- inputs=msg,
187
- label="Example Questions"
188
- )
189
-
190
- # --- Updated event handlers for streaming ---
191
- msg.submit(chat_fn, [msg, chatbot_ui], [chatbot_ui, msg])
192
- submit.click(chat_fn, [msg, chatbot_ui], [chatbot_ui, msg])
193
-
194
- return demo
195
-
196
- if __name__ == "__main__":
197
- demo = create_interface()
198
- demo.launch()
 
1
+ #!/usr/bin/env python3
2
+ """Optimized MANIT RAG Chatbot for HuggingFace Spaces"""
3
import os
import pickle
import time
import logging
import warnings
from typing import List, Dict

import gradio as gr
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Import your components
from src.retrieval.semantic_retriever import SemanticRetriever
from src.generation.response_generator import ResponseGenerator
from config.settings import config
19
+
20
class OptimizedMANITChatbot:
    """HF Space optimized chatbot.

    Loads the pre-built vector store, warms up the embedding model, and
    answers queries through the retrieval + generation pipeline, streaming
    the generated text back to the caller.
    """

    def __init__(self):
        # `initialized` gates query handling so the UI can show a status
        # message instead of crashing while startup is still in progress.
        self.initialized = False
        self.initialization_status = "Starting..."
        self.setup_components()

    def setup_components(self):
        """Initialize vector store, models, retriever and generator.

        Restores the concrete loading logic from the previous revision:
        the placeholder version never assigned ``self.embedding_model``,
        ``self.retriever`` or ``self.generator``, so warm-up raised
        AttributeError and every query would have failed.
        """
        try:
            self.initialization_status = "Loading vector store..."
            # Pre-built artifacts produced by build_vector_store.py
            self.embeddings = np.load(
                os.path.join(config.VECTOR_STORE_PATH, "embeddings.npy"))
            self.faiss_index = faiss.read_index(
                os.path.join(config.VECTOR_STORE_PATH, "faiss_index.bin"))
            with open(os.path.join(config.VECTOR_STORE_PATH, "chunks.pkl"), "rb") as f:
                self.chunks = pickle.load(f)
            with open(os.path.join(config.VECTOR_STORE_PATH, "bm25.pkl"), "rb") as f:
                self.bm25 = pickle.load(f)
            with open(os.path.join(config.VECTOR_STORE_PATH, "relationships.pkl"), "rb") as f:
                self.relationships = pickle.load(f)

            self.initialization_status = "Warming up models..."
            self.embedding_model = SentenceTransformer(config.EMBEDDING_MODEL, device='cpu')
            # Warm up embedding model so the first user query is not slow
            self.embedding_model.encode(["warmup"], show_progress_bar=False)

            self.retriever = SemanticRetriever(
                embedding_model=self.embedding_model,
                faiss_index=self.faiss_index,
                chunks=self.chunks,
                bm25_index=self.bm25,
                relationships=self.relationships
            )
            self.generator = ResponseGenerator()

            self.initialization_status = "Ready!"
            self.initialized = True
            logger.info("MANIT Chatbot initialized successfully")

        except Exception as e:
            # Broad catch is intentional at this top-level startup boundary:
            # the failure is surfaced to the UI through the status string.
            logger.error(f"Initialization failed: {e}")
            self.initialization_status = f"Error: {str(e)}"

    def _format_context(self, chunks: List[Dict]) -> str:
        """Join retrieved chunks into a single prompt-context string."""
        parts = [
            f"Source: {chunk['metadata']['source']}\nContent: {chunk['content']}"
            for chunk in chunks
        ]
        return "\n\n---\n\n".join(parts)

    def process_query_stream(self, query: str):
        """Stream response with error handling.

        Yields chunks of the generated answer; retrieval itself is not
        streamed. Yields a single status/error message on bad input.
        """
        if not self.initialized:
            yield "System is still initializing. Please wait..."
            return

        if not query.strip():
            yield "Please enter a question about MANIT Bhopal."
            return

        try:
            # Retrieve documents and build the context the generator needs.
            # The placeholder version referenced `context`/`web_context`
            # without defining them, raising NameError on every query.
            retrieved_chunks = self.retriever.retrieve(query)
            if not retrieved_chunks:
                yield ("I couldn't find relevant information about this topic. "
                       "Please try another question.")
                return

            context = self._format_context(retrieved_chunks)

            # Optional web augmentation when local context is insufficient
            web_context = ""
            if self.generator.needs_web_search(query, context):
                web_results = self.generator.web_search(query)
                if web_results:
                    web_context = "\n\n".join(web_results)

            yield from self.generator.generate_response_stream(query, context, web_context)

        except Exception as e:
            logger.error(f"Query processing error: {e}")
            yield "I encountered an error. Please try rephrasing your question."
64
+
65
def create_hf_interface():
    """Create HF Space optimized Gradio interface."""

    # Initialize chatbot eagerly so the first request is not slow
    chatbot_instance = OptimizedMANITChatbot()

    def chat_fn(message, history):
        """Stream the bot's answer into the chat history.

        This function contains `yield`, so Python compiles it as a
        generator function: the original early `return <value>` for the
        not-initialized case was silently discarded by Gradio and the
        warning never reached the UI. It must be yielded instead.
        """
        if not chatbot_instance.initialized:
            yield history + [[message, f"⚠️ {chatbot_instance.initialization_status}"]], ""
            return

        # Add user message with an empty bot slot to fill in
        history.append([message, ""])

        # Stream bot response; clear the textbox on each update
        for chunk in chatbot_instance.process_query_stream(message):
            history[-1][1] += chunk
            yield history, ""

    # Custom CSS for better mobile experience
    custom_css = """
    .gradio-container {max-width: 900px !important; margin: 0 auto !important;}
    .message.user {background-color: #e3f2fd !important;}
    .message.bot {background-color: #f5f5f5 !important;}
    """

    with gr.Blocks(
        title="MANIT Bhopal Expert Assistant",
        theme=gr.themes.Soft(),
        css=custom_css
    ) as demo:

        gr.HTML("""
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>🎓 MANIT Bhopal Expert Assistant</h1>
            <p><em>Powered by Advanced RAG Technology</em></p>
            <p>Ask questions about programs, admissions, faculty, facilities, research, and more.</p>
        </div>
        """)

        chatbot_ui = gr.Chatbot(
            height=500,
            show_label=False,
            avatar_images=[None, "🎓"],
            show_copy_button=True,
            placeholder="Hi! I'm your MANIT Bhopal assistant. Ask me anything!"
        )

        with gr.Row():
            msg = gr.Textbox(
                label="Your Question",
                placeholder="Ask about MANIT Bhopal...",
                scale=8,
                lines=2,
                max_lines=4
            )
            submit = gr.Button("Send", scale=1, variant="primary")

        gr.Examples(
            examples=[
                "Who is the director of MANIT?",
                "What are the dispensary timings?",
                "Tell me about the computer science department",
                "What research facilities are available?",
                "How do I apply for admission?"
            ],
            inputs=msg,
            label="Example Questions"
        )

        # Event handlers: both Enter and the Send button stream via chat_fn
        msg.submit(chat_fn, [msg, chatbot_ui], [chatbot_ui, msg])
        submit.click(chat_fn, [msg, chatbot_ui], [chatbot_ui, msg])

    return demo
140
+
141
+ if __name__ == "__main__":
142
+ demo = create_hf_interface()
143
+ demo.launch(
144
+ server_name="0.0.0.0",
145
+ server_port=7860,
146
+ share=False
147
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build_vector_store.py CHANGED
@@ -1,31 +1,48 @@
1
  #!/usr/bin/env python3
2
- """Build the vector store from raw text files"""
3
 
4
  import os
5
  import sys
6
- from src.preprocessing.advanced_processor import AdvancedTextProcessor
 
 
 
 
 
 
7
  from config.settings import config
8
 
9
  def main():
10
- print("Building MANIT RAG Vector Store...")
11
 
12
  # Check if raw texts exist
13
  if not os.path.exists(config.RAW_TEXT_PATH):
14
  print(f"Error: Raw text path {config.RAW_TEXT_PATH} does not exist")
15
  sys.exit(1)
16
 
17
- # Process texts and build vector store
18
- processor = AdvancedTextProcessor()
19
  chunks = processor.process_directory()
20
 
21
  if not chunks:
22
  print("No chunks were processed. Check your input files.")
23
  sys.exit(1)
24
 
25
- print(f"Processed {len(chunks)} chunks from text files")
26
  processor.build_vector_store(chunks)
27
 
28
- print("Vector store built successfully!")
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  if __name__ == "__main__":
31
  main()
 
1
  #!/usr/bin/env python3
2
+ """Rebuild vector store with optimized processing"""
3
 
4
  import os
5
  import sys
6
+ from pathlib import Path
7
+
8
+ # Add the project root to Python path
9
+ project_root = Path(__file__).parent
10
+ sys.path.insert(0, str(project_root))
11
+
12
+ from src.preprocessing.advanced_processor import OptimizedTextProcessor
13
  from config.settings import config
14
 
def main():
    """Rebuild the vector store and report a chunk-type breakdown.

    Exits with status 1 when the raw-text directory is missing or when
    processing yields no chunks.
    """
    # Local import keeps the file-level import block unchanged.
    from collections import Counter

    print("Rebuilding MANIT RAG Vector Store with optimized chunking...")

    # Check if raw texts exist
    if not os.path.exists(config.RAW_TEXT_PATH):
        print(f"Error: Raw text path {config.RAW_TEXT_PATH} does not exist")
        sys.exit(1)

    # Process with optimized processor
    processor = OptimizedTextProcessor()
    chunks = processor.process_directory()

    if not chunks:
        print("No chunks were processed. Check your input files.")
        sys.exit(1)

    print(f"Processed {len(chunks)} optimized chunks")
    processor.build_vector_store(chunks)

    # The original message started with a stray space where the ✅ emoji
    # was evidently lost (sibling messages use 📊/📈) — restored here.
    print("✅ Optimized vector store built successfully!")
    print(f"📊 Total chunks: {len(chunks)}")

    # Display chunk type distribution (Counter replaces the manual
    # dict.get(..., 0) + 1 accumulation loop).
    chunk_types = Counter(
        chunk['metadata'].get('chunk_type', 'unknown') for chunk in chunks
    )

    print("\n📈 Chunk distribution:")
    for chunk_type, count in sorted(chunk_types.items()):
        print(f" - {chunk_type}: {count} chunks")

if __name__ == "__main__":
    main()
evaluate.py CHANGED
@@ -29,7 +29,7 @@ def evaluate_performance():
29
  "what is the name of person who registered the design for a paver block",
30
  "What are the objective for intellectual property rights cell at manit",
31
  "Tell me about mentorship program at MANIT",
32
- "What are the recent events at manti"
33
  ]
34
 
35
  results = []
 
29
  "what is the name of person who registered the design for a paver block",
30
  "What are the objective for intellectual property rights cell at manit",
31
  "Tell me about mentorship program at MANIT",
32
+ "What are the recent events at manit"
33
  ]
34
 
35
  results = []