Spaces:

Presidentlin
/

Aidan-Bench

Runtime error

App Files Community

Presidentlin committed on Aug 13, 2024

Commit

eebf495

1 Parent(s): fb39607

x

Browse files

Files changed (3) hide show

__pycache__/main.cpython-310.pyc +0 -0
app.py +10 -4
main.py +89 -40

__pycache__/main.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/main.cpython-310.pyc and b/__pycache__/main.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -41,6 +41,7 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
         models.sort(key=lambda model: model["id"])
         model_names = [model["id"] for model in models]
     except requests.exceptions.RequestException as e:
         st.error(f"Error fetching models from OpenRouter API: {e}")
         model_names = []  # Provide an empty list if API call fails
@@ -52,6 +53,13 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
         st.error("No models available. Please check your API connection.")
         st.stop()  # Stop execution if no models are available
     # Initialize session state for user_questions and predefined_questions
     if "user_questions" not in st.session_state:
         st.session_state.user_questions = []
@@ -107,8 +115,6 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
         if not selected_questions:
             st.warning("Please select at least one question.")
         else:
-            # Initialize progress bar
-            progress_bar = st.progress(0)
             num_questions = len(selected_questions)
             results = []
@@ -117,9 +123,9 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
             # Benchmarking logic using the chosen execution mode
             if execution_mode == "Sequential":
-                question_results = benchmark_model_sequential(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key)
             else:  # Multithreaded
-                question_results = benchmark_model_multithreaded(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key, max_threads)
             results.extend(question_results)

         models.sort(key=lambda model: model["id"])
         model_names = [model["id"] for model in models]
+        judge_models = [model["id"] for model in models if "gpt" in model["id"]]  # Example criteria
     except requests.exceptions.RequestException as e:
         st.error(f"Error fetching models from OpenRouter API: {e}")
         model_names = []  # Provide an empty list if API call fails
         st.error("No models available. Please check your API connection.")
         st.stop()  # Stop execution if no models are available
+    # Judge Model Selection
+    if judge_models:
+        judge_model_name = st.selectbox("Select a Judge Model", judge_models)
+    else:
+        st.error("No judge models available. Please check your API connection.")
+        st.stop()  # Stop execution if no judge models are available
     # Initialize session state for user_questions and predefined_questions
     if "user_questions" not in st.session_state:
         st.session_state.user_questions = []
         if not selected_questions:
             st.warning("Please select at least one question.")
         else:
             num_questions = len(selected_questions)
             results = []
             # Benchmarking logic using the chosen execution mode
             if execution_mode == "Sequential":
+                question_results = benchmark_model_sequential(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key,judge_model_name)
             else:  # Multithreaded
+                question_results = benchmark_model_multithreaded(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key, max_threads, judge_model_name)
             results.extend(question_results)

main.py CHANGED Viewed

@@ -7,37 +7,37 @@ import threading
 import streamlit as st  # Import Streamlit
 import queue
 def generate_answer(question, previous_answers, model_name, open_router_key, openai_api_key):
     """Generates an answer to a question using the specified language model."""
     gen_prompt = create_gen_prompt(question, previous_answers)
     try:
         new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key,
-                                             openai_api_key=openai_api_key)
         return new_answer
     except Exception as e:
         st.write(f"<span style='color:red'>Error generating answer: {str(e)}</span>",
-                         unsafe_allow_html=True)
         return None
-def evaluate_answer(question, new_answer, open_router_key, openai_api_key):
     """Evaluates the coherence and novelty of an answer."""
     judge_prompt = create_judge_prompt(question, new_answer)
-    judge = "openai/gpt-4o-mini"
     try:
         judge_response = chat_with_model(prompt=judge_prompt, model=judge, open_router_key=open_router_key,
-                                                 openai_api_key=openai_api_key)
         coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
         return coherence_score
     except Exception as e:
         st.write(f"<span style='color:red'>Error getting judge response: {str(e)}</span>",
-                         unsafe_allow_html=True)
         return None
-def process_question(question, model_name, open_router_key, openai_api_key, result_queue):
     start_time = time.time()
-    # st.write(f"<span style='color:red'>{question}</span>", unsafe_allow_html=True)
     previous_answers = []
     question_novelty = 0
@@ -47,20 +47,20 @@ def process_question(question, model_name, open_router_key, openai_api_key, resu
             if new_answer is None:
                 break
-            coherence_score = evaluate_answer(question, new_answer, open_router_key, openai_api_key)
             if coherence_score is None:
                 break
-            if coherence_score <= 6:
                 break
             novelty_score = get_novelty_score(new_answer, previous_answers, openai_api_key)
-            if novelty_score < 0.3:
                 break
-            # Append results to the queue instead of using st.write
-            result_queue.put({
                 "type": "answer",
                 "question": question,
                 "answer": new_answer,
@@ -69,26 +69,34 @@ def process_question(question, model_name, open_router_key, openai_api_key, resu
                 "results": [
                     {
                         "question": question,
-                        "answers": previous_answers.copy() + [new_answer],  # Include the new answer
                         "coherence_score": coherence_score,
-                        "novelty_score": question_novelty + novelty_score  # Accumulate novelty score
                     }
                 ]
-            })
             previous_answers.append(new_answer)
             question_novelty += novelty_score
     except Exception as e:
-        result_queue.put({"type": "error", "message": str(e)})
     time_taken = time.time() - start_time
-    result_queue.put({
-        "type": "summary",
-        "question": question,
-        "total_novelty": question_novelty,
-        "time_taken": time_taken
-    })
     return question_novelty, [
         {
@@ -121,7 +129,7 @@ def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key):
     return novelty
-def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key, max_threads=None):
     novelty_score = 0
     results = []
     result_queue = queue.Queue()  # Create a queue for communication
@@ -135,14 +143,13 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         # Submit tasks to the thread pool
         future_to_question = {
-            executor.submit(process_question, question, model_name, open_router_key, openai_api_key, result_queue): question
             for question in questions
         }
-        # Process results from the queue in the main thread
-        while True:
-            try:
-                result = result_queue.get_nowait()
                 if result["type"] == "answer":
                     st.write(f"**Question:** {result['question']}")
                     st.write(f"**New Answer:**\n{result['answer']}")
@@ -150,6 +157,11 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
                              unsafe_allow_html=True)
                     st.write(f"**Novelty Score:** {result['novelty_score']}")
                     results.extend(result["results"])  # Add results here
                 elif result["type"] == "summary":
                     st.write(f"<span style='color:blue'>Total novelty score for question '{result['question']}': {result['total_novelty']}</span>",
                              unsafe_allow_html=True)
@@ -158,27 +170,64 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
                 elif result["type"] == "error":
                     st.write(f"<span style='color:red'>Error in thread: {result['message']}</span>",
                              unsafe_allow_html=True)
-            except queue.Empty:
-                if not any(future.running() for future in future_to_question.keys()):
-                    break  # All tasks are done
     st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
              unsafe_allow_html=True)
     return results
-def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key, progress=0, progress_lock=None):
     novelty_score = 0
     results = []
     for i, question in enumerate(questions):
-        question_novelty, question_results = process_question(question, model_name, open_router_key, openai_api_key,
-                                                              progress_lock, i, len(questions), progress)
-        novelty_score += question_novelty
-        results.extend(question_results)
-        st.write(
-            f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>",
-            unsafe_allow_html=True)  # Display progress after each question
     st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
              unsafe_allow_html=True)

 import streamlit as st  # Import Streamlit
 import queue
 def generate_answer(question, previous_answers, model_name, open_router_key, openai_api_key):
     """Generates an answer to a question using the specified language model."""
     gen_prompt = create_gen_prompt(question, previous_answers)
     try:
         new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key,
+                                     openai_api_key=openai_api_key)
         return new_answer
     except Exception as e:
         st.write(f"<span style='color:red'>Error generating answer: {str(e)}</span>",
+                 unsafe_allow_html=True)
         return None
+def evaluate_answer(question, new_answer, open_router_key, openai_api_key, judge_model_name):
     """Evaluates the coherence and novelty of an answer."""
     judge_prompt = create_judge_prompt(question, new_answer)
+    judge = judge_model_name  # Use the judge_model_name passed to the function
     try:
         judge_response = chat_with_model(prompt=judge_prompt, model=judge, open_router_key=open_router_key,
+                                         openai_api_key=openai_api_key)
         coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
         return coherence_score
     except Exception as e:
         st.write(f"<span style='color:red'>Error getting judge response: {str(e)}</span>",
+                 unsafe_allow_html=True)
         return None
+def process_question(question, model_name, open_router_key, openai_api_key, result_queue, judge_model_name):
     start_time = time.time()
     previous_answers = []
     question_novelty = 0
             if new_answer is None:
                 break
+            coherence_score = evaluate_answer(question, new_answer, open_router_key, openai_api_key, judge_model_name)
             if coherence_score is None:
                 break
+            if coherence_score <= 3:
                 break
             novelty_score = get_novelty_score(new_answer, previous_answers, openai_api_key)
+            if novelty_score < 0.1:
                 break
+            result_dict = {
                 "type": "answer",
                 "question": question,
                 "answer": new_answer,
                 "results": [
                     {
                         "question": question,
+                        "answers": previous_answers.copy() + [new_answer],
                         "coherence_score": coherence_score,
+                        "novelty_score": question_novelty + novelty_score
                     }
                 ]
+            }
+            if result_queue is not None:  # Check if result_queue is provided
+                result_queue.put(result_dict)
+            yield result_dict  # Use yield to return the result immediately
             previous_answers.append(new_answer)
             question_novelty += novelty_score
     except Exception as e:
+        if result_queue is not None:  # Check if result_queue is provided
+            result_queue.put({"type": "error", "message": str(e)})
     time_taken = time.time() - start_time
+    if result_queue is not None:  # Check if result_queue is provided
+        result_queue.put({
+            "type": "summary",
+            "question": question,
+            "total_novelty": question_novelty,
+            "time_taken": time_taken
+        })
     return question_novelty, [
         {
     return novelty
+def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key, max_threads=None, judge_model_name=None):
     novelty_score = 0
     results = []
     result_queue = queue.Queue()  # Create a queue for communication
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         # Submit tasks to the thread pool
         future_to_question = {
+            executor.submit(process_question, question, model_name, open_router_key, openai_api_key, result_queue, judge_model_name): question
             for question in questions
         }
+        # Collect results as they become available from futures and the queue
+        for future in as_completed(future_to_question):
+            for result in future.result():  # Iterate over yielded results from process_question
                 if result["type"] == "answer":
                     st.write(f"**Question:** {result['question']}")
                     st.write(f"**New Answer:**\n{result['answer']}")
                              unsafe_allow_html=True)
                     st.write(f"**Novelty Score:** {result['novelty_score']}")
                     results.extend(result["results"])  # Add results here
+                    novelty_score += result["novelty_score"]  # Update novelty score
+                    st.write(
+                        f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>",
+                        unsafe_allow_html=True)
                 elif result["type"] == "summary":
                     st.write(f"<span style='color:blue'>Total novelty score for question '{result['question']}': {result['total_novelty']}</span>",
                              unsafe_allow_html=True)
                 elif result["type"] == "error":
                     st.write(f"<span style='color:red'>Error in thread: {result['message']}</span>",
                              unsafe_allow_html=True)
+        # Process remaining results in the queue (if any)
+        while not result_queue.empty():
+            result = result_queue.get()
+            if result["type"] == "answer":
+                st.write(f"**Question:** {result['question']}")
+                st.write(f"**New Answer:**\n{result['answer']}")
+                st.write(f"<span style='color:green'>Coherence Score: {result['coherence_score']}</span>",
+                             unsafe_allow_html=True)
+                st.write(f"**Novelty Score:** {result['novelty_score']}")
+                results.extend(result["results"])  # Add results here
+                novelty_score += result["novelty_score"]  # Update novelty score
+                st.write(
+                    f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>",
+                    unsafe_allow_html=True)
+            elif result["type"] == "summary":
+                st.write(f"<span style='color:blue'>Total novelty score for question '{result['question']}': {result['total_novelty']}</span>",
+                         unsafe_allow_html=True)
+                st.write(f"<span style='color:blue'>Time taken: {result['time_taken']} seconds</span>",
+                         unsafe_allow_html=True)
+            elif result["type"] == "error":
+                st.write(f"<span style='color:red'>Error in thread: {result['message']}</span>",
+                         unsafe_allow_html=True)
     st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
              unsafe_allow_html=True)
     return results
+def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key, judge_model_name):
     novelty_score = 0
     results = []
     for i, question in enumerate(questions):
+        for result in process_question(question, model_name, open_router_key, openai_api_key, None, judge_model_name):
+            if result["type"] == "answer":
+                st.write(f"**Question:** {result['question']}")
+                st.write(f"**New Answer:**\n{result['answer']}")
+                st.write(f"<span style='color:green'>Coherence Score: {result['coherence_score']}</span>",
+                         unsafe_allow_html=True)
+                st.write(f"**Novelty Score:** {result['novelty_score']}")
+                results.extend(result["results"])
+                novelty_score += result["novelty_score"] # Add to novelty score
+                st.write(
+                    f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>",
+                    unsafe_allow_html=True)
+            elif result["type"] == "summary":
+                st.write(f"<span style='color:blue'>Total novelty score for question '{result['question']}': {result['total_novelty']}</span>",
+                         unsafe_allow_html=True)
+                st.write(f"<span style='color:blue'>Time taken: {result['time_taken']} seconds</span>",
+                         unsafe_allow_html=True)
+            elif result["type"] == "error":
+                st.write(f"<span style='color:red'>Error in thread: {result['message']}</span>",
+                         unsafe_allow_html=True)
     st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
              unsafe_allow_html=True)