Spaces:
Running
Running
Commit
·
fedc47d
1
Parent(s):
63f41a1
fix: Resolve test case click errors and CO2 emissions calculations
Browse files
- Fixed token type error in thought_graph: convert prompt/completion tokens to int before addition
- Fixed on_test_case_select return values: return 8 gr.update() values instead of empty dict on errors
- Fixed CO2 emissions calculation: use delta values (diff) for time series charts instead of cumulative sum
- Fixed CO2 summary card: calculate total as final - initial value
- Updated chart titles to reflect "Incremental" values for CO2 and Power Cost
Resolves the TypeError raised when clicking test cases and the incorrect CO2 visualization.
- app.py +18 -5
- components/thought_graph.py +3 -2
- screens/trace_detail.py +44 -13
app.py
CHANGED
|
@@ -175,18 +175,31 @@ def on_test_case_select(evt: gr.SelectData, df):
|
|
| 175 |
|
| 176 |
print(f"[DEBUG] on_test_case_select called with index: {evt.index}")
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
# Check if we have a selected run
|
| 179 |
if current_selected_run is None:
|
| 180 |
print("[ERROR] No run selected - current_selected_run is None")
|
| 181 |
gr.Warning("Please select a run from the leaderboard first")
|
| 182 |
-
return
|
| 183 |
|
| 184 |
try:
|
| 185 |
# Get selected test case
|
| 186 |
selected_idx = evt.index[0]
|
| 187 |
if df is None or df.empty or selected_idx >= len(df):
|
| 188 |
gr.Warning("Invalid test case selection")
|
| 189 |
-
return
|
| 190 |
|
| 191 |
test_case = df.iloc[selected_idx].to_dict()
|
| 192 |
trace_id = test_case.get('trace_id')
|
|
@@ -197,7 +210,7 @@ def on_test_case_select(evt: gr.SelectData, df):
|
|
| 197 |
traces_dataset = current_selected_run.get('traces_dataset')
|
| 198 |
if not traces_dataset:
|
| 199 |
gr.Warning("No traces dataset found in current run")
|
| 200 |
-
return
|
| 201 |
|
| 202 |
# Update global trace info for MCP debug_trace tool
|
| 203 |
_current_trace_info["trace_id"] = trace_id
|
|
@@ -208,7 +221,7 @@ def on_test_case_select(evt: gr.SelectData, df):
|
|
| 208 |
|
| 209 |
if not trace_data:
|
| 210 |
gr.Warning(f"Trace not found: {trace_id}")
|
| 211 |
-
return
|
| 212 |
|
| 213 |
current_selected_trace = trace_data
|
| 214 |
|
|
@@ -278,7 +291,7 @@ def on_test_case_select(evt: gr.SelectData, df):
|
|
| 278 |
import traceback
|
| 279 |
traceback.print_exc()
|
| 280 |
gr.Warning(f"Error loading trace: {e}")
|
| 281 |
-
return
|
| 282 |
|
| 283 |
|
| 284 |
|
|
|
|
| 175 |
|
| 176 |
print(f"[DEBUG] on_test_case_select called with index: {evt.index}")
|
| 177 |
|
| 178 |
+
# Helper function to return empty updates for all 8 outputs
|
| 179 |
+
def return_error():
|
| 180 |
+
return (
|
| 181 |
+
gr.update(), # run_detail_screen
|
| 182 |
+
gr.update(), # trace_detail_screen
|
| 183 |
+
gr.update(), # trace_title
|
| 184 |
+
gr.update(), # trace_metadata_html
|
| 185 |
+
gr.update(), # trace_thought_graph
|
| 186 |
+
gr.update(), # span_visualization
|
| 187 |
+
gr.update(), # span_details_table
|
| 188 |
+
gr.update() # span_details_json
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
# Check if we have a selected run
|
| 192 |
if current_selected_run is None:
|
| 193 |
print("[ERROR] No run selected - current_selected_run is None")
|
| 194 |
gr.Warning("Please select a run from the leaderboard first")
|
| 195 |
+
return return_error()
|
| 196 |
|
| 197 |
try:
|
| 198 |
# Get selected test case
|
| 199 |
selected_idx = evt.index[0]
|
| 200 |
if df is None or df.empty or selected_idx >= len(df):
|
| 201 |
gr.Warning("Invalid test case selection")
|
| 202 |
+
return return_error()
|
| 203 |
|
| 204 |
test_case = df.iloc[selected_idx].to_dict()
|
| 205 |
trace_id = test_case.get('trace_id')
|
|
|
|
| 210 |
traces_dataset = current_selected_run.get('traces_dataset')
|
| 211 |
if not traces_dataset:
|
| 212 |
gr.Warning("No traces dataset found in current run")
|
| 213 |
+
return return_error()
|
| 214 |
|
| 215 |
# Update global trace info for MCP debug_trace tool
|
| 216 |
_current_trace_info["trace_id"] = trace_id
|
|
|
|
| 221 |
|
| 222 |
if not trace_data:
|
| 223 |
gr.Warning(f"Trace not found: {trace_id}")
|
| 224 |
+
return return_error()
|
| 225 |
|
| 226 |
current_selected_trace = trace_data
|
| 227 |
|
|
|
|
| 291 |
import traceback
|
| 292 |
traceback.print_exc()
|
| 293 |
gr.Warning(f"Error loading trace: {e}")
|
| 294 |
+
return return_error()
|
| 295 |
|
| 296 |
|
| 297 |
|
components/thought_graph.py
CHANGED
|
@@ -196,8 +196,9 @@ def create_thought_graph(spans: List[Dict[str, Any]], trace_id: str = "Unknown")
|
|
| 196 |
if 'tool_name' in node_data:
|
| 197 |
hover += f"Tool: {node_data['tool_name']}<br>"
|
| 198 |
if 'prompt_tokens' in node_data or 'completion_tokens' in node_data:
|
| 199 |
-
|
| 200 |
-
|
|
|
|
| 201 |
hover += f"Tokens: {prompt + completion} (p:{prompt}, c:{completion})<br>"
|
| 202 |
if 'cost' in node_data and node_data['cost'] is not None:
|
| 203 |
hover += f"Cost: ${node_data['cost']:.6f}<br>"
|
|
|
|
| 196 |
if 'tool_name' in node_data:
|
| 197 |
hover += f"Tool: {node_data['tool_name']}<br>"
|
| 198 |
if 'prompt_tokens' in node_data or 'completion_tokens' in node_data:
|
| 199 |
+
# Ensure values are integers, not strings
|
| 200 |
+
prompt = int(node_data.get('prompt_tokens', 0) or 0) # Handle None values and convert to int
|
| 201 |
+
completion = int(node_data.get('completion_tokens', 0) or 0) # Handle None values and convert to int
|
| 202 |
hover += f"Tokens: {prompt + completion} (p:{prompt}, c:{completion})<br>"
|
| 203 |
if 'cost' in node_data and node_data['cost'] is not None:
|
| 204 |
hover += f"Cost: ${node_data['cost']:.6f}<br>"
|
screens/trace_detail.py
CHANGED
|
@@ -550,20 +550,44 @@ def extract_metrics_data(metrics_df):
|
|
| 550 |
gpu_temperature_celsius, gpu_power_watts, co2_emissions_gco2e
|
| 551 |
|
| 552 |
Returns:
|
| 553 |
-
DataFrame ready for visualization
|
| 554 |
"""
|
| 555 |
if metrics_df is None or metrics_df.empty:
|
| 556 |
return pd.DataFrame()
|
| 557 |
|
|
|
|
|
|
|
|
|
|
| 558 |
# Ensure timestamp is datetime
|
| 559 |
-
if 'timestamp' in
|
| 560 |
-
if not pd.api.types.is_datetime64_any_dtype(
|
| 561 |
-
|
| 562 |
|
| 563 |
# Sort by timestamp
|
| 564 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
|
| 566 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
|
| 568 |
|
| 569 |
def create_gpu_summary_cards(df):
|
|
@@ -591,13 +615,19 @@ def create_gpu_summary_cards(df):
|
|
| 591 |
utilization = df['gpu_utilization_percent'].mean() if 'gpu_utilization_percent' in df.columns else 0
|
| 592 |
memory_used = df['gpu_memory_used_mib'].max() if 'gpu_memory_used_mib' in df.columns else 0
|
| 593 |
temperature = df['gpu_temperature_celsius'].max() if 'gpu_temperature_celsius' in df.columns else 0
|
| 594 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 595 |
power = df['gpu_power_watts'].mean() if 'gpu_power_watts' in df.columns else 0
|
| 596 |
|
| 597 |
# Get GPU name from first row (it's constant across all rows)
|
| 598 |
gpu_name = df['gpu_name'].iloc[0] if 'gpu_name' in df.columns and not df.empty else 'Unknown GPU'
|
| 599 |
|
| 600 |
-
print(f"[DEBUG create_gpu_summary_cards] Aggregated values - util: {utilization:.2f}, mem: {memory_used:.2f}, temp: {temperature:.2f}, gpu_name: {gpu_name}")
|
| 601 |
|
| 602 |
# Get memory total from max value if available
|
| 603 |
memory_total = df['gpu_memory_total_mib'].max() if 'gpu_memory_total_mib' in df.columns else 0
|
|
@@ -662,7 +692,7 @@ def create_gpu_metrics_dashboard(metrics_df):
|
|
| 662 |
return None
|
| 663 |
|
| 664 |
# Create subplots for GPU metrics
|
| 665 |
-
# We'll show: Utilization, Memory, Temperature, Power, CO2
|
| 666 |
fig = make_subplots(
|
| 667 |
rows=3, cols=2,
|
| 668 |
subplot_titles=[
|
|
@@ -670,8 +700,8 @@ def create_gpu_metrics_dashboard(metrics_df):
|
|
| 670 |
'GPU Memory (MiB)',
|
| 671 |
'GPU Temperature (°C)',
|
| 672 |
'GPU Power (W)',
|
| 673 |
-
'CO2 Emissions (g)',
|
| 674 |
-
'Power Cost (USD)'
|
| 675 |
],
|
| 676 |
vertical_spacing=0.10,
|
| 677 |
horizontal_spacing=0.12,
|
|
@@ -681,13 +711,14 @@ def create_gpu_metrics_dashboard(metrics_df):
|
|
| 681 |
colors = ['#667eea', '#f093fb', '#4facfe', '#FFE66D', '#43e97b', '#FF6B6B']
|
| 682 |
|
| 683 |
# Define metrics to plot
|
|
|
|
| 684 |
metrics_config = [
|
| 685 |
('gpu_utilization_percent', 'GPU Utilization (%)', 1, 1, colors[0]),
|
| 686 |
('gpu_memory_used_mib', 'GPU Memory (MiB)', 1, 2, colors[1]),
|
| 687 |
('gpu_temperature_celsius', 'GPU Temperature (°C)', 2, 1, colors[2]),
|
| 688 |
('gpu_power_watts', 'GPU Power (W)', 2, 2, colors[3]),
|
| 689 |
-
('co2_emissions_gco2e', 'CO2 Emissions (g)', 3, 1, colors[4]),
|
| 690 |
-
('power_cost_usd', 'Power Cost (USD)', 3, 2, colors[5]),
|
| 691 |
]
|
| 692 |
|
| 693 |
for col_name, title, row, col, color in metrics_config:
|
|
|
|
| 550 |
gpu_temperature_celsius, gpu_power_watts, co2_emissions_gco2e
|
| 551 |
|
| 552 |
Returns:
|
| 553 |
+
DataFrame ready for visualization with delta values for cumulative counters
|
| 554 |
"""
|
| 555 |
if metrics_df is None or metrics_df.empty:
|
| 556 |
return pd.DataFrame()
|
| 557 |
|
| 558 |
+
# Make a copy to avoid modifying original
|
| 559 |
+
df = metrics_df.copy()
|
| 560 |
+
|
| 561 |
# Ensure timestamp is datetime
|
| 562 |
+
if 'timestamp' in df.columns:
|
| 563 |
+
if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
|
| 564 |
+
df['timestamp'] = pd.to_datetime(df['timestamp'])
|
| 565 |
|
| 566 |
# Sort by timestamp
|
| 567 |
+
df = df.sort_values('timestamp').reset_index(drop=True)
|
| 568 |
+
|
| 569 |
+
# Calculate deltas for cumulative counters (CO2 and Power Cost)
|
| 570 |
+
# These are cumulative metrics, so we need to show the incremental change
|
| 571 |
+
cumulative_metrics = ['co2_emissions_gco2e', 'power_cost_usd']
|
| 572 |
+
|
| 573 |
+
for metric in cumulative_metrics:
|
| 574 |
+
if metric in df.columns:
|
| 575 |
+
# Calculate delta (difference from previous value)
|
| 576 |
+
# First value gets 0 (since we don't know the previous state)
|
| 577 |
+
df[f'{metric}_delta'] = df[metric].diff().fillna(0)
|
| 578 |
+
|
| 579 |
+
# Handle negative deltas (can happen if counter resets)
|
| 580 |
+
# If delta is negative, use the current value itself as the delta
|
| 581 |
+
df.loc[df[f'{metric}_delta'] < 0, f'{metric}_delta'] = df.loc[df[f'{metric}_delta'] < 0, metric]
|
| 582 |
|
| 583 |
+
# Replace the original cumulative columns with delta values for visualization
|
| 584 |
+
if 'co2_emissions_gco2e' in df.columns and 'co2_emissions_gco2e_delta' in df.columns:
|
| 585 |
+
df['co2_emissions_gco2e'] = df['co2_emissions_gco2e_delta']
|
| 586 |
+
|
| 587 |
+
if 'power_cost_usd' in df.columns and 'power_cost_usd_delta' in df.columns:
|
| 588 |
+
df['power_cost_usd'] = df['power_cost_usd_delta']
|
| 589 |
+
|
| 590 |
+
return df
|
| 591 |
|
| 592 |
|
| 593 |
def create_gpu_summary_cards(df):
|
|
|
|
| 615 |
utilization = df['gpu_utilization_percent'].mean() if 'gpu_utilization_percent' in df.columns else 0
|
| 616 |
memory_used = df['gpu_memory_used_mib'].max() if 'gpu_memory_used_mib' in df.columns else 0
|
| 617 |
temperature = df['gpu_temperature_celsius'].max() if 'gpu_temperature_celsius' in df.columns else 0
|
| 618 |
+
|
| 619 |
+
# CO2 emissions is a cumulative counter - calculate delta (final - initial)
|
| 620 |
+
if 'co2_emissions_gco2e' in df.columns and not df.empty:
|
| 621 |
+
co2_emissions = df['co2_emissions_gco2e'].iloc[-1] - df['co2_emissions_gco2e'].iloc[0]
|
| 622 |
+
else:
|
| 623 |
+
co2_emissions = 0
|
| 624 |
+
|
| 625 |
power = df['gpu_power_watts'].mean() if 'gpu_power_watts' in df.columns else 0
|
| 626 |
|
| 627 |
# Get GPU name from first row (it's constant across all rows)
|
| 628 |
gpu_name = df['gpu_name'].iloc[0] if 'gpu_name' in df.columns and not df.empty else 'Unknown GPU'
|
| 629 |
|
| 630 |
+
print(f"[DEBUG create_gpu_summary_cards] Aggregated values - util: {utilization:.2f}, mem: {memory_used:.2f}, temp: {temperature:.2f}, co2: {co2_emissions:.4f}, gpu_name: {gpu_name}")
|
| 631 |
|
| 632 |
# Get memory total from max value if available
|
| 633 |
memory_total = df['gpu_memory_total_mib'].max() if 'gpu_memory_total_mib' in df.columns else 0
|
|
|
|
| 692 |
return None
|
| 693 |
|
| 694 |
# Create subplots for GPU metrics
|
| 695 |
+
# We'll show: Utilization, Memory, Temperature, Power, CO2 (delta), Power Cost (delta)
|
| 696 |
fig = make_subplots(
|
| 697 |
rows=3, cols=2,
|
| 698 |
subplot_titles=[
|
|
|
|
| 700 |
'GPU Memory (MiB)',
|
| 701 |
'GPU Temperature (°C)',
|
| 702 |
'GPU Power (W)',
|
| 703 |
+
'CO2 Emissions - Incremental (g)',
|
| 704 |
+
'Power Cost - Incremental (USD)'
|
| 705 |
],
|
| 706 |
vertical_spacing=0.10,
|
| 707 |
horizontal_spacing=0.12,
|
|
|
|
| 711 |
colors = ['#667eea', '#f093fb', '#4facfe', '#FFE66D', '#43e97b', '#FF6B6B']
|
| 712 |
|
| 713 |
# Define metrics to plot
|
| 714 |
+
# Note: CO2 and Power Cost are shown as delta/incremental values (calculated in extract_metrics_data)
|
| 715 |
metrics_config = [
|
| 716 |
('gpu_utilization_percent', 'GPU Utilization (%)', 1, 1, colors[0]),
|
| 717 |
('gpu_memory_used_mib', 'GPU Memory (MiB)', 1, 2, colors[1]),
|
| 718 |
('gpu_temperature_celsius', 'GPU Temperature (°C)', 2, 1, colors[2]),
|
| 719 |
('gpu_power_watts', 'GPU Power (W)', 2, 2, colors[3]),
|
| 720 |
+
('co2_emissions_gco2e', 'CO2 Emissions - Incremental (g)', 3, 1, colors[4]),
|
| 721 |
+
('power_cost_usd', 'Power Cost - Incremental (USD)', 3, 2, colors[5]),
|
| 722 |
]
|
| 723 |
|
| 724 |
for col_name, title, row, col, color in metrics_config:
|