import gradio as gr import pandas as pd import numpy as np from typing import List, Dict, Any, Tuple, Optional def create_cluster_browser_app(): """ Create a simple Gradio app for browsing prompts by cluster from uploaded CSV file. """ def load_and_validate_csv(file) -> Tuple[Optional[pd.DataFrame], str, List[str], str]: """ Load and validate the uploaded CSV file. Args: file: Uploaded file object from Gradio Returns: Tuple of (dataframe, status_message, cluster_options, cluster_stats) """ if file is None: return None, "Please upload a CSV file with 'prompt' and 'cluster' columns.", ["(No data loaded)"], "" try: df = pd.read_csv(file.name) # Validate required columns required_cols = ['prompt', 'cluster'] missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: return None, f"Missing required columns: {missing_cols}. Please ensure your CSV has 'prompt' and 'cluster' columns.", ["(No data loaded)"], "" # Validate data types if not pd.api.types.is_numeric_dtype(df['cluster']): return None, "The 'cluster' column must contain numeric values.", ["(No data loaded)"], "" # Get cluster options unique_clusters = sorted(df['cluster'].unique()) cluster_options = ["(All Clusters)"] + [f"Cluster {c}" for c in unique_clusters] # Get cluster statistics stats = [] for cluster_num in unique_clusters: count = len(df[df['cluster'] == cluster_num]) stats.append(f"Cluster {cluster_num}: {count} prompts") total_prompts = len(df) stats_text = f"**Total Prompts:** {total_prompts}\n\n**Cluster Distribution:**\n" + "\n".join(stats) return df, f"✅ Successfully loaded {len(df)} prompts with {len(unique_clusters)} clusters.", cluster_options, stats_text except Exception as e: return None, f"Error loading CSV file: {str(e)}", ["(No data loaded)"], "" def filter_by_cluster(df: pd.DataFrame, cluster_sel: str) -> pd.DataFrame: """Filter dataframe by selected cluster.""" if df is None or cluster_sel == "(All Clusters)" or cluster_sel == "(No data loaded)": return df if df is not None else pd.DataFrame() cluster_num = int(cluster_sel.split()[-1]) # Extract number from "Cluster X" return df[df['cluster'] == cluster_num].reset_index(drop=True) def format_prompt_cell(prompt_text: str) -> str: """Format a single prompt in its own cell.""" return f"""