import datetime
import os
import re

import yaml
from datasets import Dataset, load_dataset
from huggingface_hub import create_repo, login
import gradio as gr

# Constants
DATASET_NAME = "jablonkagroup/eval-cards-dataset"


def setup_hf_auth():
    """Log in to the Hugging Face Hub using the ``HF_TOKEN`` env var.

    Returns:
        bool: True when a token was present and ``login`` did not raise,
        False when the token is missing or login failed.
    """
    try:
        hf_token = os.environ.get("HF_TOKEN")
        if hf_token:
            login(token=hf_token)
            return True
        return False
    except Exception as e:
        # Auth is best-effort: callers fall back to demo data when it fails.
        print(f"HF Auth error: {e}")
        return False


def ensure_dataset_exists():
    """Ensure the dataset repository exists, creating an empty one if needed.

    Returns:
        bool: True when the dataset exists (or was created successfully),
        False when authentication or creation failed.
    """
    try:
        load_dataset(DATASET_NAME, split="train")
        return True
    except Exception:
        try:
            if not setup_hf_auth():
                return False
            create_repo(
                repo_id=DATASET_NAME,
                repo_type="dataset",
                private=False,
                exist_ok=True,
            )
            # Seed the repo with an empty table so later pushes share a schema.
            empty_data = {
                "filename": [],
                "title": [],
                "summary": [],
                "authors": [],
                "creation_date": [],
                "coverage_score": [],
                "yaml_content": [],
                "paper_link": [],
                "repository_link": [],
                "timestamp": [],
            }
            empty_dataset = Dataset.from_dict(empty_data)
            empty_dataset.push_to_hub(DATASET_NAME)
            return True
        except Exception as e:
            print(f"Dataset creation error: {e}")
            return False


def get_template():
    """Return a starter YAML template for a new evaluation card."""
    return """title: "Your Evaluation Title"
summary: "Brief description of your evaluation"
metadata:
  authors: ["Author Name"]
  creation_date: "2025-01-01"
  paper_link: ""
  repository_link: ""
evaluation_design:
  purpose: "What is the purpose of this evaluation?"
  scope: "What does this evaluation cover?"
estimand:
  definition: "What are you trying to estimate?"
estimator:
  method: "How are you estimating it?"
estimate:
  results: "What are the results?"
results_communication:
  format: "How are results communicated?"
known_issues_and_limitations:
  issues: ["List any known issues"]
version_and_maintenance:
  version: "1.0"
citation_and_usage:
  citation: "How to cite this work"
"""
def compute_coverage_score(eval_data):
    """Compute a weighted completeness score for an eval card.

    Each top-level section has a fixed weight (summing to 105, capped at
    100); a section's contribution is its weight scaled by the fraction of
    leaf fields that are non-empty.

    Args:
        eval_data: Parsed YAML card as a (possibly nested) dict.

    Returns:
        tuple: ``(total_score, scores)`` where ``total_score`` is a float
        in [0, 100] and ``scores`` maps each section name to a dict with
        ``score``, ``max_score`` and ``completion_rate`` (percent).
    """
    sections = {
        "metadata": 5,
        "evaluation_design": 10,
        "estimand": 20,
        "estimator": 20,
        "estimate": 20,
        "results_communication": 10,
        "known_issues_and_limitations": 10,
        "version_and_maintenance": 5,
        "citation_and_usage": 5,
    }

    def count_filled_fields(data):
        # Recursively count (filled, total) leaf fields; empty containers
        # count as one unfilled field so they lower the completion rate.
        if isinstance(data, dict):
            filled = total = 0
            for value in data.values():
                if isinstance(value, (dict, list)):
                    sub_filled, sub_total = count_filled_fields(value)
                    filled += sub_filled
                    total += sub_total
                else:
                    total += 1
                    if value and str(value).strip() not in ["", "[]", "{}"]:
                        filled += 1
            return filled, total
        elif isinstance(data, list):
            if not data:
                return 0, 1
            filled = total = 0
            for item in data:
                sub_filled, sub_total = count_filled_fields(item)
                filled += sub_filled
                total += sub_total
            return filled, total
        else:
            return 1 if data else 0, 1

    scores = {}
    total_score = 0
    for section, weight in sections.items():
        if section in eval_data:
            filled, total = count_filled_fields(eval_data[section])
            completion_rate = filled / total if total > 0 else 0
            scores[section] = {
                "score": round(completion_rate * weight, 2),
                "max_score": weight,
                "completion_rate": round(completion_rate * 100, 2),
            }
            total_score += scores[section]["score"]
        else:
            # Missing sections contribute nothing but still appear in the
            # breakdown so the gallery can render every row.
            scores[section] = {
                "score": 0,
                "max_score": weight,
                "completion_rate": 0,
            }

    return min(round(total_score, 2), 100), scores


def get_sample_cards():
    """Return hard-coded sample cards used when the HF dataset is empty
    or unreachable (demo mode)."""
    return [
        {
            "title": "ChemBench: A Large-Scale Chemical Evaluation",
            "summary": "A comprehensive benchmark for evaluating chemical property prediction models across multiple datasets and molecular representations.",
            "authors": "John Doe, Jane Smith",
            "creation_date": "2024-12-15",
            "coverage_score": 85.5,
            "paper_link": "https://arxiv.org/abs/2401.example",
            "repository_link": "https://github.com/example/chembench",
            "score_details": {
                "metadata": {"score": 4.5, "max_score": 5, "completion_rate": 90},
                "evaluation_design": {
                    "score": 9.0,
                    "max_score": 10,
                    "completion_rate": 90,
                },
                "estimand": {"score": 18.0, "max_score": 20, "completion_rate": 90},
                "estimator": {"score": 17.0, "max_score": 20, "completion_rate": 85},
                "estimate": {"score": 16.0, "max_score": 20, "completion_rate": 80},
                "results_communication": {
                    "score": 8.0,
                    "max_score": 10,
                    "completion_rate": 80,
                },
                "known_issues_and_limitations": {
                    "score": 7.0,
                    "max_score": 10,
                    "completion_rate": 70,
                },
                "version_and_maintenance": {
                    "score": 3.0,
                    "max_score": 5,
                    "completion_rate": 60,
                },
                "citation_and_usage": {
                    "score": 3.0,
                    "max_score": 5,
                    "completion_rate": 60,
                },
            },
        },
        {
            "title": "MaterialsML: Property Prediction Framework",
            "summary": "An evaluation framework for materials property prediction using machine learning approaches on crystal structure data.",
            "authors": "Alice Johnson, Bob Wilson",
            "creation_date": "2024-11-20",
            "coverage_score": 92.0,
            "paper_link": "",
            "repository_link": "https://github.com/example/materialsml",
            "score_details": {
                "metadata": {"score": 5.0, "max_score": 5, "completion_rate": 100},
                "evaluation_design": {
                    "score": 10.0,
                    "max_score": 10,
                    "completion_rate": 100,
                },
                "estimand": {"score": 19.0, "max_score": 20, "completion_rate": 95},
                "estimator": {"score": 18.0, "max_score": 20, "completion_rate": 90},
                "estimate": {"score": 19.0, "max_score": 20, "completion_rate": 95},
                "results_communication": {
                    "score": 9.0,
                    "max_score": 10,
                    "completion_rate": 90,
                },
                "known_issues_and_limitations": {
                    "score": 8.0,
                    "max_score": 10,
                    "completion_rate": 80,
                },
                "version_and_maintenance": {
                    "score": 2.0,
                    "max_score": 5,
                    "completion_rate": 40,
                },
                "citation_and_usage": {
                    "score": 2.0,
                    "max_score": 5,
                    "completion_rate": 40,
                },
            },
        },
    ]
def save_eval_card(yaml_content, paper_url="", repo_url=""):
    """Validate an eval card and append it to the HF dataset.

    Args:
        yaml_content: Card contents as a YAML string.
        paper_url: Optional paper URL merged into ``metadata.paper_link``.
        repo_url: Optional repo URL merged into ``metadata.repository_link``.

    Returns:
        str: Human-readable status message including the coverage-score
        breakdown; prefixed with an emoji indicating success / HF failure /
        validation error.
    """
    try:
        eval_data = yaml.safe_load(yaml_content)

        # Add URLs to metadata if provided
        if paper_url:
            eval_data.setdefault("metadata", {})["paper_link"] = paper_url
        if repo_url:
            eval_data.setdefault("metadata", {})["repository_link"] = repo_url
        yaml_content = yaml.dump(eval_data)

        # BUG FIX: the sanitized title was previously discarded — `filename`
        # was overwritten with a placeholder instead of being interpolated
        # into the timestamped name.
        safe_title = re.sub(r"[^\w\-_]", "_", eval_data.get("title", "Unnamed"))
        filename = (
            f"{safe_title}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
        )

        score, score_details = compute_coverage_score(eval_data)

        # Try to save to HF dataset, but don't fail if it doesn't work
        saved_to_hf = False
        if setup_hf_auth():
            try:
                dataset = load_dataset(DATASET_NAME, split="train")
                existing_data = dataset.to_dict()
                saved_to_hf = True
            except Exception:
                # Dataset may not exist yet: start from an empty schema.
                try:
                    existing_data = {
                        "filename": [],
                        "title": [],
                        "summary": [],
                        "authors": [],
                        "creation_date": [],
                        "coverage_score": [],
                        "yaml_content": [],
                        "paper_link": [],
                        "repository_link": [],
                        "timestamp": [],
                    }
                    ensure_dataset_exists()
                    saved_to_hf = True
                except Exception:
                    saved_to_hf = False

            if saved_to_hf:
                try:
                    existing_data["filename"].append(filename)
                    existing_data["title"].append(eval_data.get("title", "Unnamed"))
                    existing_data["summary"].append(eval_data.get("summary", ""))
                    existing_data["authors"].append(
                        ", ".join(eval_data.get("metadata", {}).get("authors", []))
                    )
                    existing_data["creation_date"].append(
                        eval_data.get("metadata", {}).get("creation_date", "")
                    )
                    existing_data["coverage_score"].append(float(score))
                    existing_data["yaml_content"].append(yaml_content)
                    existing_data["paper_link"].append(paper_url or "")
                    existing_data["repository_link"].append(repo_url or "")
                    existing_data["timestamp"].append(
                        datetime.datetime.now().isoformat()
                    )

                    updated_dataset = Dataset.from_dict(existing_data)
                    updated_dataset.push_to_hub(DATASET_NAME)
                except Exception as e:
                    saved_to_hf = False
                    print(f"Failed to save to HF: {e}")

        details_str = f"Coverage Score: {score}%\n\nSection Details:\n"
        for section, details in score_details.items():
            details_str += (
                f"• {section}: {details['score']}/{details['max_score']}"
                f" ({details['completion_rate']}%)\n"
            )

        if saved_to_hf:
            return (
                f"✅ Successfully saved to HF dataset! "
                f"Filename: {filename}\n\n{details_str}"
            )
        else:
            return (
                f"⚠️ Validated successfully but couldn't save to HF dataset "
                f"(check HF_TOKEN)\nFilename: {filename}\n\n{details_str}"
            )

    except Exception as e:
        return f"❌ Error: {str(e)}"
def load_gallery_cards():
    """Load the card view for the gallery, falling back to sample data.

    Returns:
        str: HTML for the gallery (real dataset when reachable and
        non-empty, otherwise styled demo cards; an error box on failure).
    """
    try:
        # Try to load from HF dataset first
        if setup_hf_auth():
            try:
                dataset = load_dataset(DATASET_NAME, split="train")
                if len(dataset) > 0:
                    return create_gallery_html_from_dataset(dataset)
            except Exception as e:
                print(f"Failed to load from HF dataset: {e}")

        # Fallback to sample data with nice styling
        sample_cards = get_sample_cards()
        return create_gallery_html_from_samples(sample_cards)
    except Exception as e:
        return f"""
        <div class="error-message">
            <h3>❌ Error loading gallery</h3>
            <p>{str(e)}</p>
        </div>
        """


def create_gallery_html_from_dataset(dataset):
    """Render gallery HTML from the HF dataset, best cards first.

    Section score details are recomputed from each row's stored YAML;
    rows with unparsable YAML render without a per-section breakdown.
    """
    cards_html = '<div class="gallery-container">'
    order = sorted(
        range(len(dataset)), key=lambda i: dataset[i]["coverage_score"], reverse=True
    )
    for i in order:
        row = dataset[i]
        try:
            eval_data = yaml.safe_load(row["yaml_content"])
            _, score_details = compute_coverage_score(eval_data)
        except Exception:
            score_details = {}
        card_data = {
            "title": row["title"],
            "summary": row["summary"],
            "authors": row["authors"],
            "creation_date": row["creation_date"],
            "coverage_score": row["coverage_score"],
            "paper_link": row.get("paper_link", ""),
            "repository_link": row.get("repository_link", ""),
            "score_details": score_details,
        }
        cards_html += create_card_html(card_data)
    cards_html += "</div>"
    return cards_html


def create_gallery_html_from_samples(sample_cards):
    """Render gallery HTML from the built-in demo cards, with a banner
    noting that this is sample data."""
    cards_html = """
    <div class="gallery-container">
        <div class="demo-notice">
            <h3>🎨 Demo Gallery</h3>
            <p>Showing sample evaluation cards. Submit your own card to see it here!</p>
        </div>
    """
    for card in sample_cards:
        cards_html += create_card_html(card)
    cards_html += "</div>"
    return cards_html


def create_card_html(card_data):
    """Render one evaluation card as HTML.

    Args:
        card_data: Dict with ``title``, ``summary``, ``authors``,
            ``creation_date``, ``coverage_score``, ``score_details`` and
            optional ``paper_link`` / ``repository_link``.

    Returns:
        str: HTML snippet styled by the ``.eval-card`` CSS rules.
    """
    # Get coverage color: green >= 80, orange >= 60, red otherwise.
    score = card_data["coverage_score"]
    if score >= 80:
        score_color = "#2e7d32"
        score_bg = "#e8f5e8"
    elif score >= 60:
        score_color = "#f57c00"
        score_bg = "#fff3e0"
    else:
        score_color = "#d32f2f"
        score_bg = "#ffebee"

    html = f"""
    <div class="eval-card">
        <div class="card-header">
            <h3 class="card-title">🎯 {card_data["title"]}</h3>
            <div class="coverage-badge" style="background: {score_bg}; color: {score_color};">
                {card_data["coverage_score"]}%
            </div>
        </div>
        <div class="card-content">
            <p class="card-summary"><strong>📝 Summary:</strong> {card_data["summary"]}</p>
            <div class="card-meta">
                <p><strong>👥 Authors:</strong> {card_data["authors"]}</p>
                <p><strong>📅 Created:</strong> {card_data["creation_date"]}</p>
            </div>
        </div>
        <div class="coverage-section">
            <h4>📊 Coverage by Section</h4>
            <div class="coverage-grid">
    """

    for section, details in card_data["score_details"].items():
        section_display = section.replace("_", " ").title()
        completion = details["completion_rate"]

        # Color coding for completion rates
        if completion >= 80:
            bar_color = "#4caf50"
        elif completion >= 60:
            bar_color = "#ff9800"
        else:
            bar_color = "#f44336"

        html += f"""
                <div class="coverage-item">
                    <span class="coverage-label">{section_display}</span>
                    <div class="coverage-bar">
                        <div class="coverage-fill" style="width: {completion}%; background: {bar_color};"></div>
                    </div>
                    <span class="coverage-text">{details["score"]}/{details["max_score"]} ({completion}%)</span>
                </div>
        """

    html += """
            </div>
        </div>
    """

    # Add links if available
    if card_data.get("paper_link") or card_data.get("repository_link"):
        html += '<div class="card-links">'
        if card_data.get("paper_link"):
            html += (
                f'<a href="{card_data["paper_link"]}" class="link-button" '
                f'target="_blank">📄 Paper</a>'
            )
        if card_data.get("repository_link"):
            html += (
                f'<a href="{card_data["repository_link"]}" class="link-button" '
                f'target="_blank">💻 Repository</a>'
            )
        html += "</div>"

    html += "</div>"
    return html
def load_gallery_table():
    """Load the table view for the gallery, falling back to sample data.

    Returns:
        str: Plain-text ranking table (real dataset when reachable and
        non-empty, otherwise demo data; an error string on failure).
    """
    try:
        if setup_hf_auth():
            try:
                dataset = load_dataset(DATASET_NAME, split="train")
                if len(dataset) > 0:
                    return create_table_from_dataset(dataset)
            except Exception:
                pass

        # Fallback to sample data
        sample_cards = get_sample_cards()
        return create_table_from_samples(sample_cards)
    except Exception as e:
        return f"Error loading table: {str(e)}"


def create_table_from_dataset(dataset):
    """Build a fixed-width text table from the HF dataset, ranked by
    coverage score (top 50 rows shown)."""
    table_text = "📊 Evaluation Cards Summary\n" + "=" * 100 + "\n\n"

    sorted_indices = sorted(
        range(len(dataset)), key=lambda i: dataset[i]["coverage_score"], reverse=True
    )

    table_text += (
        f"{'Rank':<6} {'Title':<35} {'Authors':<30} {'Coverage':<12} {'Created':<12}\n"
    )
    table_text += "-" * 100 + "\n"

    for rank, i in enumerate(sorted_indices[:50], 1):
        row = dataset[i]
        # Truncate long fields so columns stay aligned.
        title = row["title"][:32] + "..." if len(row["title"]) > 35 else row["title"]
        authors = (
            row["authors"][:27] + "..." if len(row["authors"]) > 30 else row["authors"]
        )
        table_text += f"{rank:<6} {title:<35} {authors:<30} {row['coverage_score']:<11}% {row['creation_date']:<12}\n"

    table_text += "\n" + "=" * 100
    table_text += f"\n\nTotal Cards: {len(dataset)}"
    table_text += f"\nAverage Coverage: {sum(row['coverage_score'] for row in dataset) / len(dataset):.1f}%"
    return table_text


def create_table_from_samples(sample_cards):
    """Build a fixed-width text table from the demo cards, ranked by
    coverage score."""
    table_text = "📊 Evaluation Cards Summary (Demo Data)\n" + "=" * 100 + "\n\n"

    sorted_cards = sorted(sample_cards, key=lambda x: x["coverage_score"], reverse=True)

    table_text += (
        f"{'Rank':<6} {'Title':<35} {'Authors':<30} {'Coverage':<12} {'Created':<12}\n"
    )
    table_text += "-" * 100 + "\n"

    for rank, card in enumerate(sorted_cards, 1):
        title = card["title"][:32] + "..." if len(card["title"]) > 35 else card["title"]
        authors = (
            card["authors"][:27] + "..."
            if len(card["authors"]) > 30
            else card["authors"]
        )
        table_text += f"{rank:<6} {title:<35} {authors:<30} {card['coverage_score']:<11}% {card['creation_date']:<12}\n"

    table_text += "\n" + "=" * 100
    table_text += f"\n\nTotal Cards: {len(sorted_cards)} (demo)"
    table_text += f"\nAverage Coverage: {sum(card['coverage_score'] for card in sorted_cards) / len(sorted_cards):.1f}%"
    return table_text


def get_llm_feedback(yaml_content):
    """Request improvement suggestions for a card from Groq's
    OpenAI-compatible chat API.

    Requires the ``GROQ_API_KEY`` env var; returns a user-facing message
    string in all cases (feedback text, or an explanatory error).
    """
    api_token = os.environ.get("GROQ_API_KEY")
    if not api_token:
        return "Please set GROQ_API_KEY in Space settings to get LLM feedback."

    if not yaml_content.strip():
        return "Please provide YAML content first."

    try:
        import requests

        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_token}",
            },
            json={
                "model": "llama-3.3-70b-versatile",
                "messages": [
                    {
                        "role": "user",
                        "content": f"Analyze this evaluation card YAML and provide specific improvement suggestions:\n\n```yaml\n{yaml_content}\n```\n\nFocus on completeness, clarity, and best practices.",
                    }
                ],
            },
        )

        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            return f"API Error {response.status_code}: {response.text}"

    except Exception as e:
        return f"Error getting feedback: {str(e)}"


# Simple functions for the interface
def submit_card(yaml_text, paper_url, repo_url):
    """Gradio handler: validate the YAML, then delegate to save_eval_card."""
    if not yaml_text.strip():
        return "Please provide YAML content"
    try:
        yaml.safe_load(yaml_text)  # Validate YAML
        result = save_eval_card(yaml_text, paper_url, repo_url)
        return result
    except yaml.YAMLError as e:
        return f"Invalid YAML: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"


def load_template_text():
    """Gradio handler: return the YAML starter template."""
    return get_template()


def get_feedback(yaml_text):
    """Gradio handler: return LLM feedback for the current editor text."""
    return get_llm_feedback(yaml_text)


def refresh_gallery_cards():
    """Gradio handler: rebuild the card-view HTML."""
    return load_gallery_cards()


def refresh_gallery_table():
    """Gradio handler: rebuild the table-view text."""
    return load_gallery_table()
# Enhanced CSS with dark mode support.  Gradio CSS variables are used with
# hard-coded fallbacks; three dark-mode mechanisms are covered (`.dark`
# class, `data-theme` attribute, `prefers-color-scheme` media query).
enhanced_css = """
/* Hide Gradio footer */
footer {visibility: hidden}

/* General styling */
body {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen,
        Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
}

/* Gallery container */
.gallery-container {
    max-height: 700px;
    overflow-y: auto;
    padding: 20px;
    background-color: var(--background-fill-primary, #f8f9fa);
}

/* Demo notice */
.demo-notice {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin-bottom: 25px;
    text-align: center;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
}
.demo-notice h3 { margin: 0 0 10px 0; font-size: 1.3em; }
.demo-notice p { margin: 0; opacity: 0.9; }

/* Eval card styling - Light mode default */
.eval-card {
    background: var(--background-fill-secondary, white);
    color: var(--body-text-color, #374151);
    border-radius: 12px;
    padding: 25px;
    margin-bottom: 25px;
    box-shadow: 0 4px 20px rgba(0,0,0,0.08);
    border: 1px solid var(--border-color-primary, #e0e0e0);
    transition: all 0.3s ease;
    position: relative;
    overflow: hidden;
}
.eval-card:hover {
    transform: translateY(-2px);
    box-shadow: 0 8px 30px rgba(0,0,0,0.12);
}
.eval-card::before {
    content: '';
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    height: 4px;
    background: linear-gradient(90deg, #4CAF50, #2196F3, #FF9800);
}

/* Card header */
.card-header {
    display: flex;
    justify-content: space-between;
    align-items: flex-start;
    margin-bottom: 20px;
    flex-wrap: wrap;
    gap: 15px;
}
.card-title {
    color: var(--color-accent, #1976d2);
    margin: 0;
    font-size: 1.4em;
    font-weight: 600;
    flex: 1;
    min-width: 250px;
}
.coverage-badge {
    font-weight: bold;
    padding: 8px 16px;
    border-radius: 20px;
    font-size: 1.1em;
    min-width: 80px;
    text-align: center;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}

/* Card content */
.card-content { margin-bottom: 25px; }
.card-summary {
    margin: 0 0 15px 0;
    line-height: 1.6;
    font-size: 1.02em;
    color: var(--body-text-color, #374151);
}
.card-meta {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
    gap: 10px;
    margin: 15px 0;
}
.card-meta p {
    margin: 0;
    padding: 8px 12px;
    background: var(--input-background-fill, #f5f5f5);
    color: var(--body-text-color, #374151);
    border-radius: 6px;
    font-size: 0.95em;
}

/* Coverage section */
.coverage-section {
    border-top: 1px solid var(--border-color-primary, #e0e0e0);
    padding-top: 20px;
}
.coverage-section h4 {
    color: var(--color-accent, #1976d2);
    margin: 0 0 15px 0;
    font-size: 1.1em;
    font-weight: 600;
}
.coverage-grid { display: grid; gap: 12px; }
.coverage-item {
    display: grid;
    grid-template-columns: 1fr 2fr auto;
    align-items: center;
    gap: 15px;
    padding: 8px 0;
}
.coverage-label {
    font-weight: 500;
    color: var(--body-text-color, #333);
    font-size: 0.9em;
}
.coverage-bar {
    background: var(--neutral-200, #e0e0e0);
    border-radius: 10px;
    height: 8px;
    overflow: hidden;
    position: relative;
}
.coverage-fill {
    height: 100%;
    border-radius: 10px;
    transition: width 0.6s ease;
}
.coverage-text {
    font-size: 0.85em;
    color: var(--body-text-color-subdued, #666);
    min-width: 100px;
    text-align: right;
}

/* Card links */
.card-links {
    margin-top: 20px;
    display: flex;
    gap: 12px;
    flex-wrap: wrap;
    border-top: 1px solid var(--border-color-primary, #e0e0e0);
    padding-top: 15px;
}
.link-button {
    display: inline-flex;
    align-items: center;
    padding: 8px 16px;
    background: linear-gradient(135deg, #1976d2, #1565c0);
    color: white !important;
    text-decoration: none;
    border-radius: 6px;
    font-size: 0.9em;
    font-weight: 500;
    transition: all 0.3s ease;
}
.link-button:hover {
    transform: translateY(-1px);
    box-shadow: 0 4px 12px rgba(25, 118, 210, 0.3);
    text-decoration: none;
    color: white !important;
}

/* Error styling */
.error-message {
    text-align: center;
    padding: 40px;
    color: #d32f2f;
    background: var(--error-background, #ffebee);
    border-radius: 8px;
    border: 1px solid var(--error-border, #ffcdd2);
}
.error-message h3 { margin: 0 0 15px 0; }

/* Dark mode specific overrides */
.dark .gallery-container {
    background-color: var(--background-fill-primary, #0b0f19);
}
.dark .eval-card {
    background: var(--background-fill-secondary, #1f2937);
    color: var(--body-text-color, #f3f4f6);
    border: 1px solid var(--border-color-primary, #374151);
    box-shadow: 0 4px 20px rgba(0,0,0,0.3);
}
.dark .eval-card:hover { box-shadow: 0 8px 30px rgba(0,0,0,0.4); }
.dark .card-title { color: var(--color-accent, #60a5fa); }
.dark .card-summary { color: var(--body-text-color, #f3f4f6); }
.dark .card-meta p {
    background: var(--input-background-fill, #374151);
    color: var(--body-text-color, #f3f4f6);
}
.dark .coverage-section { border-top: 1px solid var(--border-color-primary, #4b5563); }
.dark .coverage-section h4 { color: var(--color-accent, #60a5fa); }
.dark .coverage-label { color: var(--body-text-color, #f3f4f6); }
.dark .coverage-bar { background: var(--neutral-700, #4b5563); }
.dark .coverage-text { color: var(--body-text-color-subdued, #9ca3af); }
.dark .card-links { border-top: 1px solid var(--border-color-primary, #4b5563); }
.dark .error-message {
    background: var(--error-background, #7f1d1d);
    border: 1px solid var(--error-border, #991b1b);
    color: #fca5a5;
}

/* Alternative dark mode detection using data attributes or CSS variables */
[data-theme="dark"] .eval-card,
html[data-theme="dark"] .eval-card {
    background: #1f2937;
    color: #f3f4f6;
    border: 1px solid #374151;
}
[data-theme="dark"] .card-title,
html[data-theme="dark"] .card-title { color: #60a5fa; }
[data-theme="dark"] .card-meta p,
html[data-theme="dark"] .card-meta p { background: #374151; color: #f3f4f6; }
[data-theme="dark"] .coverage-section,
html[data-theme="dark"] .coverage-section { border-top: 1px solid #4b5563; }
[data-theme="dark"] .coverage-section h4,
html[data-theme="dark"] .coverage-section h4 { color: #60a5fa; }
[data-theme="dark"] .coverage-label,
html[data-theme="dark"] .coverage-label { color: #f3f4f6; }
[data-theme="dark"] .coverage-bar,
html[data-theme="dark"] .coverage-bar { background: #4b5563; }
[data-theme="dark"] .coverage-text,
html[data-theme="dark"] .coverage-text { color: #9ca3af; }
[data-theme="dark"] .card-links,
html[data-theme="dark"] .card-links { border-top: 1px solid #4b5563; }

/* Responsive design */
@media (max-width: 768px) {
    .gallery-container { padding: 15px; }
    .eval-card { padding: 20px; }
    .card-header { flex-direction: column; align-items: flex-start; }
    .coverage-badge { align-self: flex-start; }
    .coverage-item { grid-template-columns: 1fr; gap: 8px; }
    .coverage-text { text-align: left; }
    .card-meta { grid-template-columns: 1fr; }
}

/* Force dark theme detection using Gradio's CSS variables */
@media (prefers-color-scheme: dark) {
    .eval-card {
        background: var(--background-fill-secondary, #1f2937) !important;
        color: var(--body-text-color, #f3f4f6) !important;
        border: 1px solid var(--border-color-primary, #374151) !important;
    }
    .card-title { color: var(--color-accent, #60a5fa) !important; }
    .card-summary { color: var(--body-text-color, #f3f4f6) !important; }
    .card-meta p {
        background: var(--input-background-fill, #374151) !important;
        color: var(--body-text-color, #f3f4f6) !important;
    }
    .coverage-section { border-top: 1px solid var(--border-color-primary, #4b5563) !important; }
    .coverage-section h4 { color: var(--color-accent, #60a5fa) !important; }
    .coverage-label { color: var(--body-text-color, #f3f4f6) !important; }
    .coverage-bar { background: var(--neutral-700, #4b5563) !important; }
    .coverage-text { color: var(--body-text-color-subdued, #9ca3af) !important; }
    .card-links { border-top: 1px solid var(--border-color-primary, #4b5563) !important; }
}
"""
# Create the interface
with gr.Blocks(
    title="Evaluation Cards Gallery",
    theme=gr.themes.Soft(),
    css=enhanced_css,
) as demo:
    gr.Markdown(f"""
    # 📊 Evaluation Cards for Machine Learning in Materials Science

    Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the gallery.
    Data is persistently stored in the HF dataset: [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME})

    Checkout the [GitHub repository](https://github.com/lamalab-org/eval-cards) for more information.
    """)

    with gr.Tabs():
        with gr.TabItem("📝 Upload & Review"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### ✏️ Create Evaluation Card")
                    yaml_editor = gr.Textbox(
                        label="YAML Content",
                        lines=15,
                        placeholder="Paste your YAML content or click 'Load Template'...",
                    )
                    template_btn = gr.Button("📝 Load Template")
                    paper_url = gr.Textbox(
                        label="📄 Paper URL (Optional)",
                        placeholder="https://arxiv.org/abs/...",
                    )
                    repo_url = gr.Textbox(
                        label="💻 Repository URL (Optional)",
                        placeholder="https://github.com/...",
                    )
                    with gr.Row():
                        feedback_btn = gr.Button("🤖 Get LLM Feedback")
                        submit_btn = gr.Button(
                            "🚀 Submit Evaluation Card", variant="primary"
                        )

                with gr.Column():
                    gr.Markdown("### 💡 LLM Feedback")
                    feedback_box = gr.Textbox(
                        label="AI Feedback", lines=10, interactive=False
                    )
                    gr.Markdown("### 📤 Submission Result")
                    result_box = gr.Textbox(label="Result", lines=8, interactive=False)

        with gr.TabItem("🏛️ Gallery"):
            refresh_btn = gr.Button("🔄 Refresh Gallery")
            with gr.Tabs():
                with gr.TabItem("📋 Card View"):
                    gallery_cards = gr.HTML(value="Loading gallery...")
                with gr.TabItem("📊 Table View"):
                    gallery_table = gr.Textbox(
                        label="Evaluation Cards Table",
                        lines=25,
                        interactive=False,
                        value="Loading table...",
                    )

    # Event handlers
    template_btn.click(load_template_text, outputs=[yaml_editor])
    feedback_btn.click(get_feedback, inputs=[yaml_editor], outputs=[feedback_box])
    submit_btn.click(
        submit_card, inputs=[yaml_editor, paper_url, repo_url], outputs=[result_box]
    )
    # The refresh button updates both gallery views.
    refresh_btn.click(refresh_gallery_cards, outputs=[gallery_cards])
    refresh_btn.click(refresh_gallery_table, outputs=[gallery_table])

    # Load gallery on startup
    demo.load(refresh_gallery_cards, outputs=[gallery_cards])
    demo.load(refresh_gallery_table, outputs=[gallery_table])

if __name__ == "__main__":
    demo.launch()