Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| import os | |
| import pandas as pd | |
| from huggingface_hub import snapshot_download | |
| from .config import DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, PREDICTIONS_CSV_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN | |
def download_datasets():
    """Download the queue, results, and prediction datasets from HuggingFace.

    Each download is independent and best-effort: a failure is printed and the
    remaining downloads still run (preserves the original per-repo try/except
    behavior, where the same stanza was copy-pasted three times).
    """
    print("Downloading datasets from HuggingFace...")
    # (repo_id, local_dir, human-readable label used in the status messages)
    targets = [
        (QUEUE_REPO, EVAL_REQUESTS_PATH, "eval requests"),
        (RESULTS_REPO, EVAL_RESULTS_PATH, "eval results"),
        (DATA_REPO, PREDICTIONS_CSV_PATH, "prediction data"),
    ]
    for repo_id, local_dir, label in targets:
        try:
            print(f"Downloading {label} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type="dataset",
                tqdm_class=None,  # suppress progress bars (non-interactive env)
                etag_timeout=30,
                token=TOKEN,
            )
            # Capitalize only the first character so the message matches the
            # original wording exactly (e.g. "✓ Eval requests downloaded ...").
            print(f"✓ {label[0].upper()}{label[1:]} downloaded successfully")
        except Exception as e:
            # Best-effort: report and continue with the next repository.
            print(f"Error downloading {label}: {e}")
def process_data(csv_path=None):
    """Load the downloaded prediction data and build a summary of it.

    Args:
        csv_path: Optional explicit path to the data CSV. Defaults to
            ``data.csv`` inside ``PREDICTIONS_CSV_PATH`` (backward compatible
            with the original zero-argument call).

    Returns:
        Tuple ``(df, summary)`` where ``df`` is the loaded DataFrame with the
        two date columns parsed to datetimes, and ``summary`` is a dict of
        dataset statistics. Returns ``(None, None)`` if the CSV is missing.
    """
    print("Processing downloaded data...")
    if csv_path is None:
        csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
    if not os.path.exists(csv_path):
        print(f"Error: data.csv not found at {csv_path}")
        return None, None
    print(f"Loading data from {csv_path}")
    df = pd.read_csv(csv_path)
    # Parse date columns up front so downstream code can do datetime math.
    df["open_to_bet_until"] = pd.to_datetime(df["open_to_bet_until"])
    df["prediction_created_at"] = pd.to_datetime(df["prediction_created_at"])
    print(f"Loaded {len(df)} records")
    print(f"Data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    # Unique prediction-window dates (date part only), sorted ascending.
    prediction_dates = sorted(df["open_to_bet_until"].dt.date.unique())
    print(f"Prediction dates: {prediction_dates}")
    algorithms = df["algorithm_name"].unique()
    print(f"Algorithms: {algorithms}")
    event_types = df["event_type"].unique()
    print(f"Event types: {event_types}")
    summary = {
        "total_records": len(df),
        "unique_events": df["event_id"].nunique(),
        "unique_algorithms": len(algorithms),
        "unique_event_types": len(event_types),
        "prediction_dates": prediction_dates,
        "algorithms": algorithms.tolist(),
        "event_types": event_types.tolist(),
    }
    print("\n=== Data Summary ===")
    for key, value in summary.items():
        print(f"{key}: {value}")
    return df, summary
def generate_queue(df, queue_path=None):
    """Build the evaluation queue (one row per event) and save it to CSV.

    Args:
        df: Prediction DataFrame; must contain the columns ``event_id``,
            ``question``, ``event_type``, ``answer_options``, ``result``,
            and ``open_to_bet_until``.
        queue_path: Optional output CSV path. Defaults to
            ``evaluation_queue.csv`` inside ``PREDICTIONS_CSV_PATH``
            (backward compatible with the original single-argument call).

    Returns:
        DataFrame of unique events (pending and resolved combined).
    """
    print("Generating evaluation queue...")
    # Collapse per-prediction rows to one row per event, keeping the first
    # observed value of each descriptive column.
    unique_events = (
        df.groupby("event_id")
        .agg(
            {
                "question": "first",
                "event_type": "first",
                "answer_options": "first",
                "result": "first",
                "open_to_bet_until": "first",
            }
        )
        .reset_index()
    )
    # Events with no result yet are still pending evaluation.
    pending_events = unique_events[unique_events["result"].isna()]
    resolved_events = unique_events[unique_events["result"].notna()]
    print(f"Total unique events: {len(unique_events)}")
    print(f"Pending events: {len(pending_events)}")
    print(f"Resolved events: {len(resolved_events)}")
    if queue_path is None:
        queue_path = os.path.join(PREDICTIONS_CSV_PATH, "evaluation_queue.csv")
    unique_events.to_csv(queue_path, index=False)
    print(f"✓ Queue saved to {queue_path}")
    return unique_events
def main():
    """Entry point: download the datasets, process them, and build the queue."""
    print("=== FutureBench Data Download and Processing ===")
    download_datasets()
    frame, _summary = process_data()
    if frame is None:
        print("❌ Failed to process data. Exiting.")
        return
    # Build and persist the per-event evaluation queue.
    events = generate_queue(frame)
    print("\n=== Processing Complete ===")
    print("Data processed and queue generated successfully!")
    print(f"Queue contains {len(events)} events")


if __name__ == "__main__":
    main()