Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import duckdb | |
| from huggingface_hub import HfFileSystem | |
| from huggingface_hub.hf_file_system import safe_quote | |
| import pandas as pd | |
| import requests | |
| DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co" | |
| PARQUET_REVISION="refs/convert/parquet" | |
| TABLE_WILDCARD="{table}" | |
| fs = HfFileSystem() | |
| duckdb.register_filesystem(fs) | |
| def get_parquet_files(dataset, config, split): | |
| response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={config}", timeout=60) | |
| if response.status_code != 200: | |
| raise Exception(response) | |
| response = response.json() | |
| parquet_files = response["parquet_files"] | |
| file_names = [content["filename"] for content in parquet_files if content["split"] == split] | |
| if len(file_names) == 0: | |
| raise Exception("No parquet files found for dataset") | |
| return file_names | |
| def run_command(dataset, config, split, sql): | |
| try: | |
| if TABLE_WILDCARD not in sql: | |
| raise Exception(f"Query must contains {TABLE_WILDCARD} wildcard.") | |
| parquet_files = get_parquet_files(dataset, config, split) | |
| print(f"File names found: {','.join(parquet_files)}") | |
| parquet_first_file = parquet_files[0] # TODO: Send pattern to duck db to read all split parquets | |
| print(f"Trying with the first one {parquet_first_file}") | |
| location=f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{config}/{parquet_first_file}" | |
| print(location) | |
| sql = sql.replace(TABLE_WILDCARD, f"'{location}'") | |
| result = duckdb.query(sql).to_df() | |
| print("Ok") | |
| except Exception as error: | |
| print(f"Error: {str(error)}") | |
| return pd.DataFrame({"Error": [f"❌ {str(error)}"]}) | |
| return result | |
| with gr.Blocks() as demo: | |
| gr.Markdown(" ## SQL Query using DuckDB for datasets server parquet files") | |
| dataset = gr.Textbox(label="dataset", placeholder="mstz/iris", value="mstz/iris") | |
| config = gr.Textbox(label="config", placeholder="iris", value="iris") | |
| split = gr.Textbox(label="split", placeholder="train", value="train") | |
| sql = gr.Textbox( | |
| label="Query in SQL format - It should have {table} wildcard", | |
| placeholder=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3", | |
| value=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3", | |
| lines=3, | |
| ) | |
| run_button = gr.Button("Run") | |
| gr.Markdown("### Result") | |
| cached_responses_table = gr.DataFrame() | |
| run_button.click(run_command, inputs=[dataset, config, split, sql], outputs=cached_responses_table) | |
| if __name__ == "__main__": | |
| demo.launch() | |