| from itertools import islice | |
| import gradio as gr | |
| from datatrove.executor.local import LocalPipelineExecutor | |
| from datatrove.pipeline.base import PipelineStep | |
| from datatrove.pipeline.extractors import Trafilatura | |
| from datatrove.pipeline.filters import ( | |
| C4QualityFilter, | |
| FineWebQualityFilter, | |
| GopherQualityFilter, | |
| GopherRepetitionFilter, | |
| LanguageFilter, | |
| URLFilter, | |
| ) | |
| from datatrove.pipeline.readers import WarcReader | |
| from datatrove.pipeline.writers.jsonl import JsonlWriter | |
def run(input):
    """Return the placeholder string ``"wip"``.

    The argument is accepted but not used; the same constant is returned
    for every call.
    """
    placeholder = "wip"
    return placeholder
# Wrap `run` in an interface with one Textbox input and one Textbox output,
# then launch it as soon as this module is executed.
demo = gr.Interface(
    fn=run,
    inputs=[gr.Textbox()],
    outputs=[gr.Textbox()],
)
demo.launch()