Spaces:

Fancy-MLLM
/

R1-Onevision

Runtime error

App Files Files Community

Fancy-MLLM committed on Feb 13, 2025

Commit

5762ea1

verified ·

1 Parent(s): 6bcd1bb

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -43

app.py CHANGED Viewed

@@ -222,20 +222,19 @@
 import os
 from datetime import datetime
-import subprocess
 import time
 # Third-party imports
 import numpy as np
 import torch
 from PIL import Image
-import accelerate
 import gradio as gr
 import spaces
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
-    AutoTokenizer,
-    AutoProcessor
 )
 # Local imports
@@ -251,7 +250,6 @@ else:
 print(f"[INFO] Using device: {device}")
 def array_to_image_path(image_array):
     if image_array is None:
         raise ValueError("No image provided. Please upload an image before submitting.")
@@ -269,7 +267,7 @@ def array_to_image_path(image_array):
     full_path = os.path.abspath(filename)
     return full_path
 models = {
     "Fancy-MLLM/R1-OneVision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained("Fancy-MLLM/R1-OneVision-7B",
                                                                                       trust_remote_code=True,
@@ -291,55 +289,70 @@ assistant_prompt = '<|assistant|>\n'
 prompt_suffix = "<|end|>\n"
 @spaces.GPU
-def run_example(image, text_input=None, model_id=None):
-    start_time = time.time()
-    image_path = array_to_image_path(image)
-    print(image_path)
-    model = models[model_id]
-    processor = processors[model_id]
-    image = Image.fromarray(image).convert("RGB")
     messages = [
-    {
             "role": "user",
             "content": [
-                {
-                    "type": "image",
-                    "image": image_path,
-                },
-                {"type": "text", "text": text_input},
             ],
         }
     ]
-    # Preparation for inference
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
     image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
-    )
-    inputs = inputs.to(device)
-    # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, max_new_tokens=2048)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
-    end_time = time.time()
-    total_time = round(end_time - start_time, 2)
-    return output_text[0], total_time
 css = """
   #output {
@@ -354,18 +367,18 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab(label="R1-OneVision-7B Input"):
         with gr.Row():
             with gr.Column():
-                input_img = gr.Image(label="Input Picture")
                 model_selector = gr.Dropdown(choices=list(models.keys()),
                                              label="Model",
                                              value="Fancy-MLLM/R1-OneVision-7B")
                 text_input = gr.Textbox(label="Text Prompt")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-                time_taken = gr.Textbox(label="Time taken for processing + inference")
-        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text, time_taken])
 demo.queue(api_open=False)
 demo.launch(debug=True)

 import os
 from datetime import datetime
 import time
+from threading import Thread
 # Third-party imports
 import numpy as np
 import torch
 from PIL import Image
 import gradio as gr
 import spaces
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
+    AutoProcessor,
+    TextIteratorStreamer
 )
 # Local imports
 print(f"[INFO] Using device: {device}")
 def array_to_image_path(image_array):
     if image_array is None:
         raise ValueError("No image provided. Please upload an image before submitting.")
     full_path = os.path.abspath(filename)
     return full_path
 models = {
     "Fancy-MLLM/R1-OneVision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained("Fancy-MLLM/R1-OneVision-7B",
                                                                                       trust_remote_code=True,
 prompt_suffix = "<|end|>\n"
 @spaces.GPU
def model_inference(input_dict, history, model_id=None):
    """Stream the model's answer to a text prompt with optional image(s).

    Two calling conventions are accepted so that the function works both
    with a multimodal-dict input and with the ``submit_btn.click(...)``
    wiring in this file, which passes ``[input_img, text_input,
    model_selector]`` positionally:

    * ``model_inference({"text": ..., "files": [...]}, history)``
    * ``model_inference(image, text, model_id)``

    Args:
        input_dict: Either a dict with ``"text"`` and ``"files"`` keys, or a
            single image (``None`` when no image was supplied).
        history: Unused in the dict form. In the positional form this slot
            carries the text prompt.
        model_id: Key into ``models`` / ``processors``. Defaults to
            ``"Fancy-MLLM/R1-OneVision-7B"`` when omitted or empty.

    Yields:
        str: An error message for an empty prompt; otherwise
        ``"Thinking..."`` followed by the accumulated generated text after
        every streamed chunk.
    """
    # Normalise the two calling conventions into (text, files).
    if isinstance(input_dict, dict):
        text = input_dict.get("text") or ""
        files = input_dict.get("files") or []
    else:
        text = history if isinstance(history, str) else ""
        files = [] if input_dict is None else [input_dict]

    # Validate before touching the disk or the model: an empty prompt is
    # rejected whether or not images were supplied.
    if not text:
        if files:
            yield "Error: Please input a text query along with the image(s)."
        else:
            yield "Error: Please input a query and optionally image(s)."
        return

    model_id = model_id or "Fancy-MLLM/R1-OneVision-7B"
    model = models[model_id]
    processor = processors[model_id]

    # Convert images only once the request is known to be valid.
    images = [array_to_image_path(image) for image in files]

    # Prepare messages for the model
    messages = [
        {
            "role": "user",
            "content": [
                *[{"type": "image", "image": image} for image in images],
                {"type": "text", "text": text},
            ],
        }
    ]

    # Apply chat template and process inputs
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    # Set up streamer for real-time output
    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )

    # NOTE(review): do_sample is not set here; confirm whether the sampling
    # knobs below (top_p / top_k / temperature) actually take effect.
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=2048,
        top_p=0.001,
        top_k=1,
        temperature=0.01,
        repetition_penalty=1.0,
    )

    # generate() blocks until completion, so it runs in a worker thread
    # while this generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the output
    buffer = ""
    yield "Thinking..."
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer

    # The streamer is exhausted once generation ends; reap the worker.
    thread.join()
 css = """
   #output {
     with gr.Tab(label="R1-OneVision-7B Input"):
         with gr.Row():
             with gr.Column():
+                input_img = gr.Image(label="Input Picture", type="numpy", elem_id="image_input")
                 model_selector = gr.Dropdown(choices=list(models.keys()),
                                              label="Model",
                                              value="Fancy-MLLM/R1-OneVision-7B")
                 text_input = gr.Textbox(label="Text Prompt")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
+                output_text = gr.Textbox(label="Output Text", elem_id="output_text", lines=10)
+        submit_btn.click(model_inference, [input_img, text_input, model_selector], [output_text])
 demo.queue(api_open=False)
 demo.launch(debug=True)