Update app.py
app.py CHANGED
@@ -1,62 +1,86 @@
+"""Gradio chat demo that streams responses from a (local) OpenAI-compatible
+endpoint using the official `openai` Python SDK. The server is assumed to be
+running at http://0.0.0.0:8000 with the v1 REST routes. A custom header
+`X-MIXINPUTS-BETA` is forwarded so MoI can adjust its blending strength at
+runtime.
 
+Launch with:
+    python app_openai.py
+"""
 
+from __future__ import annotations
+
+import os
+import openai
+import gradio as gr
+
+# ──────────────────────────────────────────────────────────────────────────────
+# OpenAI client configuration
+# ──────────────────────────────────────────────────────────────────────────────
+# ``openai`` still expects an API key even if the backend ignores it, so we use
+# a dummy value when none is provided. The *base_url* points to the local
+# vLLM server that speaks the OpenAI REST dialect.
+# -----------------------------------------------------------------------------
+openai.api_key = os.getenv("OPENAI_API_KEY", "EMPTY")
+openai.base_url = "http://0.0.0.0:8000/v1"
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Chat handler
+# ──────────────────────────────────────────────────────────────────────────────
+
+def stream_completion(message: str,
+                      history: list[tuple[str, str]],
+                      max_tokens: int,
+                      temperature: float,
+                      top_p: float,
+                      beta: float):
+    """Gradio callback that yields streaming assistant replies.
+
+    The function reconstructs the conversation *excluding* any system prompt
+    and then calls ``openai.chat.completions.create`` with ``stream=True``.
+    Each incoming delta is appended to an ``assistant`` buffer which is sent
+    back to the Chatbot component for real-time display.
+    """
+
+    # Build OpenAI-style message list from prior turns
+    messages: list[dict[str, str]] = []
     for user_msg, assistant_msg in history:
-        if user_msg:
+        if user_msg:
             messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
+        if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
 
+    # Current user input comes last
     messages.append({"role": "user", "content": message})
 
-    payload = {
-        "model": "Qwen/Qwen3-4B",
-        "messages": messages,
-        "temperature": temperature,
-        "top_p": top_p,
-        "max_tokens": int(max_tokens),
-        "stream": True,
-    }
-    headers = {
-        "Content-Type": "application/json",
-        "X-MIXINPUTS-BETA": str(beta),
-    }
-
     try:
-            delta = json.loads(data)["choices"][0]["delta"].get("content", "")
-            assistant += delta
-            yield history + [(message, assistant)]  # live update in Gradio
-    except Exception as err:
+        # Kick off streaming completion
+        response = openai.chat.completions.create(
+            model="Qwen/Qwen3-4B",
+            messages=messages,
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            stream=True,
+            # Forward MoI blending coefficient to the backend
+            extra_headers={"X-MIXINPUTS-BETA": str(beta)},
+        )
+
+        assistant = ""
+        for chunk in response:
+            # ``delta.content`` is None for e.g. role announcements; guard with or ""
+            delta = chunk.choices[0].delta.content or ""
+            assistant += delta
+            yield history + [(message, assistant)]  # live update
+
+    except Exception as err:  # pylint: disable=broad-except
         yield history + [(message, f"[ERROR] {err}")]
 
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Gradio UI
+# ──────────────────────────────────────────────────────────────────────────────
+
 with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
     gr.Markdown(
         "## 🎨 Mixture of Inputs (MoI) Demo \n"
@@ -65,10 +89,10 @@ with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
     )
 
     with gr.Row():  # sliders first
-        beta
-        temperature
-        top_p
-        max_tokens
+        beta        = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
+        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
+        top_p       = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top-p")
+        max_tokens  = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")
 
     chatbot = gr.Chatbot(height=450)
     user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
@@ -79,7 +103,9 @@ with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
         inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
         outputs=chatbot,
     )
+
    clear_btn.click(lambda: None, None, chatbot, queue=False)
 
+# ──────────────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     demo.launch()
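
Note on the removed side: several deleted lines did not survive extraction of this page and are omitted from the hunks above — the old module docstring and imports, the raw HTTP request/streaming scaffolding inside the ``try:`` block, and the right-hand sides of the old slider definitions. Only the ``payload``/``headers`` dicts and the final ``json.loads(data)`` delta handling are visible. For reference, a plausible sketch of the kind of ``requests``-based SSE loop those fragments imply is given below; the function name ``stream_raw``, the URL, the timeout, and the loop details are illustrative assumptions, not the original code:

    # Hypothetical reconstruction (NOT the original code): a requests-based
    # SSE loop consistent with the surviving fragments of the removed lines.
    import json
    import requests

    def stream_raw(payload: dict, headers: dict):
        """Yield the growing assistant reply from an OpenAI-style SSE stream."""
        assistant = ""
        with requests.post(
            "http://0.0.0.0:8000/v1/chat/completions",  # same server as above
            json=payload, headers=headers, stream=True, timeout=600,
        ) as resp:
            resp.raise_for_status()
            for line in resp.iter_lines(decode_unicode=True):
                # SSE frames look like "data: {...}"; skip blanks/keep-alives
                if not line or not line.startswith("data: "):
                    continue
                data = line[len("data: "):]
                if data == "[DONE]":  # OpenAI-style end-of-stream sentinel
                    break
                delta = json.loads(data)["choices"][0]["delta"].get("content", "")
                assistant += delta
                yield assistant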
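
To verify the endpoint and the MoI header outside Gradio, the snippet below is a minimal smoke test using the client-object form of the same SDK. It assumes the server from the diff is running locally with the same model; the header only changes behaviour if the backend actually reads ``X-MIXINPUTS-BETA``:

    # Minimal smoke test for the assumed local endpoint.
    from openai import OpenAI

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="EMPTY")

    stream = client.chat.completions.create(
        model="Qwen/Qwen3-4B",
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        stream=True,
        # The same per-request header the Gradio app forwards
        extra_headers={"X-MIXINPUTS-BETA": "1.0"},
    )
    for chunk in stream:
        if chunk.choices:  # some servers emit chunks with empty choices
            print(chunk.choices[0].delta.content or "", end="", flush=True)
    print()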