import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import gradio as gr
# Model identifier and optional access token are read from the environment so
# the same script can serve different checkpoints without code changes.
MODEL_NAME = os.getenv('MODEL_ID')
TOKEN = os.getenv('TOKEN')

# os.getenv returns None for an unset variable; fail early with a clear
# message instead of passing None on to from_pretrained.
if not MODEL_NAME:
    raise RuntimeError(
        "The MODEL_ID environment variable must be set to the model to load."
    )

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, token=TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    token=TOKEN
)
print("Model loaded.")
def _history_to_messages(history):
    """Convert chat history into a list of ``{"role", "content"}`` dicts.

    Two history shapes are accepted:

    * pair style: each turn is ``(user_message, bot_message)``; the bot
      message is skipped when it is empty or ``None``;
    * message style: each turn is already a dict with ``"role"`` and
      ``"content"`` keys. Only turns whose content is a non-empty string are
      kept, since the prompt is built from plain text.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if role and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
            continue
        user_msg, bot_msg = turn
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    return messages


def playground(
    message,
    history,
    system_prompt,
    max_new_tokens,
    temperature,
    repetition_penalty,
    top_k,
    top_p
):
    """Stream a model reply for ``message`` given the prior chat ``history``.

    This is a generator: each yielded value is the full reply text produced
    so far, so the caller can replace the displayed answer on every step.

    Args:
        message: The new user message. A non-string or blank message yields
            a single empty string and stops.
        history: Previous turns, either ``(user, bot)`` pairs or
            role/content dicts.
        system_prompt: Optional system instructions; ignored when blank.
        max_new_tokens: Upper bound on generated tokens (cast to ``int``).
        temperature: Sampling temperature; sampling is enabled only when it
            is greater than zero.
        repetition_penalty: Repetition penalty passed to ``generate``.
        top_k: Top-k cutoff; a value of 0 or less passes ``None`` instead.
        top_p: Nucleus sampling threshold.

    Raises:
        RuntimeError: If generation fails in the worker thread; the original
            exception is attached as the cause.
    """
    if not isinstance(message, str) or not message.strip():
        yield ""
        return

    # Build the conversation: optional system prompt, earlier turns, then
    # the new user message.
    conversation = []
    if system_prompt and system_prompt.strip():
        conversation.append({"role": "system", "content": system_prompt.strip()})
    conversation.extend(_history_to_messages(history))
    conversation.append({"role": "user", "content": message})

    if hasattr(tokenizer, "apply_chat_template"):
        prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    else:
        # Plain "role: content" transcript for tokenizers without chat
        # template support.
        prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in conversation]) + "\nassistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_k=int(top_k) if top_k > 0 else None,
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=temperature > 0,
        pad_token_id=tokenizer.eos_token_id
    )

    failures = []

    def _generate():
        """Run generation; on failure record the error and end the stream."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # worker-thread boundary: hand error to consumer
            failures.append(exc)
            # The streamer was created without a timeout, so the consuming
            # loop below would otherwise keep waiting for an end signal
            # that a failed generate() call never sends.
            streamer.end()

    # Generation runs in a background thread so tokens can be consumed from
    # the streamer as they are produced.
    thread = Thread(target=_generate)
    thread.start()

    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text
    thread.join()

    if failures:
        raise RuntimeError("Text generation failed") from failures[0]
# UI layout: a sidebar holding the system prompt and sampling controls, and a
# chat interface that forwards those controls to `playground`.
with gr.Blocks(fill_height=True, fill_width=True) as app:
    with gr.Sidebar():
        gr.Markdown("## Playground by UltimaX Intelligence")
        gr.HTML(
            "\n"
            "Runs\n"
            "beyoru/Qwen3-0.9B-A0.6B via Hugging Face Transformers.\n"
            "Support me at:.\n"
        )
        gr.Markdown("---")

        gr.Markdown("## System Prompt")
        system_prompt = gr.Textbox(
            value="You are a helpful AI assistant.",
            label="System Prompt",
            info="AI role custome",
            placeholder="Enter custom system instructions here (optional)...",
            lines=4,
        )
        gr.Markdown("---")

        gr.Markdown("## Generation Parameters")
        max_new_tokens = gr.Slider(
            minimum=32, maximum=4096, value=2048, step=32, label="Max New Tokens"
        )
        temperature = gr.Slider(
            minimum=0.1, maximum=2.0, value=0.6, step=0.1, label="Temperature"
        )
        repetition_penalty = gr.Slider(
            minimum=0.1, maximum=2.0, value=1.0, step=0.1, label="Repetition Penalty"
        )
        top_k = gr.Slider(
            minimum=0, maximum=100, value=20, step=1, label="Top K (0 = off)"
        )
        top_p = gr.Slider(
            minimum=0.0, maximum=1.0, value=0.95, step=0.05, label="Top P"
        )

    # Chat area, placed in the Blocks container outside the sidebar.
    chat_window = gr.Chatbot(
        label="Qwen3-0.9B-A0.6B",
        show_copy_button=True,
        allow_tags=["think"],
    )
    # Order must match the extra parameters of `playground` after
    # (message, history).
    extra_controls = [
        system_prompt,
        max_new_tokens,
        temperature,
        repetition_penalty,
        top_k,
        top_p,
    ]
    starter_prompts = [
        ["Hello who are you?"],
        ["How to solve 2x+1=3."],
        ["Example python code for async"],
    ]
    gr.ChatInterface(
        fn=playground,
        additional_inputs=extra_controls,
        chatbot=chat_window,
        examples=starter_prompts,
        cache_examples=False,
        show_api=False,
    )

# Start the web server on 0.0.0.0 with the pwa option enabled.
app.launch(server_name="0.0.0.0", pwa=True)