"""Gradio chat playground that streams completions from a Hugging Face causal LM.

Model id and (optional) auth token come from the MODEL_ID / TOKEN environment
variables. Generation runs in a background thread and tokens are streamed to
the UI via TextIteratorStreamer.
"""

import os
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_NAME = os.getenv("MODEL_ID")
TOKEN = os.getenv("TOKEN")

# Fail fast with a clear message instead of an opaque error deep inside
# from_pretrained(None, ...).
if not MODEL_NAME:
    raise RuntimeError("The MODEL_ID environment variable must be set to a model id.")

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, trust_remote_code=True, token=TOKEN
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    token=TOKEN,
)
print("Model loaded.")


def playground(
    message,
    history,
    system_prompt,
    max_new_tokens,
    temperature,
    repetition_penalty,
    top_k,
    top_p,
):
    """Stream a chat completion for *message* given the prior *history*.

    Args:
        message: The user's latest input text.
        history: Prior turns from gr.ChatInterface. Supports both the legacy
            ``(user, assistant)`` tuple format and the current "messages"
            format (list of ``{"role": ..., "content": ...}`` dicts).
        system_prompt: Optional system instructions prepended to the chat.
        max_new_tokens, temperature, repetition_penalty, top_k, top_p:
            Sampling parameters forwarded to ``model.generate``.

    Yields:
        The accumulated generated text after each new token (Gradio consumes
        this as a stream).
    """
    if not isinstance(message, str) or not message.strip():
        yield ""
        return

    # Build the conversation, starting with the system prompt if provided.
    conversation = []
    if system_prompt and system_prompt.strip():
        conversation.append({"role": "system", "content": system_prompt.strip()})

    # Append chat history; accept both Gradio history formats.
    for turn in history:
        if isinstance(turn, dict):
            # "messages" format: already {"role": ..., "content": ...}
            conversation.append({"role": turn["role"], "content": turn["content"]})
        else:
            # Legacy tuple format: (user_msg, bot_msg); bot_msg may be None
            # for the in-flight turn.
            user_msg, bot_msg = turn
            conversation.append({"role": "user", "content": user_msg})
            if bot_msg:
                conversation.append({"role": "assistant", "content": bot_msg})

    conversation.append({"role": "user", "content": message})

    # Prefer the model's own chat template; fall back to a plain transcript.
    if hasattr(tokenizer, "apply_chat_template"):
        prompt = tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
    else:
        prompt = (
            "\n".join(f"{msg['role']}: {msg['content']}" for msg in conversation)
            + "\nassistant:"
        )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        # top_k=0 means "disabled" in the UI; pass None so generate() skips it.
        top_k=int(top_k) if top_k > 0 else None,
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=temperature > 0,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Generate in a worker thread so we can consume the streamer here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    generated_text = ""
    try:
        for new_text in streamer:
            generated_text += new_text
            yield generated_text
    finally:
        # Always reap the worker, even if the consumer stops iterating early.
        thread.join()


with gr.Blocks(fill_height=True, fill_width=True) as app:
    with gr.Sidebar():
        gr.Markdown("## Playground by UltimaX Intelligence")
        gr.HTML(
            """
            <p>Runs beyoru/Qwen3-0.9B-A0.6B via Hugging Face Transformers.</p>
            <p>Support me at:</p>
            <p>Buy Me A Coffee</p>
            """
        )
        gr.Markdown("---")
        gr.Markdown("## System Prompt")
        system_prompt = gr.Textbox(
            label="System Prompt",
            placeholder="Enter custom system instructions here (optional)...",
            lines=4,
            value="You are a helpful AI assistant.",
            info="Custom AI role",
        )
        gr.Markdown("---")
        gr.Markdown("## Generation Parameters")
        max_new_tokens = gr.Slider(32, 4096, value=2048, step=32, label="Max New Tokens")
        temperature = gr.Slider(0.1, 2.0, value=0.6, step=0.1, label="Temperature")
        repetition_penalty = gr.Slider(
            0.1, 2.0, value=1.0, step=0.1, label="Repetition Penalty"
        )
        top_k = gr.Slider(0, 100, value=20, step=1, label="Top K (0 = off)")
        top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.05, label="Top P")

    gr.ChatInterface(
        fn=playground,
        additional_inputs=[
            system_prompt,
            max_new_tokens,
            temperature,
            repetition_penalty,
            top_k,
            top_p,
        ],
        chatbot=gr.Chatbot(
            label="Qwen3-0.9B-A0.6B",
            show_copy_button=True,
            allow_tags=["think"],
        ),
        examples=[
            ["Hello who are you?"],
            ["How to solve 2x+1=3."],
            ["Example python code for async"],
        ],
        cache_examples=False,
        show_api=False,
    )

app.launch(server_name="0.0.0.0", pwa=True)