Spaces:

nguyentantoan
/

vintern-video-recognition

Sleeping

App Files Community

nguyentantoan committed on May 23

Commit

a7f24f5

verified ·

1 Parent(s): 5f21563

Update app.py

Browse files

Files changed (1) hide show

app.py +184 -49

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import torch
 from transformers import AutoModel, AutoTokenizer
@@ -7,6 +8,7 @@ from PIL import Image
 import base64
 import io
 import time
 # Setup
 device = "cpu"  # HF Spaces miễn phí chỉ có CPU
@@ -14,18 +16,33 @@ model = None
 tokenizer = None
 transform = None
 def load_model():
     global model, tokenizer, transform
     try:
         print("🤖 Loading Vintern-1B-v3.5...")
         model_name = "5CD-AI/Vintern-1B-v3_5"
         tokenizer = AutoTokenizer.from_pretrained(
             model_name,
             trust_remote_code=True
         )
         model = AutoModel.from_pretrained(
             model_name,
             torch_dtype=torch.float32,
@@ -33,60 +50,100 @@ def load_model():
             low_cpu_mem_usage=True
         )
-        # Image transform
-        IMAGENET_MEAN = (0.485, 0.456, 0.406)
-        IMAGENET_STD = (0.229, 0.224, 0.225)
-        transform = T.Compose([
-            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
-            T.Resize((448, 448), interpolation=InterpolationMode.BICUBIC),
-            T.ToTensor(),
-            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
-        ])
         print("✅ Model loaded successfully!")
         return True
     except Exception as e:
         print(f"❌ Error loading model: {e}")
         return False
 def analyze_image(image):
     if model is None:
         return "❌ Model chưa được tải. Vui lòng chờ..."
     try:
         start_time = time.time()
-        # Preprocess image
-        if isinstance(image, str):
-            # Base64 image
-            if image.startswith('data:image'):
-                image = image.split(',')[1]
-            image_bytes = base64.b64decode(image)
-            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
-        image_tensor = transform(image).unsqueeze(0).to(device)
         with torch.no_grad():
             query = "Mô tả chi tiết những gì bạn thấy trong hình ảnh này:"
-            description = model.chat(
-                tokenizer,
-                image_tensor,
-                query,
-                generation_config=dict(
-                    max_new_tokens=200,
                     do_sample=True,
-                    temperature=0.7,
-                    top_p=0.9,
-                    repetition_penalty=1.1
                 )
-            )
             # Get objects
             try:
-                object_query = "Liệt kê các đối tượng chính:"
                 objects_text = model.chat(
                     tokenizer,
                     image_tensor,
@@ -100,8 +157,8 @@ def analyze_image(image):
             processing_time = time.time() - start_time
-            return f"""
-**📝 Mô tả từ Vintern AI:**
 {description}
 **🔍 Đối tượng nhận diện:**
@@ -109,46 +166,124 @@ def analyze_image(image):
 **⚡ Thời gian xử lý:** {processing_time:.2f}s
 **🤖 Model:** Vintern-1B-v3.5 (Hugging Face Spaces)
 """
     except Exception as e:
-        return f"❌ Lỗi phân tích: {str(e)}"
-# Load model khi khởi động
 print("🚀 Initializing Vintern-1B-v3.5...")
 model_loaded = load_model()
-# Gradio interface
-with gr.Blocks(title="Vintern-1B-v3.5 Video Recognition") as demo:
-    gr.Markdown("# 🎥 Vintern-1B-v3.5 - Nhận Diện Ảnh Tiếng Việt")
-    gr.Markdown("Upload ảnh để nhận diện nội dung bằng AI Vintern-1B-v3.5")
     if not model_loaded:
-        gr.Markdown("⚠️ **Model đang được tải...** Vui lòng chờ vài phút.")
     with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(type="pil", label="📤 Upload Ảnh")
-            analyze_btn = gr.Button("🔍 Phân Tích", variant="primary")
-        with gr.Column():
-            result_output = gr.Textbox(label="📋 Kết Quả", lines=10, max_lines=15)
     analyze_btn.click(
         fn=analyze_image,
         inputs=image_input,
         outputs=result_output
     )
     gr.Markdown("""
     ---
-    **💡 Hướng dẫn:**
-    1. Upload ảnh từ máy tính hoặc webcam
-    2. Nhấn "Phân Tích" để nhận diện
-    3. Xem kết quả mô tả tiếng Việt
-    **🔗 API Endpoint:** Sử dụng URL của Space này trong trangchu.html
     """)
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

+# app.py - Fixed version for HF Spaces
 import gradio as gr
 import torch
 from transformers import AutoModel, AutoTokenizer
 import base64
 import io
 import time
+import traceback
 # Setup
 device = "cpu"  # HF Spaces miễn phí chỉ có CPU
 tokenizer = None
 transform = None
+def build_transform(input_size=448):
+    """Build image transform pipeline"""
+    IMAGENET_MEAN = (0.485, 0.456, 0.406)
+    IMAGENET_STD = (0.229, 0.224, 0.225)
+    return T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if hasattr(img, 'mode') and img.mode != 'RGB' else img),
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+    ])
 def load_model():
+    """Load Vintern model"""
     global model, tokenizer, transform
     try:
         print("🤖 Loading Vintern-1B-v3.5...")
         model_name = "5CD-AI/Vintern-1B-v3_5"
+        # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(
             model_name,
             trust_remote_code=True
         )
+        # Load model
         model = AutoModel.from_pretrained(
             model_name,
             torch_dtype=torch.float32,
             low_cpu_mem_usage=True
         )
+        # Build transform
+        transform = build_transform()
         print("✅ Model loaded successfully!")
         return True
     except Exception as e:
         print(f"❌ Error loading model: {e}")
+        traceback.print_exc()
         return False
+def safe_image_processing(image):
+    """Safely process image input"""
+    try:
+        # Handle different input types
+        if image is None:
+            return None, "❌ Không có ảnh đầu vào"
+        # If it's a file path (string)
+        if isinstance(image, str):
+            if image.startswith('data:image'):
+                # Base64 image
+                image_data = image.split(',')[1]
+                image_bytes = base64.b64decode(image_data)
+                image = Image.open(io.BytesIO(image_bytes))
+            else:
+                # File path
+                image = Image.open(image)
+        # Ensure it's a PIL Image
+        if not hasattr(image, 'mode'):
+            return None, "❌ Định dạng ảnh không hợp lệ"
+        # Convert to RGB if needed
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+        return image, None
+    except Exception as e:
+        return None, f"❌ Lỗi xử lý ảnh: {str(e)}"
 def analyze_image(image):
+    """Analyze image with Vintern model"""
     if model is None:
         return "❌ Model chưa được tải. Vui lòng chờ..."
     try:
         start_time = time.time()
+        # Safe image processing
+        processed_image, error = safe_image_processing(image)
+        if error:
+            return error
+        if processed_image is None:
+            return "❌ Không thể xử lý ảnh đầu vào"
+        # Transform image
+        image_tensor = transform(processed_image).unsqueeze(0).to(device)
         with torch.no_grad():
+            # Main description
             query = "Mô tả chi tiết những gì bạn thấy trong hình ảnh này:"
+            try:
+                description = model.chat(
+                    tokenizer,
+                    image_tensor,
+                    query,
+                    generation_config=dict(
+                        max_new_tokens=200,
+                        do_sample=True,
+                        temperature=0.7,
+                        top_p=0.9,
+                        repetition_penalty=1.1
+                    )
+                )
+            except Exception as chat_error:
+                print(f"Chat method failed: {chat_error}")
+                # Fallback to simple generation
+                inputs = tokenizer(query, return_tensors="pt").to(device)
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=150,
                     do_sample=True,
+                    temperature=0.7
                 )
+                description = tokenizer.decode(outputs[0], skip_special_tokens=True)
+                description = description.replace(query, "").strip()
             # Get objects
             try:
+                object_query = "Liệt kê các đối tượng chính trong ảnh:"
                 objects_text = model.chat(
                     tokenizer,
                     image_tensor,
             processing_time = time.time() - start_time
+            # Format output
+            return f"""**📝 Mô tả từ Vintern AI:**
 {description}
 **🔍 Đối tượng nhận diện:**
 **⚡ Thời gian xử lý:** {processing_time:.2f}s
 **🤖 Model:** Vintern-1B-v3.5 (Hugging Face Spaces)
+**💻 Device:** {device.upper()}
+---
+*Để sử dụng cho video real-time, hãy sử dụng API endpoint của Space này với trangchu.html*
 """
     except Exception as e:
+        error_msg = f"❌ Lỗi phân tích: {str(e)}"
+        print(error_msg)
+        traceback.print_exc()
+        return error_msg
+def analyze_for_api(image_file):
+    """API-friendly analysis function"""
+    try:
+        result = analyze_image(image_file)
+        # Return simple text for API consumption
+        return result
+    except Exception as e:
+        return f"Error: {str(e)}"
+# Load model when starting
 print("🚀 Initializing Vintern-1B-v3.5...")
 model_loaded = load_model()
+# Create Gradio interface
+with gr.Blocks(
+    title="Vintern-1B-v3.5 Video Recognition",
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {
+        max-width: 1200px !important;
+    }
+    .upload-area {
+        min-height: 300px;
+    }
+    """
+) as demo:
+    gr.Markdown("""
+    # 🎥 Vintern-1B-v3.5 - Nhận Diện Ảnh Tiếng Việt
+    **Powered by Hugging Face Spaces** | Model chạy hoàn toàn trên cloud
+    """)
     if not model_loaded:
+        gr.Markdown("⚠️ **Model đang được tải...** Vui lòng chờ vài phút và refresh trang.")
+    else:
+        gr.Markdown("✅ **Model đã sẵn sàng!** Upload ảnh để bắt đầu nhận diện.")
     with gr.Row():
+        with gr.Column(scale=1):
+            image_input = gr.Image(
+                type="pil",
+                label="📤 Upload Ảnh",
+                elem_classes=["upload-area"]
+            )
+            with gr.Row():
+                analyze_btn = gr.Button("🔍 Phân Tích", variant="primary", scale=2)
+                clear_btn = gr.Button("🗑️ Xóa", variant="secondary", scale=1)
+        with gr.Column(scale=1):
+            result_output = gr.Textbox(
+                label="📋 Kết Quả Phân Tích",
+                lines=15,
+                max_lines=20,
+                show_copy_button=True
+            )
+    # Event handlers
     analyze_btn.click(
         fn=analyze_image,
         inputs=image_input,
         outputs=result_output
     )
+    clear_btn.click(
+        fn=lambda: (None, ""),
+        outputs=[image_input, result_output]
+    )
+    # Auto-analyze on image upload
+    image_input.change(
+        fn=analyze_image,
+        inputs=image_input,
+        outputs=result_output
+    )
     gr.Markdown("""
     ---
+    ## 💡 Hướng dẫn sử dụng:
+    ### 🖼️ Phân tích ảnh đơn lẻ:
+    1. **Upload ảnh** từ máy tính hoặc drag & drop
+    2. **Kết quả tự động** hiển thị sau khi upload
+    3. **Hoặc nhấn "Phân Tích"** để chạy lại
+    ### 🎥 Phân tích video real-time:
+    1. **Copy URL Space này:** `https://nguyentantoan-vintern-video-recognition.hf.space`
+    2. **Mở trangchu.html** đã được cung cấp
+    3. **Thay URL** trong code JavaScript
+    4. **Sử dụng camera** để phân tích real-time
+    ### 🔗 API Usage:
+    ```javascript
+    // POST to: https://nguyentantoan-vintern-video-recognition.hf.space/api/predict
+    // Body: FormData with image file
+    ```
+    **⚠️ Lưu ý:** Việc phân tích có thể mất 10-30 giây do chạy trên CPU miễn phí của HF Spaces.
     """)
+# Launch the app
 if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        quiet=False
+    )