Yanlin Zhang committed on
Commit ccd869c · 1 Parent(s): a5e9f68

use sam3 pipeline

Files changed (1):
  1. app.py (+113, -68)
app.py CHANGED
@@ -20,7 +20,7 @@ import gradio as gr
 import numpy as np
 from PIL import Image
 import torch
-from transformers import AutoImageProcessor, AutoModel
+from transformers import pipeline

 # -----------------------------------------------------------------------------
 # Configuration
@@ -39,15 +39,18 @@ CLASS_COLORS: Dict[str, Tuple[int, int, int]] = {
 }

 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

 # -----------------------------------------------------------------------------
 # Model + processor
 # -----------------------------------------------------------------------------

-processor = AutoImageProcessor.from_pretrained(MODEL_ID)
-model = AutoModel.from_pretrained(MODEL_ID, torch_dtype=DTYPE).to(DEVICE)
-model.eval()
+# Use the mask-generation pipeline, as shown in the Hugging Face guidance,
+# then extract the model and processor for text-prompt support.
+mask_pipe = pipeline("mask-generation", model=MODEL_ID, device=0 if DEVICE == "cuda" else -1)
+
+# Prefer the image processor; `feature_extractor` exists on pipelines but may be None.
+model = mask_pipe.model
+processor = mask_pipe.image_processor if mask_pipe.image_processor is not None else mask_pipe.feature_extractor


 # -----------------------------------------------------------------------------
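Note on the extraction above: transformers pipelines expose their loaded components as attributes (`model`, `image_processor`, and the legacy `feature_extractor`), which is what makes this reuse possible. A minimal inspection sketch, assuming only that `MODEL_ID` loads under the mask-generation task:

    from transformers import pipeline

    pipe = pipeline("mask-generation", model=MODEL_ID)  # MODEL_ID as defined in app.py
    print(type(pipe.model).__name__)            # the underlying segmentation model
    print(type(pipe.image_processor).__name__)  # the image (pre)processor
    # Whether the processor accepts a `text` kwarg is model-specific; plain SAM
    # processors do not, which is why _extract_detections below wraps the direct
    # call in a try/except and falls back to the pipeline.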
@@ -63,77 +66,119 @@ class Track:
     score: float | None


-def _post_process(outputs, height: int, width: int):
-    target_sizes = [(height, width)]
-
-    if hasattr(processor, "post_process_instance_segmentation"):
-        return processor.post_process_instance_segmentation(
-            outputs=outputs,
-            target_sizes=target_sizes,
-            threshold=0.35,
-            mask_threshold=0.4,
-            overlap_mask_area_threshold=0.5,
-        )[0]
-
-    if hasattr(processor, "post_process_semantic_segmentation"):
-        segmentation = processor.post_process_semantic_segmentation(
-            outputs=outputs,
-            target_sizes=target_sizes,
-        )[0]
-        return {
-            "masks": segmentation.unsqueeze(0),
-            "scores": torch.ones(1),
-            "labels": torch.zeros(1, dtype=torch.int64),
-        }
-
-    raise gr.Error(
-        "This version of transformers does not expose SAM3 post-processing helpers. "
-        "Please ensure transformers>=4.46.0 is installed."
-    )
-
-
 def _extract_detections(frame_rgb: np.ndarray) -> List[Dict]:
     pil_image = Image.fromarray(frame_rgb)
     detections: List[Dict] = []

     for label in TEXT_PROMPTS:
-        inputs = processor(images=pil_image, text=label, return_tensors="pt")
-        inputs = {
-            k: (v.to(DEVICE) if isinstance(v, torch.Tensor) else v)
-            for k, v in inputs.items()
-        }
-
-        with torch.inference_mode():
-            outputs = model(**inputs)
-
-        processed = _post_process(outputs, pil_image.height, pil_image.width)
-        masks = processed.get("masks", [])
-        scores = processed.get("scores", [None] * len(masks))
-
-        for mask_tensor, score in zip(masks, scores):
-            mask_np = mask_tensor.squeeze().detach().cpu().numpy()
-            if mask_np.ndim == 3:
-                mask_np = mask_np[0]
-
-            binary_mask = mask_np > 0.5
-            area = int(binary_mask.sum())
-            if area < MIN_MASK_PIXELS:
-                continue
-
-            ys, xs = np.nonzero(binary_mask)
-            if len(xs) == 0:
-                continue
-
-            centroid = (float(xs.mean()), float(ys.mean()))
-            detections.append(
-                {
-                    "label": label,
-                    "mask": binary_mask,
-                    "score": float(score) if score is not None else None,
-                    "centroid": centroid,
-                    "area": area,
-                }
-            )
+        # Use the processor and model directly with a text prompt.
+        try:
+            inputs = processor(images=pil_image, text=label, return_tensors="pt")
+            inputs = {
+                k: (v.to(DEVICE) if isinstance(v, torch.Tensor) else v)
+                for k, v in inputs.items()
+            }
+
+            with torch.inference_mode():
+                outputs = model(**inputs)
+
+            # Extract masks from the outputs; the SAM3 output structure may vary.
+            if hasattr(outputs, "pred_masks"):
+                masks = outputs.pred_masks
+            elif hasattr(outputs, "masks"):
+                masks = outputs.masks
+            elif isinstance(outputs, dict):
+                masks = outputs.get("pred_masks", outputs.get("masks"))
+            else:
+                masks = outputs
+
+            if masks is None:
+                continue
+
+            # Handle different mask formats.
+            if isinstance(masks, torch.Tensor):
+                if masks.ndim == 4:  # [batch, num_masks, H, W]
+                    masks = masks[0]  # Remove the batch dimension.
+                elif masks.ndim == 3:  # [num_masks, H, W]
+                    pass
+                else:
+                    continue
+
+            for mask_tensor in masks:
+                mask_np = mask_tensor.squeeze().detach().cpu().numpy()
+                if mask_np.ndim == 3:
+                    mask_np = mask_np[0]
+
+                binary_mask = mask_np > 0.5
+                area = int(binary_mask.sum())
+                if area < MIN_MASK_PIXELS:
+                    continue
+
+                ys, xs = np.nonzero(binary_mask)
+                if len(xs) == 0:
+                    continue
+
+                centroid = (float(xs.mean()), float(ys.mean()))
+                detections.append(
+                    {
+                        "label": label,
+                        "mask": binary_mask,
+                        "score": None,
+                        "centroid": centroid,
+                        "area": area,
+                    }
+                )
+        except Exception:
+            # Fall back to the pipeline if direct access fails.
+            try:
+                results = mask_pipe(pil_image)
+                if not isinstance(results, list):
+                    results = [results]
+
+                for result in results:
+                    if isinstance(result, dict):
+                        mask = result.get("mask")
+                        score = result.get("score")
+                    else:
+                        mask = result
+                        score = None
+
+                    if isinstance(mask, Image.Image):
+                        mask_np = np.array(mask.convert("L"))
+                    elif isinstance(mask, torch.Tensor):
+                        mask_np = mask.squeeze().detach().cpu().numpy()
+                    elif isinstance(mask, np.ndarray):
+                        mask_np = mask
+                    else:
+                        continue
+
+                    if mask_np.ndim == 3:
+                        mask_np = mask_np[:, :, 0] if mask_np.shape[2] == 1 else mask_np.max(axis=2)
+
+                    if mask_np.max() > 1.0:
+                        mask_np = mask_np / 255.0
+
+                    binary_mask = mask_np > 0.5
+                    area = int(binary_mask.sum())
+                    if area < MIN_MASK_PIXELS:
+                        continue
+
+                    ys, xs = np.nonzero(binary_mask)
+                    if len(xs) == 0:
+                        continue
+
+                    centroid = (float(xs.mean()), float(ys.mean()))
+                    detections.append(
+                        {
+                            "label": label,
+                            "mask": binary_mask,
+                            "score": float(score) if score is not None else None,
+                            "centroid": centroid,
+                            "area": area,
+                        }
+                    )
+            except Exception as e2:
+                raise gr.Error(f"Both direct model access and pipeline failed: {e2}")

     return detections
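One caveat on the fallback branch: for SAM-family checkpoints, the mask-generation pipeline typically returns a single dict of parallel lists rather than one dict per mask, in which case the per-result `mask`/`score` lookups above would silently skip everything. A hedged unpacking sketch, assuming the documented `{"masks": [...], "scores": [...]}` shape:

    results = mask_pipe(pil_image)
    if isinstance(results, dict) and "masks" in results:
        masks = results["masks"]
        scores = results.get("scores", [None] * len(masks))
        pairs = list(zip(masks, scores))
    else:
        # Defensive handling for a list-of-dicts shape.
        pairs = [(r.get("mask"), r.get("score")) for r in results]
    # `pairs` then feeds the same thresholding and centroid logic as above.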
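For a quick smoke test of the new code path, the detection helper can be exercised on a single frame; a minimal sketch, with an illustrative image path:

    import numpy as np
    from PIL import Image

    frame_rgb = np.array(Image.open("sample_frame.jpg").convert("RGB"))  # hypothetical file
    for det in _extract_detections(frame_rgb):
        print(det["label"], det["score"], det["area"], det["centroid"])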