# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md
import os
import json
from typing import Any

import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
import cv2
import matplotlib.pyplot as plt
from cog import BasePredictor, Input, Path, BaseModel
from subprocess import call

# Install the vendored GroundingDINO and Segment Anything packages at import
# time so their modules can be imported below.
HOME = os.getcwd()
os.chdir("GroundingDINO")
call("pip install -q .", shell=True)
os.chdir(HOME)
os.chdir("segment_anything")
call("pip install -q .", shell=True)
os.chdir(HOME)

# Grounding DINO
import GroundingDINO.groundingdino.datasets.transforms as T
from GroundingDINO.groundingdino.models import build_model
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.util.utils import (
    clean_state_dict,
    get_phrases_from_posmap,
)

# Segment Anything (and its HQ variant)
from segment_anything import build_sam, build_sam_hq, SamPredictor

# Recognize Anything Model (RAM), used for image tagging
from ram.models import ram

class ModelOutput(BaseModel):
    tags: str
    bounding_box_img: Path
    masked_img: Path
    json_data: Any

class Predictor(BasePredictor):
    def setup(self):
        """Load the models into memory to make running multiple predictions efficient"""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # RAM preprocessing: resize to a fixed square and normalize with
        # ImageNet statistics.
        normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        )
        self.image_size = 384
        self.transform = transforms.Compose(
            [
                transforms.Resize((self.image_size, self.image_size)),
                transforms.ToTensor(),
                normalize,
            ]
        )

        # RAM tagging model
        self.ram_model = ram(
            pretrained="pretrained/ram_swin_large_14m.pth",
            image_size=self.image_size,
            vit="swin_l",
        )
        self.ram_model.eval()
        self.ram_model = self.ram_model.to(self.device)

        # Grounding DINO detector
        self.model = load_model(
            "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
            "pretrained/groundingdino_swint_ogc.pth",
            device=self.device,
        )

        # SAM and HQ-SAM mask predictors
        self.sam = SamPredictor(
            build_sam(checkpoint="pretrained/sam_vit_h_4b8939.pth").to(self.device)
        )
        self.sam_hq = SamPredictor(
            build_sam_hq(checkpoint="pretrained/sam_hq_vit_h.pth").to(self.device)
        )

    def predict(
        self,
        input_image: Path = Input(description="Input image"),
        use_sam_hq: bool = Input(
            description="Use HQ-SAM instead of SAM for mask prediction", default=False
        ),
    ) -> ModelOutput:
        """Run a single prediction on the model"""
        # default thresholds
        box_threshold = 0.25
        text_threshold = 0.2
        iou_threshold = 0.5

        # Tag the image with RAM
        image_pil, image = load_image(str(input_image))
        raw_image = image_pil.resize((self.image_size, self.image_size))
        raw_image = self.transform(raw_image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            tags, tags_chinese = self.ram_model.generate_tag(raw_image)
        tags = tags[0].replace(" |", ",")

        # Use the tags as the text prompt for Grounding DINO
        boxes_filt, scores, pred_phrases = get_grounding_output(
            self.model, image, tags, box_threshold, text_threshold, device=self.device
        )

        # Run SAM (or HQ-SAM) with the detected boxes as prompts
        predictor = self.sam_hq if use_sam_hq else self.sam
        image = cv2.imread(str(input_image))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        predictor.set_image(image)

        # Convert Grounding DINO boxes from normalized (cx, cy, w, h)
        # to absolute (x0, y0, x1, y1) pixel coordinates.
        size = image_pil.size
        H, W = size[1], size[0]
        for i in range(boxes_filt.size(0)):
            boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
            boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
            boxes_filt[i][2:] += boxes_filt[i][:2]
        boxes_filt = boxes_filt.cpu()

        # Use NMS to drop overlapping boxes
        print(f"Before NMS: {boxes_filt.shape[0]} boxes")
        nms_idx = (
            torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
        )
        boxes_filt = boxes_filt[nms_idx]
        pred_phrases = [pred_phrases[idx] for idx in nms_idx]
        print(f"After NMS: {boxes_filt.shape[0]} boxes")

        transformed_boxes = predictor.transform.apply_boxes_torch(
            boxes_filt, image.shape[:2]
        ).to(self.device)
        masks, _, _ = predictor.predict_torch(
            point_coords=None,
            point_labels=None,
            boxes=transformed_boxes,
            multimask_output=False,
        )

        # Draw the boxes and masks over the input image
        plt.figure(figsize=(10, 10))
        plt.imshow(image)
        for mask in masks:
            show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
        for box, label in zip(boxes_filt, pred_phrases):
            show_box(box.numpy(), plt.gca(), label)
        bounding_box_path = "/tmp/automatic_label_output.png"
        plt.axis("off")
        plt.savefig(
            bounding_box_path, bbox_inches="tight", dpi=300, pad_inches=0.0
        )
        plt.close()

        # Save an indexed mask image: 0 is background, and each detected
        # instance gets the next integer value.
        value = 0  # 0 for background
        mask_img = torch.zeros(masks.shape[-2:])
        for idx, mask in enumerate(masks):
            mask_img[mask.cpu().numpy()[0]] = value + idx + 1
        plt.figure(figsize=(10, 10))
        plt.imshow(mask_img.numpy())
        plt.axis("off")
        masks_path = "/tmp/mask.png"
        plt.savefig(masks_path, bbox_inches="tight", dpi=300, pad_inches=0.0)
        plt.close()

        # Collect per-instance labels, confidences, and boxes
        json_data = {
            "tags": tags,
            "mask": [{"value": value, "label": "background"}],
        }
        for label, box in zip(pred_phrases, boxes_filt):
            value += 1
            name, logit = label.split("(")
            logit = logit[:-1]  # strip the trailing ')'
            json_data["mask"].append(
                {
                    "value": value,
                    "label": name,
                    "logit": float(logit),
                    "box": box.numpy().tolist(),
                }
            )
        json_path = "/tmp/label.json"
        with open(json_path, "w") as f:
            json.dump(json_data, f)
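        # The saved JSON has roughly the following shape (values here are
        # illustrative, not real output):
        # {
        #   "tags": "dog, grass, ball",
        #   "mask": [
        #     {"value": 0, "label": "background"},
        #     {"value": 1, "label": "dog", "logit": 0.82, "box": [x0, y0, x1, y1]},
        #     ...
        #   ]
        # }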

        return ModelOutput(
            tags=tags,
            masked_img=Path(masks_path),
            bounding_box_img=Path(bounding_box_path),
            json_data=Path(json_path),
        )


def get_grounding_output(
    model, image, caption, box_threshold, text_threshold, device="cpu"
):
    # Grounding DINO expects a lowercase caption terminated by a period.
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."
    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)

    # Keep only queries whose best token score exceeds box_threshold
    logits_filt = logits.clone()
    boxes_filt = boxes.clone()
    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
    logits_filt = logits_filt[filt_mask]  # (num_filt, 256)
    boxes_filt = boxes_filt[filt_mask]  # (num_filt, 4)

    # Map token-level scores back to phrases from the caption
    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)
    pred_phrases = []
    scores = []
    for logit, box in zip(logits_filt, boxes_filt):
        pred_phrase = get_phrases_from_posmap(
            logit > text_threshold, tokenized, tokenizer
        )
        # e.g. "dog(0.82)"
        pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
        scores.append(logit.max().item())
    return boxes_filt, torch.Tensor(scores), pred_phrases


def load_image(image_path):
    image_pil = Image.open(image_path).convert("RGB")

    # Grounding DINO preprocessing: resize the short side to 800 px
    # (long side capped at 1333) and normalize with ImageNet statistics.
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image, _ = transform(image_pil, None)  # (3, h, w)
    return image_pil, image


def load_model(model_config_path, model_checkpoint_path, device):
    # Build Grounding DINO from its config and load the checkpoint weights.
    args = SLConfig.fromfile(model_config_path)
    args.device = device
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(
        clean_state_dict(checkpoint["model"]), strict=False
    )
    print(load_res)
    model.eval()
    return model


def show_mask(mask, ax, random_color=False):
    # Overlay a single binary mask as a translucent RGBA image.
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


def show_box(box, ax, label):
    # Draw an (x0, y0, x1, y1) box with its label.
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(
        plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=1.5)
    )
    ax.text(x0, y0, label)
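

# A minimal local smoke test (a sketch, not part of the Cog interface): it
# assumes the pretrained checkpoints referenced in setup() are on disk and
# that an "example.jpg" exists next to this file. In production the class is
# driven by `cog predict` instead of being run directly.
if __name__ == "__main__":
    predictor = Predictor()
    predictor.setup()
    output = predictor.predict(
        input_image=Path("example.jpg"),  # hypothetical test image
        use_sam_hq=False,
    )
    print(output.tags)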