from PIL import Image
from transformers import GroundingDinoProcessor, GroundingDinoForObjectDetection
import cv2
import os
import torch

HF_CACHE = "/tmp/hf_cache"
os.makedirs(HF_CACHE, exist_ok=True)
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE


class DinoWrapper:
    """
    Wrapper around Grounding DINO for text-prompt-based object detection.
    """

    def __init__(self, model_dir, device=None):
        """
        Initialize the Grounding DINO model from a local directory.

        :param model_dir: Path to a local HuggingFace model directory
        :param device: 'cuda' or 'cpu'; if None, auto-detects
        """
        # Auto-detect the device when none is given, as promised above.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.model = GroundingDinoForObjectDetection.from_pretrained(
            pretrained_model_name_or_path=model_dir,
            local_files_only=True,
            use_safetensors=True,
        ).to(self.device)
        self.processor = GroundingDinoProcessor.from_pretrained(
            pretrained_model_name_or_path=model_dir,
            local_files_only=True,
        )

    def predict_boxes(self, image, prompt, box_threshold=0.15, text_threshold=0.18):
        """
        Predict bounding boxes for the given prompt.

        :param image: Input image (NumPy array, BGR)
        :param prompt: Textual description of target object(s)
        :param box_threshold: Confidence threshold for box predictions
        :param text_threshold: Confidence threshold for matching text labels
        :return: List of boxes [x1, y1, x2, y2] in absolute pixel coords
        """
        print(f"[DEBUG] Prompt to model: {prompt}")
        # OpenCV delivers BGR; PIL and the processor expect RGB.
        image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        inputs = self.processor(images=image_pil, text=prompt, return_tensors="pt").to(self.device)
        print(f"[DEBUG] input_ids: {inputs['input_ids']}")
        # Inference only; disable autograd to save memory.
        with torch.no_grad():
            outputs = self.model(**inputs)
        print(f"[DEBUG] Model output keys: {outputs.keys()}")
        # target_sizes expects (height, width); PIL's .size is (width, height).
        results = self.processor.post_process_grounded_object_detection(
            outputs,
            inputs["input_ids"],
            box_threshold,
            text_threshold,
            [image_pil.size[::-1]],
        )[0]
        print(f"[DEBUG] text_labels: {results.get('text_labels', 'NO LABELS')}")
        print(f"[DEBUG] Raw results keys: {results.keys()}")
        print(f"[DEBUG] boxes: {results.get('boxes', 'NO BOXES FOUND')}")
        print(f"[DEBUG] scores: {results.get('scores', 'NO SCORES FOUND')}")
        print(f"[DINO] Found {len(results['boxes'])} box(es) for prompt: '{prompt}'")
        boxes = results["boxes"].detach().cpu().numpy().tolist()
        return boxes

    def detect(self, image, prompt, box_threshold=0.25, text_threshold=0.15, min_box_area=500):
        """
        Predict boxes and discard detections smaller than min_box_area.

        :param min_box_area: Minimum box area (width * height) in pixels
        :return: Filtered list of boxes [x1, y1, x2, y2]
        """
        boxes = self.predict_boxes(image, prompt, box_threshold, text_threshold)
        filtered = [
            box for box in boxes
            if (box[2] - box[0]) * (box[3] - box[1]) >= min_box_area
        ]
        return filtered
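

# --- Minimal usage sketch. Assumptions: the model directory and image path
# below are hypothetical placeholders, not paths from this repo. Grounding
# DINO prompts are lowercase phrases, each terminated with a period
# (e.g. "a cat. a remote control.").
if __name__ == "__main__":
    detector = DinoWrapper(model_dir="/path/to/grounding-dino-tiny")  # hypothetical path
    frame = cv2.imread("/path/to/image.jpg")  # hypothetical path; BGR NumPy array
    boxes = detector.detect(frame, prompt="a cat.")
    # Draw the surviving boxes and save an annotated copy for inspection.
    for x1, y1, x2, y2 in boxes:
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
    cv2.imwrite("/tmp/annotated.jpg", frame)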