Spaces:

stkrk
/

dino_sam_objects

Sleeping

Stanislav

feat: IMPORTANT changes to write DINO MODEL

08a4a7f 4 months ago

2.89 kB

	import os

	# Directory used as the Hugging Face cache; created at import time if missing.
	HF_CACHE = "/tmp/hf_cache"
	os.makedirs(HF_CACHE, exist_ok=True)
	# This must be exported BEFORE `transformers` is imported: the library
	# resolves its cache location while it is being imported, so assigning the
	# variable afterwards does not redirect the cache.
	# NOTE(review): newer transformers releases deprecate TRANSFORMERS_CACHE in
	# favour of HF_HOME -- confirm against the version pinned for this project.
	os.environ["TRANSFORMERS_CACHE"] = HF_CACHE

	# These imports intentionally follow the environment setup above.
	import cv2  # noqa: E402
	from PIL import Image  # noqa: E402
	from transformers import GroundingDinoProcessor, GroundingDinoForObjectDetection  # noqa: E402



	class DinoWrapper:
	    """
	    Wrapper for Grounding DINO model for text-prompt-based object detection.

	    Loads the model and its processor from a local directory and exposes
	    raw box prediction (``predict_boxes``) plus an area-filtered variant
	    (``detect``).
	    """

	    def __init__(self, model_dir, device=None):
	        """
	        Initialize the Grounding DINO model.

	        :param model_dir: Local directory holding the model and processor
	            files; both are loaded with ``local_files_only=True``
	        :param device: Device to run on, e.g. 'cuda' or 'cpu'; if None,
	            'cpu' is used
	        """
	        # Honour the caller's choice instead of unconditionally forcing CPU.
	        self.device = self._resolve_device(device)

	        self.model = GroundingDinoForObjectDetection.from_pretrained(
	            pretrained_model_name_or_path=model_dir,
	            local_files_only=True,
	            use_safetensors=True,
	        ).to(self.device)

	        self.processor = GroundingDinoProcessor.from_pretrained(
	            pretrained_model_name_or_path=model_dir,
	            local_files_only=True,
	        )

	    @staticmethod
	    def _resolve_device(device):
	        """Return the device to use: the caller's choice, or 'cpu' if None."""
	        return "cpu" if device is None else device

	    def predict_boxes(self, image, prompt, box_threshold=0.15, text_threshold=0.18):
	        """
	        Predict bounding boxes based on the prompt.

	        :param image: Input image (NumPy array, BGR)
	        :param prompt: Textual description of target object(s)
	        :param box_threshold: Confidence threshold forwarded to the
	            processor's post-processing step for boxes
	        :param text_threshold: Confidence threshold forwarded to the
	            processor's post-processing step for text
	        :return: List of boxes [x1, y1, x2, y2] in absolute pixel coords
	        """
	        # Function-scope import so this block does not depend on a
	        # module-level `import torch` being present.
	        import torch

	        print(f"[DEBUG] Prompt to model: {prompt}")

	        # The processor is fed a PIL image in RGB order; the input is BGR.
	        image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

	        inputs = self.processor(
	            images=image_pil, text=prompt, return_tensors="pt"
	        ).to(self.device)
	        print(f"[DEBUG] input_ids: {inputs['input_ids']}")

	        # Inference only: skip autograd bookkeeping to save memory and time.
	        with torch.no_grad():
	            outputs = self.model(**inputs)
	        print(f"[DEBUG] Model output keys: {outputs.keys()}")

	        # Arguments are passed positionally on purpose.
	        # NOTE(review): the keyword name of the box threshold appears to
	        # differ between transformers releases -- confirm before switching
	        # this call to keyword arguments.
	        results = self.processor.post_process_grounded_object_detection(
	            outputs,
	            inputs["input_ids"],
	            box_threshold,
	            text_threshold,
	            [image_pil.size[::-1]],  # PIL size is (width, height); reversed here
	        )[0]

	        print(f"[DEBUG] text_labels: {results.get('text_labels', 'NO LABELS')}")

	        print(f"[DEBUG] Raw results keys: {results.keys()}")
	        print(f"[DEBUG] boxes: {results.get('boxes', 'NO BOXES FOUND')}")
	        print(f"[DEBUG] scores: {results.get('scores', 'NO SCORES FOUND')}")

	        print(f"[DINO] Found {len(results['boxes'])} box(es) for prompt: '{prompt}'")

	        # Convert the tensor of boxes into plain nested Python lists.
	        return results["boxes"].detach().cpu().numpy().tolist()

	    def detect(self, image, prompt, box_threshold=0.25, text_threshold=0.15, min_box_area=500):
	        """
	        Predict boxes for the prompt and drop those that are too small.

	        :param image: Input image, forwarded to ``predict_boxes``
	        :param prompt: Textual description of target object(s)
	        :param box_threshold: Forwarded to ``predict_boxes``
	        :param text_threshold: Forwarded to ``predict_boxes``
	        :param min_box_area: Minimum (x2 - x1) * (y2 - y1) a box must have
	            to be kept
	        :return: List of boxes [x1, y1, x2, y2] whose area is at least
	            ``min_box_area``
	        """
	        boxes = self.predict_boxes(image, prompt, box_threshold, text_threshold)
	        return [
	            box
	            for box in boxes
	            if (box[2] - box[0]) * (box[3] - box[1]) >= min_box_area
	        ]