Update models/loaders/sam2_loader.py
models/loaders/sam2_loader.py  +351 -136  CHANGED
@@ -1,221 +1,436 @@
 #!/usr/bin/env python3
 """
-SAM2
-
 """

 import os
 import time
 import logging
 import traceback
-from
-from typing import Optional, Dict, Any

-import torch
 import numpy as np

 logger = logging.getLogger(__name__)


 class SAM2Loader:
     """Dedicated loader for SAM2 models"""
-
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/sam2_cache"):
-        self.device = device
         self.cache_dir = cache_dir
         os.makedirs(self.cache_dir, exist_ok=True)
-
         # Configure HF hub for spaces
-        os.environ
-        os.environ
-
-        self.model = None
         self.model_id = None
         self.load_time = 0.0
-
     def load(self, model_size: str = "auto") -> Optional[Any]:
         """
         Load SAM2 model with specified size
         Args:
             model_size: "tiny", "small", "base", "large", or "auto"
         Returns:
-
         """
         if model_size == "auto":
             model_size = self._determine_optimal_size()
-
         model_map = {
-            "tiny":
             "small": "facebook/sam2.1-hiera-small",
-            "base":
             "large": "facebook/sam2.1-hiera-large",
         }
-
         self.model_id = model_map.get(model_size, model_map["tiny"])
-        logger.info(f"Loading SAM2 model: {self.model_id}")
-
-        # Try
-        strategies = [
-            ("official", self._load_official),
-            ("transformers", self._load_transformers),
-            ("fallback", self._load_fallback)
-        ]
-
-        for strategy_name, strategy_func in strategies:
             try:
-
-
-
-
-
-
-
-
             except Exception as e:
-                logger.error(f"SAM2 {
                 logger.debug(traceback.format_exc())
-
         logger.error("All SAM2 loading strategies failed")
         return None
-
     def _determine_optimal_size(self) -> str:
         """Determine optimal model size based on available memory"""
         try:
             if torch.cuda.is_available():
                 props = torch.cuda.get_device_properties(0)
                 vram_gb = props.total_memory / (1024**3)
-
-                if vram_gb <
-
-
-
-                elif vram_gb < 12:
-                    return "base"
-                else:
-                    return "large"
-        except:
             pass
-        return "tiny"
-
     def _load_official(self) -> Optional[Any]:
-        """Load using official SAM2 API
         from sam2.sam2_image_predictor import SAM2ImagePredictor
-
         predictor = SAM2ImagePredictor.from_pretrained(
             self.model_id,
             cache_dir=self.cache_dir,
             local_files_only=False,
             trust_remote_code=True,
         )
-
-        # Move to device
         if hasattr(predictor, "model"):
             predictor.model = predictor.model.to(self.device)
             predictor.model.eval()
-
-        # Set device attribute if it exists
         if hasattr(predictor, "device"):
             predictor.device = self.device
-
-        # Return the predictor directly - no wrapper!
-        # The calling code expects the standard SAM2 interface
         return predictor
-
-    def _load_transformers(self) -> Optional[Any]:
-        """Load using transformers library"""
-        from transformers import AutoModel, AutoProcessor
-
-        dtype = torch.float16 if "cuda" in self.device else torch.float32
-
-        model = AutoModel.from_pretrained(
-            self.model_id,
-            trust_remote_code=True,
-            torch_dtype=dtype,
-            cache_dir=self.cache_dir
-        )
-        model = model.to(self.device)
-        model.eval()
-
-        try:
-            processor = AutoProcessor.from_pretrained(
-                self.model_id,
-                cache_dir=self.cache_dir
-            )
-        except:
-            processor = None
-
-        # Wrap to match expected API
-        class SAM2TransformersWrapper:
-            def __init__(self, model, processor, device):
-                self.model = model
-                self.processor = processor
-                self.device = device
-                self.current_image = None
-
-            def set_image(self, image):
-                """Store image for processing"""
-                self.current_image = image
-                # TODO: Actually encode image with model here
-
-            def predict(self, point_coords=None, point_labels=None, box=None, **kwargs):
-                """Generate masks from prompts"""
-                # TODO: Implement actual prediction
-                if self.current_image is not None:
-                    h, w = self.current_image.shape[:2]
-                else:
-                    h, w = 512, 512
-
-                # For now, return dummy mask
-                return {
-                    "masks": np.ones((1, h, w), dtype=np.float32),
-                    "scores": np.array([0.9]),
-                    "logits": np.ones((1, h, w), dtype=np.float32),
-                }
-
-        return SAM2TransformersWrapper(model, processor, self.device)
-
     def _load_fallback(self) -> Optional[Any]:
-        """Create fallback predictor
-
         class FallbackSAM2:
             def __init__(self, device):
                 self.device = device
-                self.
-
             def set_image(self, image):
-                self.
-
-
-
-                if self.current_image is not None:
-                    h, w = self.current_image.shape[:2]
                 else:
                     h, w = 512, 512
-
                 return {
                     "masks": np.ones((1, h, w), dtype=np.float32),
-                    "scores": np.array([0.5]),
-                    "logits": np.ones((1, h, w), dtype=np.float32),
                 }
-
         logger.warning("Using fallback SAM2 (no real segmentation)")
         return FallbackSAM2(self.device)
-
     def cleanup(self):
         """Clean up resources"""
-
-
         self.model = None
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-
     def get_info(self) -> Dict[str, Any]:
         """Get loader information"""
         return {
-            "loaded": self.
             "model_id": self.model_id,
             "device": self.device,
             "load_time": self.load_time,
-            "model_type": type(self.model).__name__ if self.model else None
-        }
 #!/usr/bin/env python3
 """
+SAM2 Loader + Guarded Predictor Adapter (VRAM-friendly, shape-safe)
+
+- Loads a SAM2 image predictor on the desired device.
+- set_image(): accepts RGB/BGR, uint8/float; optional model-only downscale to save VRAM.
+- predict(): forwards prompts, upsamples masks back to original size, normalizes outputs.
+- Uses torch.inference_mode + optional autocast on CUDA.
+- Returns shapes compatible with utils.cv_processing.segment_person_hq logic.
 """

+from __future__ import annotations
+
 import os
 import time
 import logging
 import traceback
+from typing import Optional, Dict, Any, Tuple, List

 import numpy as np
+import torch
+import cv2

 logger = logging.getLogger(__name__)


+# -------------------------- helpers --------------------------
+
+def _select_device(pref: str) -> str:
+    pref = (pref or "").lower()
+    if pref.startswith("cuda"):
+        return "cuda" if torch.cuda.is_available() else "cpu"
+    if pref == "cpu":
+        return "cpu"
+    return "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def _ensure_rgb_uint8(img: np.ndarray, force_bgr_to_rgb: bool = False) -> np.ndarray:
+    """
+    Accept BGR/RGB, 3ch/4ch, uint8/float; return RGB uint8 [H,W,3].
+    We DO NOT blindly swap channels; cv_processing already feeds RGB.
+    Set force_bgr_to_rgb=True only if you know inputs are BGR.
+    """
+    if img is None:
+        raise ValueError("set_image received None image")
+
+    arr = np.asarray(img)
+    if arr.ndim != 3 or arr.shape[2] < 3:
+        raise ValueError(f"Expected HxWxC image with C>=3, got shape={arr.shape}")
+
+    # If float, clamp + scale to uint8
+    if np.issubdtype(arr.dtype, np.floating):
+        arr = np.clip(arr, 0.0, 1.0)
+        arr = (arr * 255.0 + 0.5).astype(np.uint8)
+    elif arr.dtype != np.uint8:
+        if arr.dtype == np.uint16:
+            arr = (arr / 257).astype(np.uint8)
+        else:
+            arr = arr.astype(np.uint8)
+
+    # If 4-channel, drop alpha
+    if arr.shape[2] == 4:
+        arr = arr[:, :, :3]
+
+    if force_bgr_to_rgb:
+        arr = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
+
+    return arr
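A quick sketch of the two conversions `_ensure_rgb_uint8` handles most often, float frames in [0, 1] and RGBA frames; the arrays below are hypothetical, purely for illustration:

import numpy as np

f = np.random.rand(480, 640, 3).astype(np.float32)   # float RGB in [0, 1]
u = _ensure_rgb_uint8(f)                             # scaled to 0..255
assert u.dtype == np.uint8 and u.shape == (480, 640, 3)

rgba = np.zeros((480, 640, 4), dtype=np.uint8)       # alpha channel present
assert _ensure_rgb_uint8(rgba).shape[2] == 3         # alpha dropped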
+
+
+def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
+    if h <= 0 or w <= 0:
+        return h, w, 1.0
+    s1 = min(1.0, float(max_edge) / float(max(h, w))) if max_edge > 0 else 1.0
+    s2 = min(1.0, (float(target_pixels) / float(h * w)) ** 0.5) if target_pixels > 0 else 1.0
+    s = min(s1, s2)
+    nh = max(1, int(round(h * s)))
+    nw = max(1, int(round(w * s)))
+    return nh, nw, s
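Worked example under the defaults used further down (SAM2_MAX_EDGE=1024, SAM2_TARGET_PIXELS=900000); the 4K frame size is illustrative:

nh, nw, s = _compute_scaled_size(2160, 3840, max_edge=1024, target_pixels=900_000)
# s1 = 1024/3840 ≈ 0.267, s2 = sqrt(900000/8294400) ≈ 0.329, so s ≈ 0.267
# nh = 576, nw = 1024: both the edge cap and the pixel budget are respected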
+
+
+def _ladder(nh: int, nw: int) -> List[Tuple[int, int]]:
+    """
+    Progressive smaller sizes for OOM fallback.
+    """
+    sizes = [(nh, nw)]
+    sizes.append((max(1, int(nh * 0.85)), max(1, int(nw * 0.85))))
+    sizes.append((max(1, int(nh * 0.70)), max(1, int(nw * 0.70))))
+    sizes.append((max(1, int(nh * 0.50)), max(1, int(nw * 0.50))))
+    sizes.append((max(1, int(nh * 0.35)), max(1, int(nw * 0.35))))
+    # de-duplicate and keep order
+    uniq = []
+    seen = set()
+    for s in sizes:
+        if s not in seen:
+            uniq.append(s)
+            seen.add(s)
+    return uniq
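For the 576x1024 working size from the example above, the retry ladder comes out as (values computed from the factors in this helper):

# _ladder(576, 1024)
# -> [(576, 1024), (489, 870), (403, 716), (288, 512), (201, 358)]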
+
+
+def _upsample_stack(masks: np.ndarray, out_hw: Tuple[int, int]) -> np.ndarray:
+    """
+    masks: (N,h,w) float → bilinear → (N,H,W) float [0..1]
+    """
+    masks = np.asarray(masks)  # coerce lists/tensors first, so .ndim below is safe
+    if masks.ndim != 3:
+        if masks.ndim == 2:
+            masks = masks[None, ...]
+        elif masks.ndim == 4 and masks.shape[1] == 1:
+            masks = masks[:, 0, :, :]
+        else:
+            # try to squeeze to N,H,W
+            masks = np.squeeze(masks)
+            if masks.ndim == 2:
+                masks = masks[None, ...]
+    n, h, w = masks.shape
+    H, W = out_hw
+    if (h, w) == (H, W):
+        return masks.astype(np.float32, copy=False)
+    out = np.zeros((n, H, W), dtype=np.float32)
+    for i in range(n):
+        out[i] = cv2.resize(masks[i].astype(np.float32), (W, H), interpolation=cv2.INTER_LINEAR)
+    return np.clip(out, 0.0, 1.0)
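Shape round-trip in one line, with illustrative sizes: masks predicted at the working resolution come back at the original frame size.

import numpy as np

m = np.random.rand(3, 576, 1024).astype(np.float32)  # e.g. 3 mask candidates
up = _upsample_stack(m, (2160, 3840))
assert up.shape == (3, 2160, 3840) and up.dtype == np.float32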
+
+
+def _normalize_masks_dtype(x: np.ndarray) -> np.ndarray:
+    x = np.asarray(x)
+    if x.dtype == np.uint8:
+        return x.astype(np.float32) / 255.0
+    return x.astype(np.float32, copy=False)
+
+
+# -------------------------- adapter --------------------------
+
+class _SAM2Adapter:
+    """
+    Wraps SAM2ImagePredictor to:
+      - store original H,W
+      - model-only downscale on set_image
+      - OOM-aware predict with retry at smaller sizes
+      - upsample masks back to original size
+    """
+
+    def __init__(self, predictor, device: str):
+        self.pred = predictor
+        self.device = device
+
+        # original image size (for upsample)
+        self.orig_hw: Tuple[int, int] = (0, 0)
+
+        # env tunables
+        self.max_edge = int(os.environ.get("SAM2_MAX_EDGE", "1024"))
+        self.target_pixels = int(os.environ.get("SAM2_TARGET_PIXELS", "900000"))
+        self.force_bgr_to_rgb = os.environ.get("SAM2_ASSUME_BGR", "0") == "1"
+
+        # precision
+        self.use_autocast = (device == "cuda")
+        # prefer bf16 if available, else fp16; it's only a hint for the internal ops
+        self.autocast_dtype = None
+        if self.use_autocast:
+            try:
+                if hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported():
+                    self.autocast_dtype = torch.bfloat16
+                else:
+                    cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
+                    self.autocast_dtype = torch.float16 if cc[0] >= 7 else None
+            except Exception:
+                self.autocast_dtype = None
+
+        # cached current working image (RGB uint8) and its size
+        self._current_rgb: Optional[np.ndarray] = None
+        self._current_hw: Tuple[int, int] = (0, 0)
+
+    # --- API mirror ---
+
+    def set_image(self, image: np.ndarray):
+        """
+        Accept RGB or BGR, uint8 or float, any resolution.
+        Model-only downscale; keep orig H,W for upsample later.
+        """
+        rgb = _ensure_rgb_uint8(image, force_bgr_to_rgb=self.force_bgr_to_rgb)
+        H, W = rgb.shape[:2]
+        self.orig_hw = (H, W)
+
+        nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
+        if s < 1.0:
+            work = cv2.resize(rgb, (nw, nh), interpolation=cv2.INTER_AREA)
+            self._current_rgb = work
+            self._current_hw = (nh, nw)
+        else:
+            self._current_rgb = rgb
+            self._current_hw = (H, W)
+
+        # prime embeddings on predictor
+        self.pred.set_image(self._current_rgb)
+
+    def predict(self, **kwargs) -> Dict[str, Any]:
+        """
+        Forwards prompts to underlying predictor; retries smaller if OOM.
+        Always returns:
+            {"masks": (N,H,W) float32 [0..1], "scores": (N,), "logits": optional}
+        where (H,W) are the ORIGINAL image size provided to set_image().
+        """
+        if self._current_rgb is None or self.orig_hw == (0, 0):
+            raise RuntimeError("SAM2Adapter.predict called before set_image()")
+
+        H, W = self.orig_hw
+        nh, nw = self._current_hw
+        sizes = _ladder(nh, nw)
+
+        last_exc: Optional[BaseException] = None
+
+        for (th, tw) in sizes:
+            try:
+                # if we need a smaller embedding, rebuild set_image()
+                if (th, tw) != (nh, nw):
+                    small = cv2.resize(self._current_rgb, (tw, th), interpolation=cv2.INTER_AREA)
+                    self.pred.set_image(small)
+
+                # inference guard
+                class _NoOp:
+                    def __enter__(self): return None
+                    def __exit__(self, *a): return False
+
+                amp_ctx = _NoOp()
+                if self.use_autocast and self.autocast_dtype is not None:
+                    amp_ctx = torch.cuda.amp.autocast(dtype=self.autocast_dtype)
+
+                with torch.inference_mode():
+                    with amp_ctx:
+                        out = self.pred.predict(**kwargs)
+
+                # normalize outputs to dict
+                masks = None
+                scores = None
+                logits = None
+
+                if isinstance(out, dict):
+                    masks = out.get("masks", None)
+                    scores = out.get("scores", None)
+                    logits = out.get("logits", None)
+                elif isinstance(out, (tuple, list)):
+                    if len(out) >= 1: masks = out[0]
+                    if len(out) >= 2: scores = out[1]
+                    if len(out) >= 3: logits = out[2]
+                else:
+                    masks = out
+
+                if masks is None:
+                    raise RuntimeError("SAM2 returned no masks")
+
+                masks = np.asarray(masks)
+                # SAM2 variants: (N,H,W) or (N,1,H,W) or (H,W)
+                if masks.ndim == 2:
+                    masks = masks[None, ...]
+                elif masks.ndim == 4 and masks.shape[1] == 1:
+                    masks = masks[:, 0, :, :]
+
+                masks = _normalize_masks_dtype(masks)
+
+                # upsample to original resolution
+                masks_up = _upsample_stack(masks, (H, W))
+
+                # standardize scores
+                if scores is None:
+                    scores = np.ones((masks_up.shape[0],), dtype=np.float32) * 0.5
+                else:
+                    scores = np.asarray(scores).astype(np.float32, copy=False).reshape(-1)
+
+                out_dict = {"masks": masks_up, "scores": scores}
+                if logits is not None:
+                    # best-effort: resize per-channel to (H,W)
+                    lg = np.asarray(logits)
+                    if lg.ndim == 3:
+                        lg = _upsample_stack(lg, (H, W))
+                    elif lg.ndim == 4 and lg.shape[1] == 1:
+                        lg = _upsample_stack(lg[:, 0, :, :], (H, W))
+                    out_dict["logits"] = lg.astype(np.float32, copy=False)
+                return out_dict
+
+            except torch.cuda.OutOfMemoryError as e:
+                last_exc = e
+                logger.warning(f"SAM2 OOM at {th}x{tw}; retrying smaller. {e}")
+                torch.cuda.empty_cache()
+                continue
+            except Exception as e:
+                last_exc = e
+                logger.debug(traceback.format_exc())
+                logger.warning(f"SAM2 predict failed at {th}x{tw}; retrying smaller. {e}")
+                torch.cuda.empty_cache()
+                continue
+
+        # All attempts failed → safe fallback (full mask)
+        logger.warning(f"SAM2 calls failed; returning fallback. {last_exc}")
+        return {
+            "masks": np.ones((1, H, W), dtype=np.float32),
+            "scores": np.array([0.5], dtype=np.float32),
+        }
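How calling code would drive the adapter, sketched with a hypothetical frame and a single positive point prompt; the kwargs follow the usual SAM-style predictor signature and are assumptions, not part of this change:

import numpy as np

adapter = SAM2Loader("cuda").load("auto")            # -> _SAM2Adapter (or None)
frame = np.zeros((1080, 1920, 3), dtype=np.uint8)    # stand-in frame
adapter.set_image(frame)                             # original H,W remembered; model sees a downscaled copy
out = adapter.predict(
    point_coords=np.array([[320, 240]]),             # (x, y) in the predictor's working coordinates
    point_labels=np.array([1]),                      # 1 = foreground
    multimask_output=True,
)
masks, scores = out["masks"], out["scores"]          # masks: (N, 1080, 1920) float32 in [0, 1]

One point worth flagging in review: kwargs are forwarded verbatim, so point and box coordinates are interpreted against the downscaled working image, not the original frame. A caller prompting in original-frame pixels would need to scale coordinates by the same factor set_image applied.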
+
+
+# -------------------------- Loader --------------------------
+
 class SAM2Loader:
     """Dedicated loader for SAM2 models"""
+
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/sam2_cache"):
+        self.device = _select_device(device)
         self.cache_dir = cache_dir
         os.makedirs(self.cache_dir, exist_ok=True)
+
         # Configure HF hub for spaces
+        os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS", "1")
+        os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0")
+
+        self.model = None    # underlying predictor (SAM2ImagePredictor)
+        self.adapter = None  # wrapped predictor exposed to callers
         self.model_id = None
         self.load_time = 0.0
+
     def load(self, model_size: str = "auto") -> Optional[Any]:
         """
         Load SAM2 model with specified size
         Args:
             model_size: "tiny", "small", "base", "large", or "auto"
         Returns:
+            Wrapped predictor (adapter) or None
         """
         if model_size == "auto":
             model_size = self._determine_optimal_size()
+
         model_map = {
+            "tiny": "facebook/sam2.1-hiera-tiny",
             "small": "facebook/sam2.1-hiera-small",
+            "base": "facebook/sam2.1-hiera-base-plus",
             "large": "facebook/sam2.1-hiera-large",
         }
+
         self.model_id = model_map.get(model_size, model_map["tiny"])
+        logger.info(f"Loading SAM2 model: {self.model_id} (device={self.device})")
+
+        # Try the official loader
+        strategies = [("official", self._load_official), ("fallback", self._load_fallback)]
+
+        for name, fn in strategies:
             try:
+                t0 = time.time()
+                pred = fn()
+                if pred is None:
+                    continue
+                self.model = pred
+                self.adapter = _SAM2Adapter(self.model, self.device)
+                self.load_time = time.time() - t0
+                logger.info(f"SAM2 loaded via {name} in {self.load_time:.2f}s")
+                return self.adapter
             except Exception as e:
+                logger.error(f"SAM2 {name} strategy failed: {e}")
                 logger.debug(traceback.format_exc())
+
         logger.error("All SAM2 loading strategies failed")
         return None
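End-to-end sketch of the call path this method sets up, assuming the sam2 package is importable; all names come from this file:

loader = SAM2Loader(device="cuda", cache_dir="./checkpoints/sam2_cache")
predictor = loader.load("auto")    # picks tiny/small/base/large from free VRAM
if predictor is None:
    raise RuntimeError("SAM2 unavailable (all strategies failed)")
print(loader.get_info())           # e.g. {"loaded": True, "model_id": "facebook/sam2.1-hiera-small", ...}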
+
     def _determine_optimal_size(self) -> str:
         """Determine optimal model size based on available memory"""
         try:
             if torch.cuda.is_available():
                 props = torch.cuda.get_device_properties(0)
                 vram_gb = props.total_memory / (1024**3)
+                if vram_gb < 4: return "tiny"
+                if vram_gb < 8: return "small"
+                if vram_gb < 12: return "base"
+                return "large"
+        except Exception:
             pass
+        return "tiny"
+
     def _load_official(self) -> Optional[Any]:
+        """Load using official SAM2 API"""
         from sam2.sam2_image_predictor import SAM2ImagePredictor
+
         predictor = SAM2ImagePredictor.from_pretrained(
             self.model_id,
             cache_dir=self.cache_dir,
             local_files_only=False,
             trust_remote_code=True,
         )
+
+        # Move internal model to device if present
         if hasattr(predictor, "model"):
             predictor.model = predictor.model.to(self.device)
             predictor.model.eval()
         if hasattr(predictor, "device"):
             predictor.device = self.device
+
         return predictor
+
     def _load_fallback(self) -> Optional[Any]:
+        """Create a tiny fallback predictor"""
+
         class FallbackSAM2:
             def __init__(self, device):
                 self.device = device
+                self._img = None

             def set_image(self, image):
+                self._img = np.asarray(image)

+            def predict(self, **kwargs):
+                if self._img is not None:
+                    h, w = self._img.shape[:2]
                 else:
                     h, w = 512, 512
                 return {
                     "masks": np.ones((1, h, w), dtype=np.float32),
+                    "scores": np.array([0.5], dtype=np.float32),
                 }

         logger.warning("Using fallback SAM2 (no real segmentation)")
         return FallbackSAM2(self.device)
+
     def cleanup(self):
         """Clean up resources"""
+        self.adapter = None
+        if self.model is not None:
+            try:
+                del self.model
+            except Exception:
+                pass
         self.model = None
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+
     def get_info(self) -> Dict[str, Any]:
         """Get loader information"""
         return {
+            "loaded": self.adapter is not None,
             "model_id": self.model_id,
             "device": self.device,
             "load_time": self.load_time,
+            "model_type": type(self.model).__name__ if self.model else None,
+        }
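The adapter's VRAM caps are environment-driven, so a Space or script can tighten them without touching code. A small sketch with illustrative values (the variable names are the ones read in _SAM2Adapter.__init__):

import os

os.environ["SAM2_MAX_EDGE"] = "768"           # cap the longest working edge (default 1024)
os.environ["SAM2_TARGET_PIXELS"] = "600000"   # cap the working-area pixel budget (default 900000)
# os.environ["SAM2_ASSUME_BGR"] = "1"         # only if the pipeline really feeds BGR frames

Since these are read when the adapter is constructed inside load(), they must be set before load() is called.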