import json
from pathlib import Path
from typing import Dict

from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

from config import VISION_MODEL

# Lazily-initialized BLIP singletons; populated on first use by _load_blip().
_processor = None
_model = None
# Run on GPU when available, otherwise fall back to CPU.
_device = "cuda" if torch.cuda.is_available() else "cpu"


def _load_blip():
    """Load the BLIP processor and model once and cache them at module level.

    Returns:
        A ``(processor, model)`` tuple. The model is moved to ``_device`` and
        set to eval mode. Subsequent calls return the cached pair.

    NOTE(review): not thread-safe; concurrent first calls could load twice.
    """
    global _processor, _model
    if _processor is None or _model is None:
        _processor = BlipProcessor.from_pretrained(VISION_MODEL)
        _model = BlipForConditionalGeneration.from_pretrained(VISION_MODEL).to(_device)
        _model.eval()
    return _processor, _model


def caption_image(img_path: Path, max_new_tokens: int = 40) -> str:
    """Generate a caption for a single image file with BLIP.

    Args:
        img_path: Path to the image file.
        max_new_tokens: Generation length budget (default 40, as before).

    Returns:
        The decoded caption string.
    """
    processor, model = _load_blip()
    # Use a context manager so the underlying file handle is closed promptly
    # (the original Image.open(...) leaked the handle until GC).
    with Image.open(img_path) as im:
        img = im.convert("RGB")
    inputs = processor(img, return_tensors="pt").to(_device)
    with torch.inference_mode():
        out_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return processor.decode(out_ids[0], skip_special_tokens=True)


def caption_folder(frames_dir: Path, pattern: str = "*.jpg") -> Dict[str, str]:
    """Caption every image in *frames_dir* matching *pattern*.

    Args:
        frames_dir: Directory containing frame images.
        pattern: Glob pattern selecting files (default ``"*.jpg"``).

    Returns:
        Mapping of filename -> caption, in sorted filename order.
    """
    return {p.name: caption_image(p) for p in sorted(frames_dir.glob(pattern))}


def dump_json(data, out_path: Path) -> None:
    """Write *data* as pretty-printed UTF-8 JSON to *out_path*."""
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)