import os
import shutil
import subprocess
import tempfile

import imageio_ffmpeg as ffmpeg_helper
import torch
from transformers import pipeline

from logging_config import logger

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def ensure_ffmpeg():
    """Ensure an ffmpeg binary is on PATH (falls back to imageio-ffmpeg's bundled build)."""
    if shutil.which("ffmpeg"):
        return
    ffmpeg_bin = ffmpeg_helper.get_ffmpeg_exe()
    os.environ["PATH"] = os.path.dirname(ffmpeg_bin) + os.pathsep + os.environ.get("PATH", "")


def to_wav(src: str) -> str:
    """Convert any audio/video file to the 16 kHz mono WAV required by the Whisper HF pipeline."""
    ensure_ffmpeg()
    # NamedTemporaryFile avoids the race condition inherent in tempfile.mktemp().
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav = tmp.name
    subprocess.run(
        [
            "ffmpeg",
            "-hide_banner",
            "-loglevel", "error",
            "-i", src,
            "-ar", "16000",  # resample to 16 kHz
            "-ac", "1",      # downmix to mono
            "-y", wav,
        ],
        check=True,
    )
    return wav


def run_whisper_transcription(src: str):
    """Run openai/whisper-small via the HF pipeline and return a list of timestamped segments."""
    wav = to_wav(src)
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        device=0 if DEVICE == "cuda" else -1,
        return_timestamps=True,
        chunk_length_s=30,  # Whisper's native 30 s window
        stride_length_s=5,  # overlap between chunks so words at boundaries aren't clipped
        generate_kwargs={"task": "transcribe", "language": "en"},
    )
    logger.info("Starting Whisper …")
    try:
        result = asr(wav)
    finally:
        os.remove(wav)  # clean up the temporary WAV even if transcription fails
    segments = [
        {
            "text": c["text"].strip(),
            "start": c["timestamp"][0],
            "end": c["timestamp"][1],
        }
        for c in result["chunks"]
        if c["text"].strip()
    ]
    logger.info("Transcribed %d segments", len(segments))
    return segments
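
# Minimal usage sketch. The input path "meeting.mp4" and this __main__ guard are
# illustrative assumptions, not part of the module; any ffmpeg-readable audio or
# video file works. Note that the HF pipeline may report None as the end timestamp
# of the final chunk, so the segments are printed without numeric formatting.
if __name__ == "__main__":
    for seg in run_whisper_transcription("meeting.mp4"):
        print(f"{seg['start']}–{seg['end']}: {seg['text']}")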