|
import subprocess, shutil, torch, os, tempfile |
|
from transformers import pipeline |
|
import imageio_ffmpeg as ffmpeg_helper |
|
from logging_config import logger |
|
|
|
# Pick the compute device once at import time; mapped to an HF pipeline
# device index (0 = first GPU, -1 = CPU) where it is consumed.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
def ensure_ffmpeg() -> str:
    """Ensure an ffmpeg executable is available and return its path.

    Returns:
        Absolute path to an ffmpeg executable: the one already on PATH if
        present, otherwise the binary auto-downloaded by imageio-ffmpeg.

    Note:
        The imageio-ffmpeg download is NOT named plain ``ffmpeg`` (it carries
        a platform/version suffix), so merely prepending its directory to
        PATH does not make a bare ``ffmpeg`` command resolvable.  Callers
        should prefer the returned path.  The PATH mutation is kept for
        backward compatibility with code that relied on it.
    """
    found = shutil.which("ffmpeg")
    if found:
        return found
    # Triggers a one-time download of a static ffmpeg build if needed.
    ffmpeg_bin = ffmpeg_helper.get_ffmpeg_exe()
    os.environ["PATH"] = os.path.dirname(ffmpeg_bin) + os.pathsep + os.environ.get("PATH", "")
    return ffmpeg_bin
|
|
|
|
|
def to_wav(src: str) -> str:
    """Convert any audio/video file to 16 kHz mono WAV for the Whisper pipeline.

    Args:
        src: Path to the source media file (anything ffmpeg can read).

    Returns:
        Path to a newly created temporary ``.wav`` file; the caller is
        responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero
        (``check=True``).
    """
    ensure_ffmpeg()
    # Resolve the concrete executable: the imageio-ffmpeg download is not
    # named plain "ffmpeg", so a bare "ffmpeg" argv can fail even after
    # ensure_ffmpeg() has run.
    ffmpeg_cmd = shutil.which("ffmpeg") or ffmpeg_helper.get_ffmpeg_exe()
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp; close the
    # fd immediately -- ffmpeg writes the file itself (-y overwrites the stub).
    fd, wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    subprocess.run(
        [
            ffmpeg_cmd,
            "-hide_banner",
            "-loglevel", "error",
            "-i", src,
            "-ar", "16000",  # 16 kHz sample rate expected by Whisper
            "-ac", "1",      # downmix to mono
            "-y",            # overwrite the pre-created temp file
            wav,
        ],
        check=True,  # surface conversion failures as an exception
    )
    return wav
|
|
|
|
|
def run_whisper_transcription(src: str):
    """Run OpenAI Whisper-small via the HF ASR pipeline and return segments.

    Args:
        src: Path to any audio/video file ffmpeg can decode; it is converted
            to 16 kHz mono WAV first via :func:`to_wav`.

    Returns:
        List of dicts with keys ``"text"``, ``"start"``, ``"end"`` (seconds),
        one per non-empty transcription chunk.  NOTE(review): HF chunked
        decoding can emit ``None`` as the final chunk's end timestamp --
        callers should tolerate ``end is None``; confirm against consumers.
    """
    wav = to_wav(src)
    # Pipeline is rebuilt per call; model weights are cached on disk by HF,
    # so repeat calls pay only the construction cost, not a re-download.
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        device=0 if DEVICE == "cuda" else -1,  # HF device index: 0=GPU, -1=CPU
        return_timestamps=True,
        chunk_length_s=30,   # Whisper's native 30 s window
        stride_length_s=5,   # overlap used to stitch chunk boundaries
        generate_kwargs={"task": "transcribe", "language": "en"},
    )
    logger.info("Starting Whisper …")  # fixed mojibake ("β¦" -> "…")
    result = asr(wav)
    # Flatten HF's (start, end) timestamp tuples and drop whitespace-only chunks.
    segments = [
        {
            "text": c["text"].strip(),
            "start": c["timestamp"][0],
            "end": c["timestamp"][1],
        }
        for c in result["chunks"]
        if c["text"].strip()
    ]
    logger.info("Transcribed %d segments", len(segments))
    return segments
|
|