"""ClipQuery — transcription.py

Helpers for transcribing audio/video files with the Hugging Face
Whisper ASR pipeline (ffmpeg conversion + openai/whisper-small).
"""
import subprocess, shutil, torch, os, tempfile
from transformers import pipeline
import imageio_ffmpeg as ffmpeg_helper
from logging_config import logger
# Pick GPU when available; run_whisper_transcription maps this onto the
# HF pipeline's device argument (0 for CUDA, -1 for CPU).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def ensure_ffmpeg():
    """Make sure an ``ffmpeg`` executable is reachable via PATH.

    If the system already provides one, do nothing; otherwise fall back to
    the binary bundled by imageio-ffmpeg (downloaded on first use) and
    prepend its directory to ``PATH`` for this process.
    """
    if shutil.which("ffmpeg") is not None:
        return  # system ffmpeg already available
    bundled = ffmpeg_helper.get_ffmpeg_exe()
    current_path = os.environ.get("PATH", "")
    os.environ["PATH"] = os.pathsep.join([os.path.dirname(bundled), current_path])
def to_wav(src: str) -> str:
    """Convert any audio/video file to the 16 kHz mono WAV Whisper expects.

    Args:
        src: Path to an input media file (anything ffmpeg can decode).

    Returns:
        Path to a newly created temporary ``.wav`` file. The caller owns
        the file and is responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    ensure_ffmpeg()
    # mkstemp creates the file atomically, unlike the deprecated and
    # race-prone tempfile.mktemp; we only need the path, so close the fd.
    fd, wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        subprocess.run(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel", "error",
                "-i", src,
                "-ar", "16000",  # 16 kHz sample rate required by Whisper
                "-ac", "1",      # mono
                "-y",            # overwrite the pre-created temp file
                wav,
            ],
            check=True,
        )
    except Exception:
        # Don't leak the temp file when conversion fails.
        try:
            os.remove(wav)
        except OSError:
            pass
        raise
    return wav
def run_whisper_transcription(src: str):
    """Run OpenAI Whisper-small via the HF ASR pipeline and return segments.

    Args:
        src: Path to an audio/video file; it is first converted to a
            16 kHz mono WAV via :func:`to_wav`.

    Returns:
        List of ``{"text", "start", "end"}`` dicts (timestamps in seconds,
        per the pipeline's chunk timestamps); empty-text chunks are dropped.
    """
    wav = to_wav(src)
    try:
        asr = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            device=0 if DEVICE == "cuda" else -1,
            return_timestamps=True,
            chunk_length_s=30,
            stride_length_s=5,
            generate_kwargs={"task": "transcribe", "language": "en"},
        )
        logger.info("Starting Whisper …")
        result = asr(wav)
    finally:
        # to_wav hands us a throwaway temp file; remove it even on failure
        # so repeated calls don't accumulate WAVs in the temp directory.
        try:
            os.remove(wav)
        except OSError:
            pass
    segments = []
    for chunk in result["chunks"]:
        text = chunk["text"].strip()
        if not text:
            continue
        start, end = chunk["timestamp"]
        # The HF pipeline can report None as the final chunk's end timestamp;
        # fall back to the start so callers never see a None "end".
        segments.append(
            {"text": text, "start": start, "end": start if end is None else end}
        )
    logger.info("Transcribed %d segments", len(segments))
    return segments