File size: 1,816 Bytes
45b9636 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import subprocess, shutil, torch, os, tempfile
from transformers import pipeline
import imageio_ffmpeg as ffmpeg_helper
from logging_config import logger
# Select the inference device once at import time: GPU ("cuda") when
# available, otherwise CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def ensure_ffmpeg():
    """Make sure an ``ffmpeg`` executable is reachable through PATH.

    If the system already provides ffmpeg, nothing happens. Otherwise the
    binary bundled/auto-downloaded by imageio-ffmpeg is located and its
    directory is prepended to ``PATH`` so later subprocess calls find it.
    """
    if shutil.which("ffmpeg") is not None:
        return  # system ffmpeg already on PATH
    bundled_exe = ffmpeg_helper.get_ffmpeg_exe()
    bin_dir = os.path.dirname(bundled_exe)
    current_path = os.environ.get("PATH", "")
    os.environ["PATH"] = bin_dir + os.pathsep + current_path
def to_wav(src: str) -> str:
    """Convert any audio/video file to the 16 kHz mono WAV required by the
    Whisper HF pipeline.

    Parameters
    ----------
    src : str
        Path to the input media file (anything ffmpeg can decode).

    Returns
    -------
    str
        Path of a newly created temporary ``.wav`` file. The caller is
        responsible for deleting it.

    Raises
    ------
    subprocess.CalledProcessError
        If ffmpeg exits with a non-zero status (``check=True``).
    """
    ensure_ffmpeg()
    # tempfile.mktemp() is deprecated and race-prone (another process can
    # claim the name before ffmpeg opens it). mkstemp() creates the file
    # atomically; we close the fd immediately because ffmpeg re-opens the
    # path itself and "-y" lets it overwrite the pre-created empty file.
    fd, wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        subprocess.run(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel",
                "error",
                "-i",
                src,
                "-ar",
                "16000",  # 16 kHz sample rate expected by Whisper
                "-ac",
                "1",  # mono
                "-y",  # overwrite the already-created temp file
                wav,
            ],
            check=True,
        )
    except BaseException:
        # Don't leak the temp file when the conversion fails.
        try:
            os.remove(wav)
        except OSError:
            pass
        raise
    return wav
def run_whisper_transcription(src: str):
    """Run OpenAI Whisper-small via the HF ASR pipeline and return segments.

    Parameters
    ----------
    src : str
        Path to any audio/video file ffmpeg can decode.

    Returns
    -------
    list[dict]
        One dict per non-empty chunk with keys ``"text"`` (stripped),
        ``"start"`` and ``"end"`` (seconds; ``end`` may be ``None`` when
        the model cannot place the final chunk boundary).
    """
    wav = to_wav(src)
    try:
        asr = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            device=0 if DEVICE == "cuda" else -1,
            return_timestamps=True,
            # Long-form audio is processed in overlapping 30 s windows.
            chunk_length_s=30,
            stride_length_s=5,
            generate_kwargs={"task": "transcribe", "language": "en"},
        )
        logger.info("Starting Whisper …")
        result = asr(wav)
    finally:
        # to_wav() creates a fresh temp file per call; remove it so repeated
        # transcriptions don't fill up the temp directory.
        try:
            os.remove(wav)
        except OSError:
            pass
    segments = [
        {
            "text": c["text"].strip(),
            "start": c["timestamp"][0],
            "end": c["timestamp"][1],
        }
        for c in result["chunks"]
        if c["text"].strip()  # drop chunks that are only whitespace
    ]
    logger.info("Transcribed %d segments", len(segments))
    return segments
|