File size: 1,816 Bytes
45b9636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import subprocess, shutil, torch, os, tempfile
from transformers import pipeline
import imageio_ffmpeg as ffmpeg_helper
from logging_config import logger

# Torch device name used for inference: "cuda" when torch reports an available
# CUDA device, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def ensure_ffmpeg():
    """Make sure a command literally named ``ffmpeg`` can be resolved via PATH.

    If ``ffmpeg`` is already on PATH nothing is changed. Otherwise the binary
    reported by ``imageio_ffmpeg.get_ffmpeg_exe()`` is used. That bundled binary
    usually has a platform/version-suffixed file name (for example
    ``ffmpeg-linux64-v4.2.2``), so putting its directory on PATH is not enough
    for callers that spawn plain ``"ffmpeg"``. In that case a private temporary
    directory holding a symlink (or, where symlinks are unavailable, a copy)
    named ``ffmpeg`` is created and prepended to PATH instead. The shim
    directory is removed at interpreter exit.

    Returns:
        None. The only effect is a possible update of ``os.environ["PATH"]``.

    Raises:
        Whatever ``get_ffmpeg_exe()`` raises when no binary can be located.
    """
    if shutil.which("ffmpeg"):
        return

    ffmpeg_bin = os.path.abspath(ffmpeg_helper.get_ffmpeg_exe())
    bin_dir = os.path.dirname(ffmpeg_bin)
    exe_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"

    # normcase lower-cases on Windows and is a no-op on POSIX, so the
    # comparison follows the platform's file-name case rules.
    if os.path.normcase(os.path.basename(ffmpeg_bin)) != exe_name:
        import atexit

        # mkdtemp gives a fresh directory only this user can write to, so
        # nobody else can plant a different "ffmpeg" in it.
        bin_dir = tempfile.mkdtemp(prefix="ffmpeg-shim-")
        atexit.register(shutil.rmtree, bin_dir, ignore_errors=True)
        shim = os.path.join(bin_dir, exe_name)
        try:
            os.symlink(ffmpeg_bin, shim)
        except (OSError, NotImplementedError, AttributeError):
            # Symlinks can be unavailable (e.g. unprivileged Windows); copy instead.
            shutil.copy2(ffmpeg_bin, shim)

    # Never prepend an empty entry: on POSIX that would mean "current directory".
    if bin_dir:
        os.environ["PATH"] = bin_dir + os.pathsep + os.environ.get("PATH", "")


def to_wav(src: str) -> str:
    """Convert an audio/video file to a 16 kHz mono WAV file using ffmpeg.

    Args:
        src: Path of the input media file, passed to ffmpeg as ``-i``.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
            The temporary file is removed before the error propagates.
    """
    ensure_ffmpeg()

    # mkstemp creates the file atomically; the deprecated mktemp only returns
    # a name, which another process could claim before ffmpeg opens it.
    fd, wav = tempfile.mkstemp(suffix=".wav")
    # ffmpeg reopens the path itself; "-y" lets it overwrite the empty placeholder.
    os.close(fd)

    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel",
        "error",
        "-i",
        src,
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        wav,
    ]
    try:
        subprocess.run(command, check=True)
    except BaseException:
        # Do not leave an empty or partial file behind on a failed conversion.
        try:
            os.remove(wav)
        except OSError:
            pass
        raise
    return wav


def run_whisper_transcription(src: str):
    """Transcribe a media file with openai/whisper-small via the HF pipeline.

    The input is first converted to a temporary WAV file with ``to_wav``; that
    file is deleted again before this function returns or raises.

    Args:
        src: Path of the audio/video file to transcribe.

    Returns:
        A list of dicts with keys ``"text"`` (stripped chunk text), ``"start"``
        and ``"end"`` (the two elements of the chunk's ``"timestamp"``). Chunks
        whose text is empty after stripping are dropped.
        NOTE(review): timestamps are passed through exactly as the pipeline
        returns them; confirm whether ``end`` can be None for the last chunk.
    """
    wav = to_wav(src)
    try:
        asr = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            # The pipeline is given a device index: 0 when DEVICE is "cuda", else -1.
            device=0 if DEVICE == "cuda" else -1,
            return_timestamps=True,
            chunk_length_s=30,
            stride_length_s=5,
            generate_kwargs={"task": "transcribe", "language": "en"},
        )
        logger.info("Starting Whisper …")
        result = asr(wav)
    finally:
        # The WAV is only an intermediate artifact; remove it even on failure
        # so repeated calls do not fill the temp directory.
        try:
            os.remove(wav)
        except OSError as exc:
            logger.warning("Could not remove temporary file %s: %s", wav, exc)

    segments = [
        {
            "text": c["text"].strip(),
            "start": c["timestamp"][0],
            "end": c["timestamp"][1],
        }
        for c in result["chunks"]
        if c["text"].strip()
    ]
    logger.info("Transcribed %d segments", len(segments))
    return segments