File size: 4,985 Bytes
ba81a65 ffa3aaf 8d799e6 3589927 8d799e6 810585d 8d799e6 ffa3aaf 810585d 8d799e6 810585d ffa3aaf 8d799e6 810585d 8d799e6 ffa3aaf 8d799e6 810585d 8d799e6 810585d 8d799e6 810585d ffa3aaf 8d799e6 810585d ffa3aaf ba81a65 810585d ffa3aaf 8911fc8 8562af7 ffa3aaf ccc607d 8d799e6 ee7445f 8562af7 1ae9511 03195d9 ffa3aaf 8d799e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import os, json
import gradio as gr
from faster_whisper import WhisperModel
from moviepy.editor import VideoFileClip
def convert_video_to_audio(video_input):
    """Extract the audio track of a video file into an AAC (.m4a) file.

    Parameters:
        video_input: filesystem path to the source video.

    Returns:
        Path of the written audio file (same basename, ``.m4a`` extension).

    Raises:
        ValueError: if the video contains no audio track.
    """
    # os.path.splitext strips only the final extension; the previous
    # split('.')[0] truncated at the FIRST dot anywhere in the path
    # (e.g. "/home/user.name/clip.mp4" -> "/home/user").
    base, _ = os.path.splitext(video_input)
    audio_clip_filepath = os.path.normpath(f"{base}.m4a")
    video_clip = VideoFileClip(video_input)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Fail with a clear message instead of an AttributeError below.
            raise ValueError(f"No audio track found in {video_input!r}")
        audio_clip.write_audiofile(audio_clip_filepath, codec='aac')
        audio_clip.close()
    finally:
        # Always release the video handle, even if audio extraction fails.
        video_clip.close()
    return audio_clip_filepath
def convert_seconds_to_time(seconds):
    """Format a duration in seconds as an SRT timestamp, ``HH:MM:SS,mmm``."""
    total = float(seconds)
    hrs = int(total // 3600)
    mins = int((total % 3600) // 60)
    secs_float = total % 60
    secs = int(secs_float)
    # SRT uses a comma before the (truncated) millisecond component.
    millis = int((secs_float - secs) * 1000)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"
def write_srt(segments, max_words_per_line, srt_path, device_type):
# Pause and char heuristics
max_chars = 23 if device_type == "mobile" else 42
pause_threshold = 2.0
with open(srt_path, "w", encoding="utf-8") as file:
result = ""
result_clean = []
json_output = {"lines": []}
line_counter = 1
words_in_line = []
for segment in segments:
for word in segment.words:
# Check if adding this word breaks char limit
tentative_line = " ".join([w.word.strip() for w in words_in_line + [word]])
# Detect pause (gap from previous word)
long_pause = False
if words_in_line:
prev_word = words_in_line[-1]
if word.start - prev_word.end >= pause_threshold:
long_pause = True
word_overflow = len(words_in_line) >= max_words_per_line
char_overflow = len(tentative_line) > max_chars
# Break conditions
if (word_overflow or char_overflow or long_pause):
# Finalize current line
if words_in_line:
start_time = convert_seconds_to_time(words_in_line[0].start)
end_time = convert_seconds_to_time(words_in_line[-1].end)
line_text = " ".join([w.word.strip() for w in words_in_line])
# SRT
result += f"{line_counter}\n{start_time} --> {end_time}\n{line_text}\n\n"
result_clean.append(line_text)
# JSON
json_output["lines"].append({
"line_index": line_counter,
"start": words_in_line[0].start,
"end": words_in_line[-1].end,
"text": line_text,
"words": [
{"word": w.word.strip(), "start": w.start, "end": w.end}
for w in words_in_line
]
})
line_counter += 1
# Start a fresh line with the current word
words_in_line = [word]
else:
# keep adding words
words_in_line.append(word)
# Flush last line
if words_in_line:
start_time = convert_seconds_to_time(words_in_line[0].start)
end_time = convert_seconds_to_time(words_in_line[-1].end)
line_text = " ".join([w.word.strip() for w in words_in_line])
result += f"{line_counter}\n{start_time} --> {end_time}\n{line_text}\n\n"
result_clean.append(line_text)
json_output["lines"].append({
"line_index": line_counter,
"start": words_in_line[0].start,
"end": words_in_line[-1].end,
"text": line_text,
"words": [
{"word": w.word.strip(), "start": w.start, "end": w.end}
for w in words_in_line
]
})
file.write(result)
return result, srt_path, " ".join(result_clean), json.dumps(json_output)
def transcriber(file_input: gr.File,
                file_type: str,
                max_words_per_line: int,
                task: str,
                model_version: str,
                device_type: str):
    """Transcribe (or translate) a media file into SRT subtitles.

    Parameters:
        file_input: path to the uploaded audio/video file.
        file_type: "video" triggers audio extraction first; anything else
            is treated as an audio file directly.
        max_words_per_line: maximum words per subtitle line.
        task: faster-whisper task, e.g. "transcribe" or "translate".
        model_version: Whisper model name/size to load.
        device_type: "mobile" or "desktop"; controls subtitle line width.

    Returns:
        The 4-tuple from write_srt: (srt_text, srt_path, plain_text, json).
    """
    # splitext only removes the final extension; the previous
    # split('.')[0] truncated paths containing any other dot.
    base, _ = os.path.splitext(file_input)
    srt_filepath = os.path.normpath(f"{base}.srt")
    if file_type == "video":
        audio_input = convert_video_to_audio(file_input)
    else:
        audio_input = file_input
    # int8 keeps memory low; "auto" picks GPU when available, else CPU.
    model = WhisperModel(model_version, device="auto", compute_type="int8")
    segments, _ = model.transcribe(
        audio_input,
        beam_size=5,
        task=task,
        vad_filter=True,
        vad_parameters=dict(min_silence_duration_ms=500),
        word_timestamps=True
    )
    return write_srt(segments=segments, max_words_per_line=max_words_per_line, srt_path=srt_filepath, device_type=device_type)