Spaces:
Build error
Build error
Commit
·
dccede6
1
Parent(s):
356f877
feat(app.py): [WIP] format transcription with gemini
Browse files
app.py
CHANGED
|
@@ -2,12 +2,14 @@ import random
|
|
| 2 |
import streamlit as st
|
| 3 |
import io
|
| 4 |
import os
|
|
|
|
| 5 |
from transformers import pipeline
|
| 6 |
import torch
|
| 7 |
import yt_dlp
|
| 8 |
from silero_vad import load_silero_vad, get_speech_timestamps
|
| 9 |
import numpy as np
|
| 10 |
import pydub
|
|
|
|
| 11 |
|
| 12 |
# --- Model Loading and Caching ---
|
| 13 |
@st.cache_resource
|
|
@@ -291,8 +293,9 @@ def process_transcription(video_url, vad_sensitivity, batch_size, transcriber, v
|
|
| 291 |
start_time = format_seconds(chunk['start'])
|
| 292 |
end_time = format_seconds(chunk['end'])
|
| 293 |
full_transcription += f"[{start_time} - {end_time}]: {chunk['text'].strip()}\n\n"
|
|
|
|
| 294 |
|
| 295 |
-
return full_transcription, audio_data, audio_format, info
|
| 296 |
|
| 297 |
def format_seconds(seconds):
|
| 298 |
"""Formats seconds into HH:MM:SS string."""
|
|
@@ -327,18 +330,26 @@ def download_video(video_url, video_format):
|
|
| 327 |
st.error(f"Error during video download: {e}")
|
| 328 |
return None, None, None
|
| 329 |
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
from transformers import pipeline
|
| 335 |
-
import torch
|
| 336 |
-
import yt_dlp
|
| 337 |
-
from silero_vad import load_silero_vad, get_speech_timestamps
|
| 338 |
-
import numpy as np
|
| 339 |
-
import pydub
|
| 340 |
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
|
| 343 |
def main():
|
| 344 |
"""Main function to run the Streamlit application."""
|
|
@@ -360,9 +371,9 @@ def main():
|
|
| 360 |
# Call setup_ui() to get UI element values
|
| 361 |
video_url, language, batch_size, transcribe_option, download_audio_option, download_video_option, process_button, vad_sensitivity, audio_format, video_format = setup_ui()
|
| 362 |
|
| 363 |
-
transcription_output = st.empty()
|
| 364 |
if st.session_state.full_transcription:
|
| 365 |
-
|
| 366 |
|
| 367 |
if process_button:
|
| 368 |
st.session_state.full_transcription = None
|
|
@@ -377,9 +388,12 @@ def main():
|
|
| 377 |
return
|
| 378 |
|
| 379 |
if transcribe_option:
|
| 380 |
-
st.session_state.full_transcription, st.session_state.audio_data, st.session_state.audio_format, st.session_state.info = process_transcription(video_url, vad_sensitivity, batch_size, transcriber, vad_model, audio_format, language)
|
| 381 |
if st.session_state.full_transcription:
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
if download_audio_option:
|
| 385 |
if st.session_state.audio_data is None or st.session_state.audio_format is None or st.session_state.info is None:
|
|
|
|
| 2 |
import streamlit as st
|
| 3 |
import io
|
| 4 |
import os
|
| 5 |
+
|
| 6 |
from transformers import pipeline
|
| 7 |
import torch
|
| 8 |
import yt_dlp
|
| 9 |
from silero_vad import load_silero_vad, get_speech_timestamps
|
| 10 |
import numpy as np
|
| 11 |
import pydub
|
| 12 |
+
from litellm import completion
|
| 13 |
|
| 14 |
# --- Model Loading and Caching ---
|
| 15 |
@st.cache_resource
|
|
|
|
| 293 |
start_time = format_seconds(chunk['start'])
|
| 294 |
end_time = format_seconds(chunk['end'])
|
| 295 |
full_transcription += f"[{start_time} - {end_time}]: {chunk['text'].strip()}\n\n"
|
| 296 |
+
formatted_transcription = format_transcript(full_transcription)
|
| 297 |
|
| 298 |
+
return full_transcription, formatted_transcription, audio_data, audio_format, info
|
| 299 |
|
| 300 |
def format_seconds(seconds):
|
| 301 |
"""Formats seconds into HH:MM:SS string."""
|
|
|
|
| 330 |
st.error(f"Error during video download: {e}")
|
| 331 |
return None, None, None
|
| 332 |
|
| 333 |
def format_transcript(input_transcription):
    """Reformat a raw timestamped transcription into readable text via Gemini.

    Sends the transcription to the ``gemini/gemini-2.0-flash-exp`` model
    through ``litellm.completion`` with a system prompt describing the
    formatting rules (sentence cleanup, quoting read text, removing
    repetitions) and returns the model's reply.

    Args:
        input_transcription: Raw transcription string, typically
            "[HH:MM:SS - HH:MM:SS]: text" chunks separated by blank lines.

    Returns:
        The formatted transcription text. Empty/blank input is returned
        unchanged to avoid a pointless API round-trip.

    Note:
        Requires the ``GEMINI_API_KEY`` environment variable for litellm's
        Gemini provider; propagates whatever litellm raises on API failure
        (auth, network, quota). Never hard-code the key here.
    """
    # Nothing to format — skip the API call entirely.
    if not input_transcription or not input_transcription.strip():
        return input_transcription

    sys_prompt = """
Video Transcription Formatting

As an LLM formatting provided video transcriptions (in any language), transform spoken language into clear, readable text. Prioritize readability, consistency, and context, adapting to the specific language conventions. **Do not hallucinate or add any information not present in the original transcript.**

* **Sentences:** Restructure long, rambling sentences; correct grammatical errors *while preserving the original meaning*; use proper punctuation appropriate for the language.
* **Reading:** Italicize/quote read text; clearly separate from explanations.
* **Repetitions:** Remove unnecessary repetitions unless for emphasis.
""".strip()

    messages = [
        {"role": "system", "content": sys_prompt},
        {
            "role": "user",
            "content": f"Format the following video transcription: {input_transcription}",
        },
    ]

    # litellm routes this to Google's Gemini API (key taken from the
    # environment — GEMINI_API_KEY).
    response = completion(model="gemini/gemini-2.0-flash-exp", messages=messages)
    formatted_text = response.choices[0].message.content
    return formatted_text
| 353 |
|
| 354 |
def main():
|
| 355 |
"""Main function to run the Streamlit application."""
|
|
|
|
| 371 |
# Call setup_ui() to get UI element values
|
| 372 |
video_url, language, batch_size, transcribe_option, download_audio_option, download_video_option, process_button, vad_sensitivity, audio_format, video_format = setup_ui()
|
| 373 |
|
| 374 |
+
# transcription_output = st.empty()
|
| 375 |
if st.session_state.full_transcription:
|
| 376 |
+
st.text_area("Transcription:", value=st.session_state.full_transcription, height=300, key=random.random())
|
| 377 |
|
| 378 |
if process_button:
|
| 379 |
st.session_state.full_transcription = None
|
|
|
|
| 388 |
return
|
| 389 |
|
| 390 |
if transcribe_option:
|
| 391 |
+
st.session_state.full_transcription, st.session_state.formatted_transcription, st.session_state.audio_data, st.session_state.audio_format, st.session_state.info = process_transcription(video_url, vad_sensitivity, batch_size, transcriber, vad_model, audio_format, language)
|
| 392 |
if st.session_state.full_transcription:
|
| 393 |
+
st.text_area("Transcription:", value=st.session_state.full_transcription, height=300, key=random.random())
|
| 394 |
+
if st.session_state.formatted_transcription:
|
| 395 |
+
st.text_area("Formatted Transcription:", value=st.session_state.formatted_transcription, height=300, key=random.random())
|
| 396 |
+
|
| 397 |
|
| 398 |
if download_audio_option:
|
| 399 |
if st.session_state.audio_data is None or st.session_state.audio_format is None or st.session_state.info is None:
|