Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,8 @@ from config import MAX_TEXT_LEN
|
|
11 |
from data import selective_smoothing, GLOBAL_MEAN_T, GLOBAL_STD_T
|
12 |
from model import TextToPoseSeq2Seq
|
13 |
from transformers import BertTokenizer
|
|
|
|
|
14 |
|
15 |
# === Tokenizer and Model Init ===
|
16 |
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
|
@@ -34,6 +36,18 @@ video_df = pd.read_csv("annotated_vid_link.csv")
|
|
34 |
video_df["text_clean"] = video_df["text"].str.strip().str.lower()
|
35 |
video_lookup = dict(zip(video_df["text_clean"], video_df["Video URL"]))
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
def get_youtube_link(input_text):
    """Look up the reference video URL for a given text.

    The key is normalized the same way `video_lookup` was built
    (stripped, lower-cased); returns None when there is no match.
    """
    key = input_text.strip().lower()
    return video_lookup.get(key)
|
39 |
|
@@ -252,7 +266,13 @@ with gr.Blocks() as demo:
|
|
252 |
|
253 |
with gr.Row():
|
254 |
with gr.Column(scale=1):
|
255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
threshold_slider = gr.Slider(0.0, 1.0, value=0.05, step=0.05, label="Confidence Threshold (for displaying joints)")
|
257 |
show_video_toggle = gr.Checkbox(label="Show Video Previews", value=True)
|
258 |
submit_btn = gr.Button("Submit")
|
@@ -266,6 +286,12 @@ with gr.Blocks() as demo:
|
|
266 |
submit_btn.click(fn=predict,
|
267 |
inputs=[text_input, threshold_slider, show_video_toggle],
|
268 |
outputs=[video_output, text_output, youtube_output])
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
|
270 |
clear_btn.click(lambda: ("", "", ""),
|
271 |
inputs=[],
|
|
|
11 |
from data import selective_smoothing, GLOBAL_MEAN_T, GLOBAL_STD_T
|
12 |
from model import TextToPoseSeq2Seq
|
13 |
from transformers import BertTokenizer
|
14 |
+
#whisper
|
15 |
+
from faster_whisper import WhisperModel
|
16 |
|
17 |
# === Tokenizer and Model Init ===
|
18 |
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
|
|
|
36 |
video_df["text_clean"] = video_df["text"].str.strip().str.lower()
|
37 |
video_lookup = dict(zip(video_df["text_clean"], video_df["Video URL"]))
|
38 |
|
39 |
+
# === Load Whisper ===
|
40 |
+
whisper_model = WhisperModel("small", compute_type="int8")
|
41 |
+
|
42 |
+
def transcribe_audio(audio_path):
    """Transcribe a recorded audio file to Malay text with faster-whisper.

    Parameters
    ----------
    audio_path : str or None
        Filesystem path to the recording, as produced by the Gradio
        microphone component with type="filepath". Gradio passes None
        when the user submits without recording.

    Returns
    -------
    str
        The whitespace-trimmed transcript, or "" when no audio was
        provided or transcription failed. Errors are printed, never
        raised, so the UI callback always receives a string.
    """
    # Fast-path the no-recording case instead of letting whisper crash
    # and relying on the except below.
    if not audio_path:
        return ""
    try:
        # language="ms" pins Malay; beam_size=5 trades speed for accuracy.
        segments, _ = whisper_model.transcribe(audio_path, language="ms", beam_size=5)
        # Generator avoids materializing an intermediate list of segments.
        full_text = " ".join(segment.text.strip() for segment in segments)
        return full_text.strip()
    except Exception as e:
        # Deliberate best-effort: log and fall back to an empty transcript
        # so downstream predict() still gets a usable string.
        print("Whisper Error:", e)
        return ""
|
50 |
+
|
51 |
def get_youtube_link(input_text):
|
52 |
return video_lookup.get(input_text.strip().lower())
|
53 |
|
|
|
266 |
|
267 |
with gr.Row():
|
268 |
with gr.Column(scale=1):
|
269 |
+
with gr.Tab("Text Input"): #whisper
|
270 |
+
text_input = gr.Textbox(label="Enter Malay Word or Sentence")
|
271 |
+
with gr.Tab("Speech Input"):
|
272 |
+
audio_input = gr.Audio(source="microphone", type="filepath", label="Record Malay Audio")
|
273 |
+
audio_transcript = gr.Textbox(label="Transcribed Text", interactive=True)
|
274 |
+
transcribe_btn = gr.Button("Transcribe") #whisper
|
275 |
+
|
276 |
threshold_slider = gr.Slider(0.0, 1.0, value=0.05, step=0.05, label="Confidence Threshold (for displaying joints)")
|
277 |
show_video_toggle = gr.Checkbox(label="Show Video Previews", value=True)
|
278 |
submit_btn = gr.Button("Submit")
|
|
|
286 |
submit_btn.click(fn=predict,
|
287 |
inputs=[text_input, threshold_slider, show_video_toggle],
|
288 |
outputs=[video_output, text_output, youtube_output])
|
289 |
+
|
290 |
+
transcribe_btn.click(transcribe_audio,
|
291 |
+
inputs=audio_input, outputs=audio_transcript)
|
292 |
+
audio_transcript.change(predict,
|
293 |
+
inputs=[audio_transcript, threshold_slider, show_video_toggle],
|
294 |
+
outputs=[video_output, text_output, youtube_output])
|
295 |
|
296 |
clear_btn.click(lambda: ("", "", ""),
|
297 |
inputs=[],
|