Commit 4620163 · verified · 1 Parent(s): 1cde223
hantupocong committed

Update app.py

Files changed (1): app.py (+27 -1)
app.py CHANGED

@@ -11,6 +11,8 @@ from config import MAX_TEXT_LEN
 from data import selective_smoothing, GLOBAL_MEAN_T, GLOBAL_STD_T
 from model import TextToPoseSeq2Seq
 from transformers import BertTokenizer
+#whisper
+from faster_whisper import WhisperModel
 
 # === Tokenizer and Model Init ===
 tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
@@ -34,6 +36,18 @@ video_df = pd.read_csv("annotated_vid_link.csv")
 video_df["text_clean"] = video_df["text"].str.strip().str.lower()
 video_lookup = dict(zip(video_df["text_clean"], video_df["Video URL"]))
 
+# === Load Whisper ===
+whisper_model = WhisperModel("small", compute_type="int8")
+
+def transcribe_audio(audio_path):
+    try:
+        segments, _ = whisper_model.transcribe(audio_path, language="ms", beam_size=5)
+        full_text = " ".join([segment.text.strip() for segment in segments])
+        return full_text.strip()
+    except Exception as e:
+        print("Whisper Error:", e)
+        return ""
+
 def get_youtube_link(input_text):
     return video_lookup.get(input_text.strip().lower())
 
@@ -252,7 +266,13 @@ with gr.Blocks() as demo:
 
     with gr.Row():
        with gr.Column(scale=1):
-            text_input = gr.Textbox(label="Enter Malay Word or Sentence")
+            with gr.Tab("Text Input"): #whisper
+                text_input = gr.Textbox(label="Enter Malay Word or Sentence")
+            with gr.Tab("Speech Input"):
+                audio_input = gr.Audio(source="microphone", type="filepath", label="Record Malay Audio")
+                audio_transcript = gr.Textbox(label="Transcribed Text", interactive=True)
+                transcribe_btn = gr.Button("Transcribe") #whisper
+
            threshold_slider = gr.Slider(0.0, 1.0, value=0.05, step=0.05, label="Confidence Threshold (for displaying joints)")
            show_video_toggle = gr.Checkbox(label="Show Video Previews", value=True)
            submit_btn = gr.Button("Submit")
@@ -266,6 +286,12 @@ with gr.Blocks() as demo:
    submit_btn.click(fn=predict,
                     inputs=[text_input, threshold_slider, show_video_toggle],
                     outputs=[video_output, text_output, youtube_output])
+
+    transcribe_btn.click(transcribe_audio,
+                         inputs=audio_input, outputs=audio_transcript)
+    audio_transcript.change(predict,
+                            inputs=[audio_transcript, threshold_slider, show_video_toggle],
+                            outputs=[video_output, text_output, youtube_output])
 
    clear_btn.click(lambda: ("", "", ""),
                    inputs=[],
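
For reference, the transcription pattern this commit adds can be exercised on its own, outside the Gradio app. The sketch below mirrors the commit's settings (the "small" model, int8 compute, the Malay language hint, beam size 5); the file name sample.wav and the printed language info are illustrative assumptions, not part of the commit.

# Minimal sketch: transcribe a Malay audio clip with faster-whisper, mirroring
# the commit's settings. "sample.wav" is a placeholder path, not a repo file.
from faster_whisper import WhisperModel

model = WhisperModel("small", compute_type="int8")  # weights download on first use

segments, info = model.transcribe("sample.wav", language="ms", beam_size=5)
text = " ".join(segment.text.strip() for segment in segments)

print("Detected language:", info.language, info.language_probability)
print("Transcript:", text)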
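On the UI side, the change routes the microphone recording through transcribe_audio and then into the existing predict function whenever the transcript textbox changes. Below is a minimal, self-contained sketch of that event wiring with a stub standing in for the app's real predict; it assumes Gradio 3.x, where gr.Audio accepts source="microphone" as used in the commit (Gradio 4.x renamed this parameter to sources).

# Minimal sketch of the commit's UI wiring, with a stub in place of the real
# pose-generation predict(). Assumes Gradio 3.x and faster-whisper installed.
import gradio as gr
from faster_whisper import WhisperModel

whisper_model = WhisperModel("small", compute_type="int8")

def transcribe_audio(audio_path):
    # Return an empty string if nothing was recorded yet.
    if not audio_path:
        return ""
    segments, _ = whisper_model.transcribe(audio_path, language="ms", beam_size=5)
    return " ".join(s.text.strip() for s in segments).strip()

def echo_predict(text):
    # Placeholder for the app's predict(); just echoes the input text.
    return f"Would generate sign-pose output for: {text!r}"

with gr.Blocks() as demo:
    with gr.Tab("Text Input"):
        text_input = gr.Textbox(label="Enter Malay Word or Sentence")
    with gr.Tab("Speech Input"):
        audio_input = gr.Audio(source="microphone", type="filepath", label="Record Malay Audio")
        audio_transcript = gr.Textbox(label="Transcribed Text", interactive=True)
        transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Result")

    # Same event chain as the commit: "Transcribe" fills the textbox, and any
    # change to the transcript triggers prediction on the transcribed text.
    transcribe_btn.click(transcribe_audio, inputs=audio_input, outputs=audio_transcript)
    audio_transcript.change(echo_predict, inputs=audio_transcript, outputs=output)

demo.launch()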