Commit 4620163 · verified · 1 Parent(s): 1cde223
hantupocong committed

Update app.py

Files changed (1): app.py (+27 -1)
app.py CHANGED

@@ -11,6 +11,8 @@ from config import MAX_TEXT_LEN
 from data import selective_smoothing, GLOBAL_MEAN_T, GLOBAL_STD_T
 from model import TextToPoseSeq2Seq
 from transformers import BertTokenizer
+#whisper
+from faster_whisper import WhisperModel
 
 # === Tokenizer and Model Init ===
 tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
@@ -34,6 +36,18 @@ video_df = pd.read_csv("annotated_vid_link.csv")
 video_df["text_clean"] = video_df["text"].str.strip().str.lower()
 video_lookup = dict(zip(video_df["text_clean"], video_df["Video URL"]))
 
+# === Load Whisper ===
+whisper_model = WhisperModel("small", compute_type="int8")
+
+def transcribe_audio(audio_path):
+    try:
+        segments, _ = whisper_model.transcribe(audio_path, language="ms", beam_size=5)
+        full_text = " ".join([segment.text.strip() for segment in segments])
+        return full_text.strip()
+    except Exception as e:
+        print("Whisper Error:", e)
+        return ""
+
 def get_youtube_link(input_text):
     return video_lookup.get(input_text.strip().lower())
 
@@ -252,7 +266,13 @@ with gr.Blocks() as demo:
 
     with gr.Row():
        with gr.Column(scale=1):
-            text_input = gr.Textbox(label="Enter Malay Word or Sentence")
+            with gr.Tab("Text Input"): #whisper
+                text_input = gr.Textbox(label="Enter Malay Word or Sentence")
+            with gr.Tab("Speech Input"):
+                audio_input = gr.Audio(source="microphone", type="filepath", label="Record Malay Audio")
+                audio_transcript = gr.Textbox(label="Transcribed Text", interactive=True)
+                transcribe_btn = gr.Button("Transcribe") #whisper
+
            threshold_slider = gr.Slider(0.0, 1.0, value=0.05, step=0.05, label="Confidence Threshold (for displaying joints)")
            show_video_toggle = gr.Checkbox(label="Show Video Previews", value=True)
            submit_btn = gr.Button("Submit")
@@ -266,6 +286,12 @@ with gr.Blocks() as demo:
    submit_btn.click(fn=predict,
                     inputs=[text_input, threshold_slider, show_video_toggle],
                     outputs=[video_output, text_output, youtube_output])
+
+    transcribe_btn.click(transcribe_audio,
+                         inputs=audio_input, outputs=audio_transcript)
+    audio_transcript.change(predict,
+                            inputs=[audio_transcript, threshold_slider, show_video_toggle],
+                            outputs=[video_output, text_output, youtube_output])
 
    clear_btn.click(lambda: ("", "", ""),
                    inputs=[],
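
For reference, the transcription pattern this commit adds can be exercised on its own, outside the Gradio app. The sketch below mirrors the commit's settings (the "small" model, int8 compute, the Malay language hint, beam size 5); the file name sample.wav and the printed language info are illustrative assumptions, not part of the commit.

# Minimal sketch: transcribe a Malay audio clip with faster-whisper, mirroring
# the commit's settings. "sample.wav" is a placeholder path, not a repo file.
from faster_whisper import WhisperModel

model = WhisperModel("small", compute_type="int8")  # weights download on first use

segments, info = model.transcribe("sample.wav", language="ms", beam_size=5)
text = " ".join(segment.text.strip() for segment in segments)

print("Detected language:", info.language, info.language_probability)
print("Transcript:", text)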
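On the UI side, the change routes the microphone recording through transcribe_audio and then into the existing predict function whenever the transcript textbox changes. Below is a minimal, self-contained sketch of that event wiring with a stub standing in for the app's real predict; it assumes Gradio 3.x, where gr.Audio accepts source="microphone" as used in the commit (Gradio 4.x renamed this parameter to sources).

# Minimal sketch of the commit's UI wiring, with a stub in place of the real
# pose-generation predict(). Assumes Gradio 3.x and faster-whisper installed.
import gradio as gr
from faster_whisper import WhisperModel

whisper_model = WhisperModel("small", compute_type="int8")

def transcribe_audio(audio_path):
    # Return an empty string if nothing was recorded yet.
    if not audio_path:
        return ""
    segments, _ = whisper_model.transcribe(audio_path, language="ms", beam_size=5)
    return " ".join(s.text.strip() for s in segments).strip()

def echo_predict(text):
    # Placeholder for the app's predict(); just echoes the input text.
    return f"Would generate sign-pose output for: {text!r}"

with gr.Blocks() as demo:
    with gr.Tab("Text Input"):
        text_input = gr.Textbox(label="Enter Malay Word or Sentence")
    with gr.Tab("Speech Input"):
        audio_input = gr.Audio(source="microphone", type="filepath", label="Record Malay Audio")
        audio_transcript = gr.Textbox(label="Transcribed Text", interactive=True)
        transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Result")

    # Same event chain as the commit: "Transcribe" fills the textbox, and any
    # change to the transcript triggers prediction on the transcribed text.
    transcribe_btn.click(transcribe_audio, inputs=audio_input, outputs=audio_transcript)
    audio_transcript.change(echo_predict, inputs=audio_transcript, outputs=output)

demo.launch()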