Update app.py
app.py CHANGED
@@ -1,12 +1,13 @@
 import gradio as gr
 import os
-from transformers import pipeline  # Import for ASR (Speech-to-Text)
+from transformers import pipeline  # Import for ASR (Speech-to-Text) and TTS (Text-to-Speech)
+import torchaudio  # Required by some TTS models for audio handling
 
 # Define base paths for assets - IMPORTANT: You need to create these folders and place your files here
 # For Hugging Face Spaces, these paths are relative to your app.py file
 ASSETS_DIR = "./assets"
 IMAGE_DIR = os.path.join(ASSETS_DIR, "images")
-AUDIO_DIR = os.path.join(ASSETS_DIR, "audio")
+AUDIO_DIR = os.path.join(ASSETS_DIR, "audio")  # This will also be used for dynamically generated audio
 
 # Create asset directories if they don't exist (for local testing, Hugging Face handles this with git lfs)
 # On Hugging Face Spaces, you'll upload these folders with your files.
@@ -23,6 +24,16 @@ except Exception as e:
     print(f"Warning: Could not load ASR model. Pronunciation check will be a placeholder. Error: {e}")
     asr_pipeline = None  # Set to None if model loading fails
 
+# Initialize TTS pipeline for audio feedback
+# Using a specific Arabic TTS model.
+try:
+    # Ensure this model is suitable for your Hugging Face Space's resources.
+    # facebook/mms-tts-ara is a good general Arabic TTS model.
+    tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-ara", device=-1)
+except Exception as e:
+    print(f"Warning: Could not load TTS model. Audio feedback will not be generated. Error: {e}")
+    tts_pipeline = None  # Set to None if model loading fails
+
 
 # Data for Arabic Letters
 # Updated to use .jpg for images and .wav for audio, based on your provided file names.
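
For reference, the text-to-speech pipeline returns audio data in memory rather than a file. A minimal standalone sketch of exercising the model loaded above, assuming facebook/mms-tts-ara downloads and fits in the Space's memory:

# Standalone sketch: call the MMS Arabic TTS pipeline and inspect its output.
from transformers import pipeline

tts = pipeline("text-to-speech", model="facebook/mms-tts-ara", device=-1)  # CPU
out = tts("أحسنت!")  # "Excellent!" - one of the feedback phrases used below
# The pipeline returns a dict: {"audio": NumPy array, "sampling_rate": int}.
print(out["audio"].shape, out["sampling_rate"])

The array still has to be written to a WAV file before a Gradio Audio component can play it, which is what generate_tts_audio below does.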
@@ -358,38 +369,72 @@ def play_audio(audio_path):
     # Returning None for audio will result in an error message in Gradio's console
     return None
 
+def generate_tts_audio(text_to_speak, filename="temp_feedback_audio.wav"):
+    """
+    Generates audio from text using the TTS pipeline and saves it to a file.
+    Returns the file path.
+    """
+    if tts_pipeline is None:
+        print("TTS pipeline not loaded, cannot generate audio.")
+        return None
+
+    try:
+        # Generate speech
+        speech = tts_pipeline(text_to_speak, return_timestamps="word")
+
+        # Save the audio to a temporary file
+        output_path = os.path.join(AUDIO_DIR, filename)
+        # Assuming the pipeline returns a dictionary with 'audio' (numpy array) and 'sampling_rate'
+        torchaudio.save(output_path, speech['audio'].unsqueeze(0), speech['sampling_rate'])
+        return output_path
+    except Exception as e:
+        print(f"Error generating TTS audio for '{text_to_speak}': {e}")
+        return None
+
+
 def check_pronunciation(audio_input):
     """
     Performs pronunciation check using ASR.
     Compares transcribed text to the expected Arabic letter.
+    Returns text feedback and an audio file for the feedback.
     """
+    feedback_text = ""
+    feedback_audio_path = None
+
     if audio_input is None:
-
+        feedback_text = "من فضلك سجل صوتك أولاً. (Please record your voice first.)"
+        feedback_audio_path = generate_tts_audio("من فضلك سجل صوتك أولاً.")
+        return feedback_text, feedback_audio_path
 
     if asr_pipeline is None:
-
+        feedback_text = "وظيفة التحقق من النطق غير متوفرة. (Pronunciation check not available. ASR model failed to load.)"
+        feedback_audio_path = generate_tts_audio("وظيفة التحقق من النطق غير متوفرة.")
+        return feedback_text, feedback_audio_path
 
     try:
         # Transcribe the audio input
+        # Ensure the language is explicitly set to Arabic for better results with Whisper
         transcription_result = asr_pipeline(audio_input, generate_kwargs={"language": "ar"})
         transcribed_text = transcription_result["text"].strip().lower()
 
         # Get the expected Arabic letter for comparison
         expected_letter = arabic_letters_data[current_letter_idx]["letter"].lower()
 
-        feedback_message = ""
         # Simple check: does the transcription contain the expected letter?
-        # For more robust pronunciation, you'd need phonetic comparison or a dedicated model.
         if expected_letter in transcribed_text:
-
+            feedback_text = f"أحسنت! (Excellent!) لقد قلت: '{transcribed_text}'"
+            feedback_audio_path = generate_tts_audio("أحسنت!")
         else:
-
+            feedback_text = f"حاول مرة أخرى. (Try again.) لقد قلت: '{transcribed_text}'" \
                 f" كان المتوقع: '{arabic_letters_data[current_letter_idx]['letter']}'"
+            feedback_audio_path = generate_tts_audio("حاول مرة أخرى.")
 
-        return feedback_message
+        return feedback_text, feedback_audio_path
 
     except Exception as e:
-
+        feedback_text = f"حدث خطأ أثناء التحقق من النطق: {e}. (An error occurred during pronunciation check.)"
+        feedback_audio_path = generate_tts_audio("حدث خطأ أثناء التحقق من النطق.")  # Generic error audio
+        return feedback_text, feedback_audio_path
 
 
 # --- Functions for Arabic Storytelling Section ---
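
Two details of generate_tts_audio as committed are worth flagging: return_timestamps="word" is an option from the ASR pipeline that the text-to-speech pipeline does not document, and the diff's own comment says speech['audio'] is a NumPy array, which has no .unsqueeze method. A hedged sketch of a variant that should save correctly — the _fixed name is hypothetical, and it assumes the documented {"audio", "sampling_rate"} output:

import os
import torch
import torchaudio

def generate_tts_audio_fixed(text_to_speak, filename="temp_feedback_audio.wav"):
    # Same contract as generate_tts_audio: return a WAV path, or None on failure.
    if tts_pipeline is None:
        return None
    speech = tts_pipeline(text_to_speak)  # no timestamp kwargs for TTS
    waveform = torch.as_tensor(speech["audio"], dtype=torch.float32)
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)  # torchaudio.save expects (channels, samples)
    output_path = os.path.join(AUDIO_DIR, filename)
    torchaudio.save(output_path, waveform, speech["sampling_rate"])
    return output_path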
@@ -578,7 +623,7 @@ with gr.Blocks(
         gr.Markdown("# <span style='color:#007bff;'>تطبيق تعلم العربية للأطفال</span>", rtl=True)
         # Add logo image
         gr.Image(
-            value=os.path.join(IMAGE_DIR, "Applogo.
+            value=os.path.join(IMAGE_DIR, "Applogo.png"),  # Path to your logo image
             label="شعار التطبيق (App Logo)",
             width=300,
             height=150,
@@ -667,6 +712,13 @@ with gr.Blocks(
                 rtl=True,
                 elem_classes=["gr-markdown"]
             )
+            # New Audio component to play pronunciation feedback
+            pronunciation_feedback_audio = gr.Audio(
+                label="صوت التقييم (Feedback Audio)",
+                autoplay=True,
+                type="filepath",
+                visible=False  # Keep it hidden, as we only trigger playback
+            )
             # Button to trigger the pronunciation check
             check_pronunciation_btn = gr.Button("تحقق من النطق (Check Pronunciation)", elem_classes=["gr-button"])
 
@@ -731,7 +783,7 @@
         get_current_letter_content,
         inputs=None,
         outputs=[
-            letter_image,
+            letter_image,
             letter_audio_output,
             word_example_display,
             word_image,
@@ -745,7 +797,7 @@
         next_letter_func,
         inputs=None,
         outputs=[
-            letter_image,
+            letter_image,
             letter_audio_output,
             word_example_display,
             word_image,
@@ -756,7 +808,7 @@
         prev_letter_func,
         inputs=None,
         outputs=[
-            letter_image,
+            letter_image,
             letter_audio_output,
             word_example_display,
             word_image,
@@ -780,7 +832,7 @@
     check_pronunciation_btn.click(
         check_pronunciation,
         inputs=pronunciation_input,
-        outputs=pronunciation_feedback
+        outputs=[pronunciation_feedback, pronunciation_feedback_audio]
     )
 
     # --- Event Handlers for Arabic Storytelling Tab ---
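
Since check_pronunciation now returns a (text, filepath) tuple, the click handler must list two output components in the same order. A self-contained sketch of that pattern under Gradio 4.x — component and function names here are illustrative, not the app's:

import gradio as gr

def fake_check(_audio):
    # Stand-in for check_pronunciation: (feedback text, feedback audio path).
    return "أحسنت! (Excellent!)", None  # None = no feedback clip in this sketch

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], type="filepath", label="Record")
    feedback = gr.Markdown(rtl=True)
    # Hidden component: autoplay fires whenever the callback sets a new filepath.
    feedback_audio = gr.Audio(type="filepath", autoplay=True, visible=False)
    gr.Button("Check").click(fake_check, inputs=mic, outputs=[feedback, feedback_audio])

demo.launch()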