NazishHasan committed
Commit f95bfeb · verified · 1 Parent(s): 4d41f1c

Update app.py

Files changed (1): app.py (+67 -15)
app.py CHANGED

@@ -1,12 +1,13 @@
 import gradio as gr
 import os
-from transformers import pipeline # Import for ASR (Speech-to-Text)
+from transformers import pipeline # Import for ASR (Speech-to-Text) and TTS (Text-to-Speech)
+import torchaudio # Required by some TTS models for audio handling
 
 # Define base paths for assets - IMPORTANT: You need to create these folders and place your files here
 # For Hugging Face Spaces, these paths are relative to your app.py file
 ASSETS_DIR = "./assets"
 IMAGE_DIR = os.path.join(ASSETS_DIR, "images")
-AUDIO_DIR = os.path.join(ASSETS_DIR, "audio")
+AUDIO_DIR = os.path.join(ASSETS_DIR, "audio") # This will also be used for dynamically generated audio
 
 # Create asset directories if they don't exist (for local testing, Hugging Face handles this with git lfs)
 # On Hugging Face Spaces, you'll upload these folders with your files.
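Note on the new torchaudio import: Hugging Face Spaces install Python dependencies from requirements.txt, so torch and torchaudio must be listed there or the Space will fail at import time. A sketch of the relevant entries (version pins omitted; add them as your Space requires):

    gradio
    transformers
    torch
    torchaudio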
@@ -23,6 +24,16 @@ except Exception as e:
     print(f"Warning: Could not load ASR model. Pronunciation check will be a placeholder. Error: {e}")
     asr_pipeline = None # Set to None if model loading fails
 
+# Initialize TTS pipeline for audio feedback
+# Using a specific Arabic TTS model.
+try:
+    # Ensure this model is suitable for your Hugging Face Space's resources.
+    # facebook/mms-tts-ara is a good general Arabic TTS model.
+    tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-ara", device=-1)
+except Exception as e:
+    print(f"Warning: Could not load TTS model. Audio feedback will not be generated. Error: {e}")
+    tts_pipeline = None # Set to None if model loading fails
+
 
 # Data for Arabic Letters
 # Updated to use .jpg for images and .wav for audio, based on your provided file names.
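A quick way to sanity-check the new TTS pipeline before wiring it into the app (device=-1 pins it to CPU, which matches a free Space). This sketch assumes transformers' text-to-speech pipeline returns a dict with an "audio" NumPy array and an integer "sampling_rate", which is the current behavior for VITS/MMS models but worth verifying on your installed version; the output filename is arbitrary:

    import numpy as np
    import torch
    import torchaudio
    from transformers import pipeline

    tts = pipeline("text-to-speech", model="facebook/mms-tts-ara", device=-1)
    out = tts("أحسنت!")  # plain call; no extra kwargs needed for synthesis

    # torchaudio.save expects a float tensor shaped (channels, samples)
    audio = np.atleast_2d(np.asarray(out["audio"], dtype=np.float32))
    torchaudio.save("smoke_test.wav", torch.from_numpy(audio), out["sampling_rate"])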
@@ -358,38 +369,72 @@ def play_audio(audio_path):
     # Returning None for audio will result in an error message in Gradio's console
     return None
 
+def generate_tts_audio(text_to_speak, filename="temp_feedback_audio.wav"):
+    """
+    Generates audio from text using the TTS pipeline and saves it to a file.
+    Returns the file path.
+    """
+    if tts_pipeline is None:
+        print("TTS pipeline not loaded, cannot generate audio.")
+        return None
+
+    try:
+        # Generate speech
+        speech = tts_pipeline(text_to_speak, return_timestamps="word")
+
+        # Save the audio to a temporary file
+        output_path = os.path.join(AUDIO_DIR, filename)
+        # Assuming the pipeline returns a dictionary with 'audio' (numpy array) and 'sampling_rate'
+        torchaudio.save(output_path, speech['audio'].unsqueeze(0), speech['sampling_rate'])
+        return output_path
+    except Exception as e:
+        print(f"Error generating TTS audio for '{text_to_speak}': {e}")
+        return None
+
+
 def check_pronunciation(audio_input):
     """
     Performs pronunciation check using ASR.
     Compares transcribed text to the expected Arabic letter.
+    Returns text feedback and an audio file for the feedback.
     """
+    feedback_text = ""
+    feedback_audio_path = None
+
     if audio_input is None:
-        return "من فضلك سجل صوتك أولاً. (Please record your voice first.)"
+        feedback_text = "من فضلك سجل صوتك أولاً. (Please record your voice first.)"
+        feedback_audio_path = generate_tts_audio("من فضلك سجل صوتك أولاً.")
+        return feedback_text, feedback_audio_path
 
     if asr_pipeline is None:
-        return "وظيفة التحقق من النطق غير متوفرة. (Pronunciation check not available. ASR model failed to load.)"
+        feedback_text = "وظيفة التحقق من النطق غير متوفرة. (Pronunciation check not available. ASR model failed to load.)"
+        feedback_audio_path = generate_tts_audio("وظيفة التحقق من النطق غير متوفرة.")
+        return feedback_text, feedback_audio_path
 
     try:
         # Transcribe the audio input
+        # Ensure the language is explicitly set to Arabic for better results with Whisper
        transcription_result = asr_pipeline(audio_input, generate_kwargs={"language": "ar"})
         transcribed_text = transcription_result["text"].strip().lower()
 
         # Get the expected Arabic letter for comparison
         expected_letter = arabic_letters_data[current_letter_idx]["letter"].lower()
 
-        feedback_message = ""
         # Simple check: does the transcription contain the expected letter?
-        # For more robust pronunciation, you'd need phonetic comparison or a dedicated model.
         if expected_letter in transcribed_text:
-            feedback_message = f"أحسنت! (Excellent!) لقد قلت: '{transcribed_text}'"
+            feedback_text = f"أحسنت! (Excellent!) لقد قلت: '{transcribed_text}'"
+            feedback_audio_path = generate_tts_audio("أحسنت!")
         else:
-            feedback_message = f"حاول مرة أخرى. (Try again.) لقد قلت: '{transcribed_text}'" \
+            feedback_text = f"حاول مرة أخرى. (Try again.) لقد قلت: '{transcribed_text}'" \
                 f" كان المتوقع: '{arabic_letters_data[current_letter_idx]['letter']}'"
+            feedback_audio_path = generate_tts_audio("حاول مرة أخرى.")
 
-        return feedback_message
+        return feedback_text, feedback_audio_path
 
     except Exception as e:
-        return f"حدث خطأ أثناء التحقق من النطق: {e}. (An error occurred during pronunciation check.)"
+        feedback_text = f"حدث خطأ أثناء التحقق من النطق: {e}. (An error occurred during pronunciation check.)"
+        feedback_audio_path = generate_tts_audio("حدث خطأ أثناء التحقق من النطق.") # Generic error audio
+        return feedback_text, feedback_audio_path
 
 
 # --- Functions for Arabic Storytelling Section ---
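Two likely issues in generate_tts_audio as committed are worth a second look. First, return_timestamps="word" is a speech-recognition argument; text-to-speech pipelines don't accept it and may raise a TypeError. Second, the pipeline's "audio" value is a NumPy array, which has no .unsqueeze method (that belongs to torch.Tensor), so the torchaudio.save call would fail. A corrected sketch under those assumptions, which additionally assumes "import numpy as np" and "import torch" alongside the existing imports:

    def generate_tts_audio(text_to_speak, filename="temp_feedback_audio.wav"):
        """Generate speech for text_to_speak; return the saved file path, or None."""
        if tts_pipeline is None:
            print("TTS pipeline not loaded, cannot generate audio.")
            return None
        try:
            speech = tts_pipeline(text_to_speak)  # plain synthesis; no ASR-only kwargs
            output_path = os.path.join(AUDIO_DIR, filename)
            # Convert the NumPy waveform to a (channels, samples) float tensor for torchaudio
            audio = np.atleast_2d(np.asarray(speech["audio"], dtype=np.float32))
            torchaudio.save(output_path, torch.from_numpy(audio), speech["sampling_rate"])
            return output_path
        except Exception as e:
            print(f"Error generating TTS audio for '{text_to_speak}': {e}")
            return None

Separately, the substring check in check_pronunciation is fragile: Whisper may transcribe a single letter as its spoken name (e.g. "باء" for ب) or add punctuation, so correct pronunciations can be missed; normalizing the transcription (and, eventually, phonetic comparison) would make the check more forgiving.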
@@ -578,7 +623,7 @@ with gr.Blocks(
     gr.Markdown("# <span style='color:#007bff;'>تطبيق تعلم العربية للأطفال</span>", rtl=True)
     # Add logo image
     gr.Image(
-        value=os.path.join(IMAGE_DIR, "Applogo.jpg"), # Path to your logo image
+        value=os.path.join(IMAGE_DIR, "Applogo.png"), # Path to your logo image
         label="شعار التطبيق (App Logo)",
         width=300,
         height=150,
@@ -667,6 +712,13 @@ with gr.Blocks(
                 rtl=True,
                 elem_classes=["gr-markdown"]
             )
+            # New Audio component to play pronunciation feedback
+            pronunciation_feedback_audio = gr.Audio(
+                label="صوت التقييم (Feedback Audio)",
+                autoplay=True,
+                type="filepath",
+                visible=False # Keep it hidden, as we only trigger playback
+            )
             # Button to trigger the pronunciation check
             check_pronunciation_btn = gr.Button("تحقق من النطق (Check Pronunciation)", elem_classes=["gr-button"])
 
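One behavior to verify here: with visible=False, Gradio typically does not render the audio element at all, so autoplay=True may never fire. If the feedback audio stays silent, a common workaround is to keep the component visible, or to reveal it only when there is something to play by returning a gr.update from the handler. A sketch of the latter (assumes the rest of check_pronunciation is unchanged):

    # In check_pronunciation, return an update instead of a bare file path:
    return feedback_text, gr.update(value=feedback_audio_path, visible=True)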
@@ -731,7 +783,7 @@
     get_current_letter_content,
     inputs=None,
     outputs=[
-        letter_image, # Removed letter_display from outputs
+        letter_image,
         letter_audio_output,
         word_example_display,
         word_image,
@@ -745,7 +797,7 @@
     next_letter_func,
     inputs=None,
     outputs=[
-        letter_image, # Removed letter_display from outputs
+        letter_image,
         letter_audio_output,
         word_example_display,
         word_image,
@@ -756,7 +808,7 @@
     prev_letter_func,
     inputs=None,
     outputs=[
-        letter_image, # Removed letter_display from outputs
+        letter_image,
         letter_audio_output,
         word_example_display,
         word_image,
@@ -780,7 +832,7 @@
     check_pronunciation_btn.click(
         check_pronunciation,
         inputs=pronunciation_input,
-        outputs=pronunciation_feedback
+        outputs=[pronunciation_feedback, pronunciation_feedback_audio]
     )
 
     # --- Event Handlers for Arabic Storytelling Tab ---
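Since check_pronunciation now returns a (text, audio_path) tuple, the two-element outputs list is required; Gradio maps tuple items to output components positionally. A hypothetical local smoke test, runnable outside Gradio (the wav path is an assumption; point it at any short recording):

    text_msg, audio_path = check_pronunciation("./assets/audio/test_recording.wav")
    print(text_msg)
    print(audio_path)  # None if the TTS model failed to load or generation errored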
 