Update app.py
app.py CHANGED
@@ -1,12 +1,13 @@
 import gradio as gr
 import os
-from transformers import pipeline  # Import for ASR (Speech-to-Text)
+from transformers import pipeline  # Import for ASR (Speech-to-Text) and TTS (Text-to-Speech)
+import torchaudio  # Required by some TTS models for audio handling
 
 # Define base paths for assets - IMPORTANT: You need to create these folders and place your files here
 # For Hugging Face Spaces, these paths are relative to your app.py file
 ASSETS_DIR = "./assets"
 IMAGE_DIR = os.path.join(ASSETS_DIR, "images")
-AUDIO_DIR = os.path.join(ASSETS_DIR, "audio")
+AUDIO_DIR = os.path.join(ASSETS_DIR, "audio")  # This will also be used for dynamically generated audio
 
 # Create asset directories if they don't exist (for local testing, Hugging Face handles this with git lfs)
 # On Hugging Face Spaces, you'll upload these folders with your files.
@@ -23,6 +24,16 @@ except Exception as e:
     print(f"Warning: Could not load ASR model. Pronunciation check will be a placeholder. Error: {e}")
     asr_pipeline = None  # Set to None if model loading fails
 
+# Initialize TTS pipeline for audio feedback
+# Using a specific Arabic TTS model.
+try:
+    # Ensure this model is suitable for your Hugging Face Space's resources.
+    # facebook/mms-tts-ara is a good general Arabic TTS model.
+    tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-ara", device=-1)
+except Exception as e:
+    print(f"Warning: Could not load TTS model. Audio feedback will not be generated. Error: {e}")
+    tts_pipeline = None  # Set to None if model loading fails
+
 
 # Data for Arabic Letters
 # Updated to use .jpg for images and .wav for audio, based on your provided file names.
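
For reference, the text-to-speech pipeline returns audio data in memory rather than a file. A minimal standalone sketch of exercising the model loaded above, assuming facebook/mms-tts-ara downloads and fits in the Space's memory:

# Standalone sketch: call the MMS Arabic TTS pipeline and inspect its output.
from transformers import pipeline

tts = pipeline("text-to-speech", model="facebook/mms-tts-ara", device=-1)  # CPU
out = tts("أحسنت!")  # "Excellent!" - one of the feedback phrases used below
# The pipeline returns a dict: {"audio": NumPy array, "sampling_rate": int}.
print(out["audio"].shape, out["sampling_rate"])

The array still has to be written to a WAV file before a Gradio Audio component can play it, which is what generate_tts_audio below does.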
@@ -358,38 +369,72 @@ def play_audio(audio_path):
     # Returning None for audio will result in an error message in Gradio's console
     return None
 
+def generate_tts_audio(text_to_speak, filename="temp_feedback_audio.wav"):
+    """
+    Generates audio from text using the TTS pipeline and saves it to a file.
+    Returns the file path.
+    """
+    if tts_pipeline is None:
+        print("TTS pipeline not loaded, cannot generate audio.")
+        return None
+
+    try:
+        # Generate speech
+        speech = tts_pipeline(text_to_speak, return_timestamps="word")
+
+        # Save the audio to a temporary file
+        output_path = os.path.join(AUDIO_DIR, filename)
+        # Assuming the pipeline returns a dictionary with 'audio' (numpy array) and 'sampling_rate'
+        torchaudio.save(output_path, speech['audio'].unsqueeze(0), speech['sampling_rate'])
+        return output_path
+    except Exception as e:
+        print(f"Error generating TTS audio for '{text_to_speak}': {e}")
+        return None
+
+
 def check_pronunciation(audio_input):
     """
     Performs pronunciation check using ASR.
     Compares transcribed text to the expected Arabic letter.
+    Returns text feedback and an audio file for the feedback.
     """
+    feedback_text = ""
+    feedback_audio_path = None
+
     if audio_input is None:
-
+        feedback_text = "من فضلك سجل صوتك أولاً. (Please record your voice first.)"
+        feedback_audio_path = generate_tts_audio("من فضلك سجل صوتك أولاً.")
+        return feedback_text, feedback_audio_path
 
     if asr_pipeline is None:
-
+        feedback_text = "وظيفة التحقق من النطق غير متوفرة. (Pronunciation check not available. ASR model failed to load.)"
+        feedback_audio_path = generate_tts_audio("وظيفة التحقق من النطق غير متوفرة.")
+        return feedback_text, feedback_audio_path
 
     try:
         # Transcribe the audio input
+        # Ensure the language is explicitly set to Arabic for better results with Whisper
         transcription_result = asr_pipeline(audio_input, generate_kwargs={"language": "ar"})
         transcribed_text = transcription_result["text"].strip().lower()
 
         # Get the expected Arabic letter for comparison
         expected_letter = arabic_letters_data[current_letter_idx]["letter"].lower()
 
-        feedback_message = ""
         # Simple check: does the transcription contain the expected letter?
-        # For more robust pronunciation, you'd need phonetic comparison or a dedicated model.
         if expected_letter in transcribed_text:
-
+            feedback_text = f"أحسنت! (Excellent!) لقد قلت: '{transcribed_text}'"
+            feedback_audio_path = generate_tts_audio("أحسنت!")
         else:
-
+            feedback_text = f"حاول مرة أخرى. (Try again.) لقد قلت: '{transcribed_text}'" \
                 f" كان المتوقع: '{arabic_letters_data[current_letter_idx]['letter']}'"
+            feedback_audio_path = generate_tts_audio("حاول مرة أخرى.")
 
-        return feedback_message
+        return feedback_text, feedback_audio_path
 
     except Exception as e:
-
+        feedback_text = f"حدث خطأ أثناء التحقق من النطق: {e}. (An error occurred during pronunciation check.)"
+        feedback_audio_path = generate_tts_audio("حدث خطأ أثناء التحقق من النطق.")  # Generic error audio
+        return feedback_text, feedback_audio_path
 
 
 # --- Functions for Arabic Storytelling Section ---
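
Two details of generate_tts_audio as committed are worth flagging: return_timestamps="word" is an option from the ASR pipeline that the text-to-speech pipeline does not document, and the diff's own comment says speech['audio'] is a NumPy array, which has no .unsqueeze method. A hedged sketch of a variant that should save correctly — the _fixed name is hypothetical, and it assumes the documented {"audio", "sampling_rate"} output:

import os
import torch
import torchaudio

def generate_tts_audio_fixed(text_to_speak, filename="temp_feedback_audio.wav"):
    # Same contract as generate_tts_audio: return a WAV path, or None on failure.
    if tts_pipeline is None:
        return None
    speech = tts_pipeline(text_to_speak)  # no timestamp kwargs for TTS
    waveform = torch.as_tensor(speech["audio"], dtype=torch.float32)
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)  # torchaudio.save expects (channels, samples)
    output_path = os.path.join(AUDIO_DIR, filename)
    torchaudio.save(output_path, waveform, speech["sampling_rate"])
    return output_path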
@@ -578,7 +623,7 @@ with gr.Blocks(
         gr.Markdown("# <span style='color:#007bff;'>تطبيق تعلم العربية للأطفال</span>", rtl=True)
         # Add logo image
         gr.Image(
-            value=os.path.join(IMAGE_DIR, "Applogo.
+            value=os.path.join(IMAGE_DIR, "Applogo.png"),  # Path to your logo image
             label="شعار التطبيق (App Logo)",
             width=300,
             height=150,
@@ -667,6 +712,13 @@ with gr.Blocks(
                 rtl=True,
                 elem_classes=["gr-markdown"]
             )
+            # New Audio component to play pronunciation feedback
+            pronunciation_feedback_audio = gr.Audio(
+                label="صوت التقييم (Feedback Audio)",
+                autoplay=True,
+                type="filepath",
+                visible=False  # Keep it hidden, as we only trigger playback
+            )
             # Button to trigger the pronunciation check
             check_pronunciation_btn = gr.Button("تحقق من النطق (Check Pronunciation)", elem_classes=["gr-button"])
 
@@ -731,7 +783,7 @@
         get_current_letter_content,
         inputs=None,
         outputs=[
-            letter_image,
+            letter_image,
             letter_audio_output,
             word_example_display,
             word_image,
@@ -745,7 +797,7 @@
         next_letter_func,
         inputs=None,
         outputs=[
-            letter_image,
+            letter_image,
             letter_audio_output,
             word_example_display,
             word_image,
@@ -756,7 +808,7 @@
         prev_letter_func,
         inputs=None,
         outputs=[
-            letter_image,
+            letter_image,
             letter_audio_output,
             word_example_display,
             word_image,
@@ -780,7 +832,7 @@
     check_pronunciation_btn.click(
         check_pronunciation,
         inputs=pronunciation_input,
-        outputs=pronunciation_feedback
+        outputs=[pronunciation_feedback, pronunciation_feedback_audio]
     )
 
     # --- Event Handlers for Arabic Storytelling Tab ---
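
Since check_pronunciation now returns a (text, filepath) tuple, the click handler must list two output components in the same order. A self-contained sketch of that pattern under Gradio 4.x — component and function names here are illustrative, not the app's:

import gradio as gr

def fake_check(_audio):
    # Stand-in for check_pronunciation: (feedback text, feedback audio path).
    return "أحسنت! (Excellent!)", None  # None = no feedback clip in this sketch

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], type="filepath", label="Record")
    feedback = gr.Markdown(rtl=True)
    # Hidden component: autoplay fires whenever the callback sets a new filepath.
    feedback_audio = gr.Audio(type="filepath", autoplay=True, visible=False)
    gr.Button("Check").click(fake_check, inputs=mic, outputs=[feedback, feedback_audio])

demo.launch()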