Update app.py
app.py CHANGED
Before (lines removed by this commit are marked with -):

@@ -23,8 +23,18 @@ CUDA_AVAILABLE = torch.cuda.is_available()
 device = "cuda" if CUDA_AVAILABLE else "cpu"
 logger.info(f"Using hardware: {device}")
 
-#
-
 
 # Define pipelines for American ('a') and British ('b') English
 pipelines = {
@@ -39,7 +49,45 @@ try:
 except AttributeError as e:
     logger.warning(f"Could not set custom pronunciations: {e}")
 
 def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
         raise FileNotFoundError(f"Voice file not found: {voice_path}")
@@ -47,16 +95,20 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     try:
         generator = pipeline(text, voice=voice_path, speed=speed)
         for _, ps, audio in generator:
             return (24000, audio.numpy()), ps
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
-            gr.Info("Retrying with CPU.
             model.to("cpu")
             generator = pipeline(text, voice=voice_path, speed=speed)
             for _, ps, audio in generator:
                 return (24000, audio.numpy()), ps
         else:
             raise gr.Error(e)
@@ -74,55 +126,23 @@ def tokenize_first(text, voice="af_bella.pt"):
     return ""
 
 def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
         raise FileNotFoundError(f"Voice file not found: {voice_path}")
 
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
-
-    if not use_gpu:
         model.to("cpu")
     generator = pipeline(text, voice=voice_path, speed=speed)
     for _, _, audio in generator:
         yield 24000, audio.numpy()
         if first:
             first = False
             yield 24000, torch.zeros(1).numpy()
-
-# Dynamically load .pt voice files from VOICE_DIR
-def load_voice_choices():
-    voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
-    choices = {}
-    for voice_file in voice_files:
-        prefix = voice_file[:2]
-        if prefix == 'af':
-            label = f"🇺🇸 🚺 {voice_file[3:-3].capitalize()}"
-        elif prefix == 'am':
-            label = f"🇺🇸 🚹 {voice_file[3:-3].capitalize()}"
-        elif prefix == 'bf':
-            label = f"🇬🇧 🚺 {voice_file[3:-3].capitalize()}"
-        elif prefix == 'bm':
-            label = f"🇬🇧 🚹 {voice_file[3:-3].capitalize()}"
-        else:
-            label = f"Unknown {voice_file[:-3]}"
-        choices[label] = voice_file
-    return choices
-
-CHOICES = load_voice_choices()
-
-# Log available voices
-for label, voice_path in CHOICES.items():
-    full_path = os.path.join(VOICE_DIR, voice_path)
-    if not os.path.exists(full_path):
-        logger.warning(f"Voice file not found: {full_path}")
-    else:
-        logger.info(f"Loaded voice: {label} ({voice_path})")
-
-# If no voices are found, add a default fallback
-if not CHOICES:
-    logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
-    CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}
 
 TOKEN_NOTE = '''
 💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
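(Aside, not part of the commit: TOKEN_NOTE above documents Kokoro's pronunciation-override syntax, a Markdown-style link whose target is a phoneme string between /slashes/. A minimal sketch of pulling those overrides out of input text with Python's standard re module; the extract_overrides helper is hypothetical and only illustrative.)

import re

def extract_overrides(text):
    # Hypothetical helper: return (word, phonemes) pairs written as [word](/phonemes/).
    return re.findall(r"\[([^\]]+)\]\(/([^/)]+)/\)", text)

print(extract_overrides("Say [Kokoro](/kˈOkəɹO/) please"))
# -> [('Kokoro', 'kˈOkəɹO')]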
@@ -134,45 +154,43 @@ TOKEN_NOTE = '''
 ⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
 '''
 
-with gr.Blocks() as generate_tab:
-    out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
-    generate_btn = gr.Button("Generate", variant="primary")
-    with gr.Accordion("Output Tokens", open=True):
-        out_ps = gr.Textbox(interactive=False, show_label=False,
-            info="Tokens used to generate the audio, up to 510 context length.")
-        tokenize_btn = gr.Button("Tokenize", variant="secondary")
-    gr.Markdown(TOKEN_NOTE)
-
-with gr.Blocks() as stream_tab:
-    out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
-    with gr.Row():
-        stream_btn = gr.Button("Stream", variant="primary")
-        stop_btn = gr.Button("Stop", variant="stop")
-    with gr.Accordion("Note", open=True):
-        gr.Markdown("⚠️ There may be delays in streaming audio due to processing limitations.")
-
-with gr.Blocks() as app:
     with gr.Row():
         with gr.Column():
-            text = gr.Textbox(label="Input Text", info="Arbitrarily many characters supported")
             with gr.Row():
                 voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
                     info="Quality and availability vary by language")
                 use_gpu = gr.Dropdown(
-                    [("GPU
                     value=CUDA_AVAILABLE,
                     label="Hardware",
-                    info="GPU is
                     interactive=CUDA_AVAILABLE
                 )
             speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
         with gr.Column():
-            gr.
-
-
     tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
     stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
     stop_btn.click(fn=None, cancels=[stream_event])
 
 if __name__ == "__main__":
-
After (lines added by this commit are marked with +):

@@ -23,8 +23,18 @@ CUDA_AVAILABLE = torch.cuda.is_available()
 device = "cuda" if CUDA_AVAILABLE else "cpu"
 logger.info(f"Using hardware: {device}")
 
+# Cache model in a persistent directory
+MODEL_CACHE_DIR = os.path.join(os.path.dirname(__file__), "model_cache")
+os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
+
+# Load a single model instance with caching
+try:
+    start_time = time.time()
+    model = KModel("hexgrad/Kokoro-82M", cache_dir=MODEL_CACHE_DIR).to(device).eval()
+    logger.info(f"Model loading time: {time.time() - start_time} seconds")
+except Exception as e:
+    logger.error(f"Failed to load model: {e}")
+    raise
 
 # Define pipelines for American ('a') and British ('b') English
 pipelines = {
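(Aside, not part of the commit: the added block points the model load at a persistent model_cache folder next to app.py so restarts can reuse already-downloaded weights. A rough sketch of pre-fetching the same repository into that folder with huggingface_hub's snapshot_download; whether KModel itself forwards cache_dir this way is taken from the diff and not verified here.)

import os
from huggingface_hub import snapshot_download

CACHE_DIR = os.path.join(os.path.dirname(__file__), "model_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# Download the repo snapshot into the persistent cache (reused if already present),
# so a later model construction does not need to hit the network again.
local_path = snapshot_download("hexgrad/Kokoro-82M", cache_dir=CACHE_DIR)
print("weights cached at", local_path)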
@@ -39,7 +49,45 @@ try:
 except AttributeError as e:
     logger.warning(f"Could not set custom pronunciations: {e}")
 
+# Cache voice choices to avoid repeated file scanning
+VOICE_CHOICES = None
+def load_voice_choices():
+    global VOICE_CHOICES
+    if VOICE_CHOICES is not None:
+        return VOICE_CHOICES
+    voice_files = [f for f in os.listdir(VOICE_DIR) if f.endswith('.pt')]
+    choices = {}
+    for voice_file in voice_files:
+        prefix = voice_file[:2]
+        if prefix == 'af':
+            label = f"🇺🇸 🚺 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'am':
+            label = f"🇺🇸 🚹 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'bf':
+            label = f"🇬🇧 🚺 {voice_file[3:-3].capitalize()}"
+        elif prefix == 'bm':
+            label = f"🇬🇧 🚹 {voice_file[3:-3].capitalize()}"
+        else:
+            label = f"Unknown {voice_file[:-3]}"
+        choices[label] = voice_file
+    if not choices:
+        logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
+        choices = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}
+    VOICE_CHOICES = choices
+    return choices
+
+CHOICES = load_voice_choices()
+
+# Log available voices
+for label, voice_path in CHOICES.items():
+    full_path = os.path.join(VOICE_DIR, voice_path)
+    if not os.path.exists(full_path):
+        logger.warning(f"Voice file not found: {full_path}")
+    else:
+        logger.info(f"Loaded voice: {label} ({voice_path})")
+
 def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
+    start_time = time.time()
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
         raise FileNotFoundError(f"Voice file not found: {voice_path}")
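(Aside, not part of the commit: the new load_voice_choices memoizes its result in a module-level VOICE_CHOICES global. The same effect can be had with functools.lru_cache; a simplified sketch that assumes a VOICE_DIR like the app's and omits the emoji labels and placeholder fallback.)

import os
from functools import lru_cache

VOICE_DIR = "voices"  # assumption: stands in for the app's VOICE_DIR

@lru_cache(maxsize=1)
def load_voice_choices():
    # The directory is scanned once; later calls return the cached dict.
    return {f[:-3]: f for f in sorted(os.listdir(VOICE_DIR)) if f.endswith(".pt")}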
@@ -47,16 +95,20 @@ def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
     try:
+        if not use_gpu and model.device.type != "cpu":
+            model.to("cpu")
         generator = pipeline(text, voice=voice_path, speed=speed)
         for _, ps, audio in generator:
+            logger.info(f"Generation time: {time.time() - start_time} seconds")
             return (24000, audio.numpy()), ps
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
+            gr.Info("Retrying with CPU.")
             model.to("cpu")
             generator = pipeline(text, voice=voice_path, speed=speed)
             for _, ps, audio in generator:
+                logger.info(f"Generation time (CPU retry): {time.time() - start_time} seconds")
                 return (24000, audio.numpy()), ps
         else:
             raise gr.Error(e)
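(Aside, not part of the commit: the added lines move the model to the CPU up front when the GPU is not requested, and on a GPU-side error they warn, switch to CPU, and retry once. The same retry pattern factored into a generic helper for illustration; run_with_cpu_fallback and its callable argument are hypothetical and not part of app.py.)

def run_with_cpu_fallback(model, fn, use_gpu):
    # Hypothetical helper: call fn() once; if it fails while the GPU path was used,
    # move the model (a torch.nn.Module) to CPU and retry a single time.
    try:
        return fn()
    except Exception as exc:
        if not use_gpu:
            raise
        print(f"GPU run failed ({exc}); retrying on CPU")
        model.to("cpu")
        return fn()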
@@ -74,55 +126,23 @@ def tokenize_first(text, voice="af_bella.pt"):
     return ""
 
 def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
+    start_time = time.time()
     voice_path = os.path.join(VOICE_DIR, voice)
     if not os.path.exists(voice_path):
         raise FileNotFoundError(f"Voice file not found: {voice_path}")
 
     pipeline = pipelines[voice[0]]
     use_gpu = use_gpu and CUDA_AVAILABLE
+    if not use_gpu and model.device.type != "cpu":
         model.to("cpu")
+    first = True
     generator = pipeline(text, voice=voice_path, speed=speed)
     for _, _, audio in generator:
         yield 24000, audio.numpy()
         if first:
             first = False
             yield 24000, torch.zeros(1).numpy()
+    logger.info(f"Streaming generation time: {time.time() - start_time} seconds")
 
 TOKEN_NOTE = '''
 💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
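(Aside, not part of the commit: generate_all yields (24000, numpy_block) tuples that Gradio streams to the browser. Outside Gradio, the same generator can be drained and written to a WAV file; a sketch using numpy and the standard wave module, assuming the audio is float data roughly in [-1, 1].)

import wave
import numpy as np

def save_stream(chunks, path="out.wav"):
    # chunks: non-empty iterable of (sample_rate, float numpy array) tuples,
    # e.g. the output of generate_all(...).
    rate, blocks = None, []
    for sr, audio in chunks:
        rate = sr
        blocks.append(audio)
    pcm = (np.clip(np.concatenate(blocks), -1.0, 1.0) * 32767).astype(np.int16)
    with wave.open(path, "wb") as f:
        f.setnchannels(1)   # mono
        f.setsampwidth(2)   # 16-bit samples
        f.setframerate(rate)
        f.writeframes(pcm.tobytes())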
@@ -134,45 +154,43 @@ TOKEN_NOTE = '''
 ⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
 '''
 
+with gr.Blocks(theme="soft") as app:
     with gr.Row():
         with gr.Column():
+            text = gr.Textbox(label="Input Text", value=TEXT, info="Arbitrarily many characters supported")
             with gr.Row():
                 voice = gr.Dropdown(list(CHOICES.items()), value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else list(CHOICES.values())[0], label="Voice",
                     info="Quality and availability vary by language")
                 use_gpu = gr.Dropdown(
+                    [("GPU 🚀", True), ("CPU 🐌", False)],
                     value=CUDA_AVAILABLE,
                     label="Hardware",
+                    info="GPU is faster but requires CUDA support",
                     interactive=CUDA_AVAILABLE
                 )
             speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
         with gr.Column():
+            with gr.Tab(label="Generate"):
+                out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
+                generate_btn = gr.Button("Generate", variant="primary")
+                with gr.Accordion("Output Tokens", open=True):
+                    out_ps = gr.Textbox(interactive=False, show_label=False,
+                        info="Tokens used to generate the audio, up to 510 context length.")
+                    tokenize_btn = gr.Button("Tokenize", variant="secondary")
+                gr.Markdown(TOKEN_NOTE)
+            with gr.Tab(label="Stream"):
+                out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
+                with gr.Row():
+                    stream_btn = gr.Button("Stream", variant="primary")
+                    stop_btn = gr.Button("Stop", variant="stop")
+                gr.Markdown("⚠️ Streaming may have initial delays due to processing.")
+
+    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps])
     tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
     stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
     stop_btn.click(fn=None, cancels=[stream_event])
 
 if __name__ == "__main__":
+    logger.info("Starting Gradio app...")
+    app.launch()
+    logger.info("Gradio app started.")
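(Aside, not part of the commit: the Stop button works because stream_btn.click(...) returns an event reference that can be passed to another click's cancels= list. A stripped-down version of that pattern, independent of app.py; slow_count is a stand-in generator. Recent Gradio releases queue events by default, which is what lets cancels= interrupt a running generator; older versions may need demo.queue().)

import time
import gradio as gr

def slow_count(n):
    # Stand-in for a long-running streaming generator like generate_all.
    for i in range(int(n)):
        time.sleep(1)
        yield str(i)

with gr.Blocks() as demo:
    n = gr.Number(value=10, label="Count to")
    out = gr.Textbox(label="Progress")
    start = gr.Button("Start")
    stop = gr.Button("Stop", variant="stop")
    ev = start.click(fn=slow_count, inputs=n, outputs=out)
    stop.click(fn=None, cancels=[ev])  # cancelling stops the running generator

if __name__ == "__main__":
    demo.launch()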