import gradio as gr
import os
import io
import wave
import numpy as np
import soundfile as sf
from huggingface_hub import snapshot_download

from helper import load_text_to_speech, load_voice_style

# Lazily initialized global state for the Supertonic TTS engine.
_SUPERTONIC_STATE = {"initialized": False, "tts": None, "assets_dir": None}


def _init_supertonic() -> None:
    if _SUPERTONIC_STATE["initialized"]:
        return

    print("Initializing Supertonic...")

    # Download models if not present
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        print(f"Downloading Supertonic models to {assets_dir}...")
        snapshot_download(repo_id="Supertone/supertonic", local_dir=assets_dir)

    onnx_dir = os.path.join(assets_dir, "onnx")
    tts = load_text_to_speech(onnx_dir, use_gpu=False)

    _SUPERTONIC_STATE.update({"initialized": True, "tts": tts, "assets_dir": assets_dir})
    print("Supertonic initialized.")


def get_supertonic_voices():
    """Get the list of available Supertonic voice styles."""
    # Ensure assets are downloaded before listing voices.
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        # Assets are not downloaded yet; initializing downloads them so the voices can be listed.
        _init_supertonic()
        assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_styles_dir = os.path.join(assets_dir, "voice_styles")
    if not os.path.exists(voice_styles_dir):
        return []

    files = os.listdir(voice_styles_dir)
    voices = [f.replace('.json', '') for f in files if f.endswith('.json')]
    return sorted(voices)


def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
    audio_clipped = np.clip(audio_np, -1.0, 1.0)
    return (audio_clipped * 32767.0).astype(np.int16)


def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio_int16.tobytes())
    return buffer.getvalue()


def supertonic_tts(text: str, speed: float, voice: str, steps: int, silence_duration: float, max_len: int):
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_supertonic()
    tts = _SUPERTONIC_STATE["tts"]
    assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_path = os.path.join(assets_dir, "voice_styles", f"{voice}.json")
    if not os.path.exists(voice_path):
        raise gr.Error(f"Voice style {voice} not found.")

    style = load_voice_style([voice_path])

    try:
        sr = tts.sample_rate
        # Stream WAV-encoded chunks as they are synthesized.
        for audio_chunk in tts.stream(text, style, steps, speed, silence_duration, max_len):
            audio_int16 = _audio_np_to_int16(audio_chunk)
            yield _wav_bytes_from_int16(audio_int16, sr)
    except Exception as e:
        raise gr.Error(f"Error during speech generation: {str(e)}")


with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1>Supertonic-TTS</h1>
            <p>Powered by Supertone/Supertonic on CPU</p>
        </div>
        """
    )

    # Initializing here would populate the voice list, but a model download could block startup.
    # Try to list voices; if the assets are missing, the first "Generate Speech" click triggers
    # the download and initialization instead.
    try:
        available_voices = get_supertonic_voices()
    except Exception:
        available_voices = []
    default_voice = available_voices[0] if available_voices else None

    with gr.Row(variant='panel'):
        speed_slider = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label='Speed'
        )
        steps_slider = gr.Slider(
            minimum=1,
            maximum=50,
            value=5,
            step=1,
            label='Steps (Quality vs Speed)'
        )
        voice_dropdown = gr.Dropdown(
            choices=available_voices,
            label='Voice',
            value=default_voice,
            allow_custom_value=True
        )
        silence_slider = gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label='Silence Duration (s)'
        )
        maxlen_slider = gr.Slider(
            minimum=50,
            maximum=1000,
            value=300,
            step=10,
            label='Max Chunk Length'
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        lines=5,
        value="This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    )

    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )

    audio_output = gr.Audio(
        label="Generated Speech",
        streaming=True,
        autoplay=True
    )

    generate_inputs = [text_input, speed_slider, voice_dropdown, steps_slider, silence_slider, maxlen_slider]

    generate_btn.click(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech"
    )

    text_input.submit(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter"
    )

if __name__ == "__main__":
    demo.queue().launch()
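
# Usage note: because the click handler registers api_name="generate_speech", the app can also
# be driven programmatically with gradio_client. The sketch below is illustrative only and not
# part of the app: the Space ID "username/supertonic-tts" and the voice name are hypothetical
# placeholders, and for a streaming (generator) endpoint like this one, predict() waits for the
# stream to finish and returns the final value.
#
#   from gradio_client import Client
#
#   client = Client("username/supertonic-tts")  # hypothetical Space ID
#   result = client.predict(
#       "Hello from the Supertonic API.",  # text
#       1.0,                               # speed
#       "voice_name",                      # voice (one of get_supertonic_voices())
#       5,                                 # steps
#       0.3,                               # silence_duration (s)
#       300,                               # max_len (chunk length)
#       api_name="/generate_speech",
#   )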