import gradio as gr
import os
import io
import wave
import numpy as np
import soundfile as sf
from huggingface_hub import snapshot_download

from helper import load_text_to_speech, load_voice_style

# Lazily initialized global state for the Supertonic TTS engine.
_SUPERTONIC_STATE = {"initialized": False, "tts": None, "assets_dir": None}


def _init_supertonic() -> None:
    if _SUPERTONIC_STATE["initialized"]:
        return

    print("Initializing Supertonic...")

    # Download models if not present
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        print(f"Downloading Supertonic models to {assets_dir}...")
        snapshot_download(repo_id="Supertone/supertonic", local_dir=assets_dir)

    onnx_dir = os.path.join(assets_dir, "onnx")
    tts = load_text_to_speech(onnx_dir, use_gpu=False)

    _SUPERTONIC_STATE.update({"initialized": True, "tts": tts, "assets_dir": assets_dir})
    print("Supertonic initialized.")


def get_supertonic_voices():
    """Get the list of available Supertonic voice styles."""
    # Ensure assets are downloaded before listing voices.
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        # Assets are not downloaded yet; initializing downloads them so the voices can be listed.
        _init_supertonic()
        assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_styles_dir = os.path.join(assets_dir, "voice_styles")
    if not os.path.exists(voice_styles_dir):
        return []

    files = os.listdir(voice_styles_dir)
    voices = [f.replace('.json', '') for f in files if f.endswith('.json')]
    return sorted(voices)


def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
    audio_clipped = np.clip(audio_np, -1.0, 1.0)
    return (audio_clipped * 32767.0).astype(np.int16)


def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio_int16.tobytes())
    return buffer.getvalue()


def supertonic_tts(text: str, speed: float, voice: str, steps: int, silence_duration: float, max_len: int):
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_supertonic()
    tts = _SUPERTONIC_STATE["tts"]
    assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_path = os.path.join(assets_dir, "voice_styles", f"{voice}.json")
    if not os.path.exists(voice_path):
        raise gr.Error(f"Voice style {voice} not found.")

    style = load_voice_style([voice_path])

    try:
        sr = tts.sample_rate
        # Stream WAV-encoded chunks as they are synthesized.
        for audio_chunk in tts.stream(text, style, steps, speed, silence_duration, max_len):
            audio_int16 = _audio_np_to_int16(audio_chunk)
            yield _wav_bytes_from_int16(audio_int16, sr)
    except Exception as e:
        raise gr.Error(f"Error during speech generation: {str(e)}")


with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1>Supertonic-TTS</h1>
            <p>Powered by Supertone/Supertonic on CPU</p>
        </div>
        """
    )

    # Initializing here would populate the voice list, but a model download could block startup.
    # Try to list voices; if the assets are missing, the first "Generate Speech" click triggers
    # the download and initialization instead.
    try:
        available_voices = get_supertonic_voices()
    except Exception:
        available_voices = []
    default_voice = available_voices[0] if available_voices else None

    with gr.Row(variant='panel'):
        speed_slider = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label='Speed'
        )
        steps_slider = gr.Slider(
            minimum=1,
            maximum=50,
            value=5,
            step=1,
            label='Steps (Quality vs Speed)'
        )
        voice_dropdown = gr.Dropdown(
            choices=available_voices,
            label='Voice',
            value=default_voice,
            allow_custom_value=True
        )
        silence_slider = gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label='Silence Duration (s)'
        )
        maxlen_slider = gr.Slider(
            minimum=50,
            maximum=1000,
            value=300,
            step=10,
            label='Max Chunk Length'
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        lines=5,
        value="This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    )

    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )

    audio_output = gr.Audio(
        label="Generated Speech",
        streaming=True,
        autoplay=True
    )

    generate_inputs = [text_input, speed_slider, voice_dropdown, steps_slider, silence_slider, maxlen_slider]

    generate_btn.click(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech"
    )

    text_input.submit(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter"
    )

if __name__ == "__main__":
    demo.queue().launch()
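
# Usage note: because the click handler registers api_name="generate_speech", the app can also
# be driven programmatically with gradio_client. The sketch below is illustrative only and not
# part of the app: the Space ID "username/supertonic-tts" and the voice name are hypothetical
# placeholders, and for a streaming (generator) endpoint like this one, predict() waits for the
# stream to finish and returns the final value.
#
#   from gradio_client import Client
#
#   client = Client("username/supertonic-tts")  # hypothetical Space ID
#   result = client.predict(
#       "Hello from the Supertonic API.",  # text
#       1.0,                               # speed
#       "voice_name",                      # voice (one of get_supertonic_voices())
#       5,                                 # steps
#       0.3,                               # silence_duration (s)
#       300,                               # max_len (chunk length)
#       api_name="/generate_speech",
#   )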