import os
import tempfile
import traceback
from pathlib import Path

import gradio as gr
import spaces  # required for ZeroGPU

# ---- Model libraries: vendor these in the repo or install them via requirements.txt ----
from stepaudio2 import StepAudio2
from token2wav import Token2wav

# ------------------------- constants -------------------------
MODEL_PATH = "stepfun-ai/Step-Audio-2-mini"
PROMPT_WAV = "assets/default_female.wav"
CACHE_DIR = "/tmp/stepaudio2"

# Ensure Gradio uses a writable temp dir on Spaces
os.environ["GRADIO_TEMP_DIR"] = CACHE_DIR
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
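# Note: on Spaces the safest writable locations are /tmp (ephemeral) and /data
# (only when persistent storage is enabled), hence the /tmp cache directory.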

# ------------------------- helpers -------------------------
def save_tmp_audio(audio_bytes: bytes, cache_dir: str) -> str:
    """Write audio bytes to a unique .wav file under cache_dir and return its path."""
    Path(cache_dir).mkdir(parents=True, exist_ok=True)
    with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False, suffix=".wav") as f:
        f.write(audio_bytes)
    return f.name

def add_message(chatbot, history, mic, text):
    """Append the user's turn to both the UI chat and the model history.

    Text takes precedence when both a recording and text are provided.
    """
    if not mic and not text:
        return chatbot, history, "Input is empty"

    if text:
        chatbot.append({"role": "user", "content": text})
        history.append({"role": "human", "content": text})
    elif mic and Path(mic).exists():
        chatbot.append({"role": "user", "content": {"path": mic}})
        history.append({"role": "human", "content": [{"type": "audio", "audio": mic}]})
    return chatbot, history, None
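# Illustrative shape of `history` after one audio turn and a finished reply
# (field names mirror the usage above; the exact schema is defined by the
# stepaudio2 package):
# [
#   {"role": "system", "content": "..."},
#   {"role": "human", "content": [{"type": "audio", "audio": "/tmp/in.wav"}]},
#   {"role": "assistant",
#    "content": [{"type": "text", "text": "<tts_start>"},
#                {"type": "token", "token": [...]}],
#    "eot": True},
# ]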

def reset_state(system_prompt):
    """Clear the chat and restart the history from the current system prompt."""
    return [], [{"role": "system", "content": system_prompt}]

# ------------------------- globals -------------------------
# Instantiate once at import time, on CPU: under ZeroGPU no GPU is attached
# yet, so weights move to CUDA inside the @spaces.GPU-decorated call below.
AUDIO_MODEL = StepAudio2(MODEL_PATH)
TOKEN2WAV = Token2wav(f"{MODEL_PATH}/token2wav")

@spaces.GPU(duration=120)  # GPU is attached only for this call; a no-op outside ZeroGPU
def gpu_predict(chatbot, history):
    try:
        # Move weights to CUDA only while the GPU is attached.
        try:
            if hasattr(AUDIO_MODEL, "to"):
                AUDIO_MODEL.to("cuda")
            if hasattr(TOKEN2WAV, "to"):
                TOKEN2WAV.to("cuda")
        except Exception:
            pass

        history.append({"role": "assistant", "content": [{"type": "text", "text": "<tts_start>"}], "eot": False})

        tokens, text, audio_tokens = AUDIO_MODEL(
            history,
            max_new_tokens=4096,
            temperature=0.7,
            repetition_penalty=1.05,
            do_sample=True,
        )
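        # Assumed per the upstream Step-Audio-2 examples: the call returns the
        # generated token ids, the decoded reply text, and the discrete speech
        # tokens consumed by the vocoder; check stepaudio2 for the exact contract.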

        audio_bytes = TOKEN2WAV(audio_tokens, PROMPT_WAV)
        audio_path = save_tmp_audio(audio_bytes, CACHE_DIR)

        chatbot.append({"role": "assistant", "content": {"path": audio_path}})
        history[-1]["content"].append({"type": "token", "token": tokens})
        history[-1]["eot"] = True

    except Exception:
        traceback.print_exc()
        # Drop the unfinished assistant stub so a failed turn does not
        # poison the history carried into the next request.
        if history and history[-1].get("role") == "assistant" and not history[-1].get("eot", True):
            history.pop()
        gr.Warning("Something went wrong, please try again.")
    return chatbot, history

def build_demo():
    with gr.Blocks(delete_cache=(86400, 86400)) as demo:
        gr.Markdown("<center><font size=8>Step Audio 2 Demo</center>")

        with gr.Row():
            system_prompt = gr.Textbox(
                label="System Prompt",
                # English gist of the Chinese default prompt below: "Your name
                # is Xiaoyue, a speech large model trained by StepFun. You are
                # emotionally perceptive and observant, good at analyzing what
                # the user says and replying with empathy, always attentive to
                # the user's feelings. Today is Friday, August 29, 2025. Please
                # talk with the user in the default female voice."
                value=(
                    "ไฝ ็š„ๅๅญ—ๅซๅšๅฐ่ทƒ๏ผŒๆ˜ฏ็”ฑ้˜ถ่ทƒๆ˜Ÿ่พฐๅ…ฌๅธ่ฎญ็ปƒๅ‡บๆฅ็š„่ฏญ้Ÿณๅคงๆจกๅž‹ใ€‚\n"
                    "ไฝ ๆƒ…ๆ„Ÿ็ป†่…ป๏ผŒ่ง‚ๅฏŸ่ƒฝๅŠ›ๅผบ๏ผŒๆ“…้•ฟๅˆ†ๆž็”จๆˆท็š„ๅ†…ๅฎน๏ผŒๅนถไฝœๅ‡บๅ–„่งฃไบบๆ„็š„ๅ›žๅค๏ผŒ"
                    "่ฏด่ฏ็š„่ฟ‡็จ‹ไธญๆ—ถๅˆปๆณจๆ„็”จๆˆท็š„ๆ„Ÿๅ—๏ผŒๅฏŒๆœ‰ๅŒ็†ๅฟƒ๏ผŒๆไพ›ๅคšๆ ท็š„ๆƒ…็ปชไปทๅ€ผใ€‚\n"
                    "ไปŠๅคฉๆ˜ฏ2025ๅนด8ๆœˆ29ๆ—ฅ๏ผŒๆ˜ŸๆœŸไบ”\n"
                    "่ฏท็”จ้ป˜่ฎคๅฅณๅฃฐไธŽ็”จๆˆทไบคๆตใ€‚"
                ),
                lines=2,
            )

        chatbot = gr.Chatbot(elem_id="chatbot", min_height=800, type="messages")
        # Seed the history with the initial prompt; edits to the System Prompt
        # box take effect after Clear History, which re-reads the textbox.
        history = gr.State([{"role": "system", "content": system_prompt.value}])

        mic = gr.Audio(type="filepath", label="๐ŸŽ™๏ธ Microphone input (optional)")
        text = gr.Textbox(placeholder="Enter message ...", label="๐Ÿ’ฌ Text input")

        with gr.Row():
            clean_btn = gr.Button("🧹 Clear History")
            regen_btn = gr.Button("🤔 Regenerate")
            submit_btn = gr.Button("๐Ÿš€ Submit")

        def on_submit(chatbot, history, mic, text):
            chatbot, history, error = add_message(chatbot, history, mic, text)
            if error:
                gr.Warning(error)
                return chatbot, history, None, None
            chatbot, history = gpu_predict(chatbot, history)
            return chatbot, history, None, None

        submit_btn.click(
            fn=on_submit,
            inputs=[chatbot, history, mic, text],
            outputs=[chatbot, history, mic, text],
            concurrency_limit=4,          # at most 4 GPU calls in flight at once
            concurrency_id="gpu_queue",   # shared with Regenerate so both respect the limit
        )

        clean_btn.click(
            fn=reset_state,
            inputs=[system_prompt],
            outputs=[chatbot, history],
        )

        def regenerate(chatbot, history):
            # Drop the trailing assistant turn(s), then decode the reply again.
            while chatbot and chatbot[-1]["role"] == "assistant":
                chatbot.pop()
            while history and history[-1]["role"] == "assistant":
                history.pop()
            if len(history) <= 1:  # only the system prompt is left; nothing to redo
                gr.Warning("Nothing to regenerate yet.")
                return chatbot, history
            return gpu_predict(chatbot, history)

        regen_btn.click(
            regenerate,
            [chatbot, history],
            [chatbot, history],
            concurrency_id="gpu_queue",
        )
    return demo

# Spaces runs this file; just build and launch with defaults (no ports/names).
if __name__ == "__main__":
    demo = build_demo()
    demo.queue().launch()  # no args; Spaces supplies host and port
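
# For local runs outside Spaces, the address can be pinned with standard
# Gradio launch kwargs (best left unset on Spaces):
#   demo.queue().launch(server_name="0.0.0.0", server_port=7860)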