# Hugging Face Spaces app — Step Audio 2 demo.
# (Removed non-Python scrape residue: page header, runtime status, file size,
#  commit hashes, and gutter line numbers that were not part of the source.)
import os
import tempfile
import traceback
from pathlib import Path
import gradio as gr
import spaces # required for ZeroGPU
# ---- Your model libs (ensure these are available in the repo or pip) ----
from stepaudio2 import StepAudio2
from token2wav import Token2wav
# ------------------------- constants -------------------------
MODEL_PATH = "stepfun-ai/Step-Audio-2-mini"  # model repo id/path handed to StepAudio2; presumably resolved by the lib — TODO confirm
PROMPT_WAV = "assets/default_female.wav"  # reference voice prompt fed to Token2wav for vocoding
CACHE_DIR = "/tmp/stepaudio2"  # writable scratch dir for generated WAV files
# Ensure Gradio uses a writable temp dir on Spaces
os.environ["GRADIO_TEMP_DIR"] = CACHE_DIR
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
# ------------------------- helpers -------------------------
def save_tmp_audio(audio_bytes: bytes, cache_dir: str) -> str:
    """Persist raw WAV bytes to a uniquely named file under *cache_dir*.

    The directory is created if missing; the file is kept (delete=False) so
    Gradio can serve it later. Returns the absolute file path.
    """
    target = Path(cache_dir)
    target.mkdir(parents=True, exist_ok=True)
    handle = tempfile.NamedTemporaryFile(dir=str(target), delete=False, suffix=".wav")
    with handle as out:
        out.write(audio_bytes)
    return handle.name
def add_message(chatbot, history, mic, text):
    """Append the user's turn (text or recorded audio) to the UI and model state.

    Args:
        chatbot: Gradio message list (display schema); mutated in place.
        history: model-side message list ("human" role schema); mutated in place.
        mic: optional filepath of a recorded audio clip.
        text: optional typed message; takes priority over mic when both given.

    Returns:
        (chatbot, history, error) — error is a user-facing message string, or
        None on success.
    """
    if not mic and not text:
        return chatbot, history, "Input is empty"
    if text:
        chatbot.append({"role": "user", "content": text})
        history.append({"role": "human", "content": text})
    elif mic:
        if not Path(mic).exists():
            # Bug fix: this case previously fell through silently — nothing was
            # appended and no error was reported. Surface it to the user.
            return chatbot, history, "Audio file not found"
        chatbot.append({"role": "user", "content": {"path": mic}})
        history.append({"role": "human", "content": [{"type": "audio", "audio": mic}]})
    return chatbot, history, None
def reset_state(system_prompt):
    """Clear the conversation: empty chat display plus a history seeded only with the system prompt."""
    fresh_history = [{"role": "system", "content": system_prompt}]
    return [], fresh_history
# ------------------------- globals -------------------------
# Both models are instantiated once at import time on CPU; gpu_predict() moves
# them to CUDA only while a ZeroGPU slice is attached to the call.
AUDIO_MODEL = StepAudio2(MODEL_PATH) # load on CPU
TOKEN2WAV = Token2wav(f"{MODEL_PATH}/token2wav") # load on CPU
@spaces.GPU(duration=120)  # GPU attached only for this call; no-op outside ZeroGPU
def gpu_predict(chatbot, history):
    """Run one assistant turn: generate tokens, vocode to WAV, update UI state.

    Mutates and returns (chatbot, history). Any failure is logged and turned
    into a gr.Warning instead of crashing the app, leaving state unchanged
    past the point of failure.
    """
    global AUDIO_MODEL, TOKEN2WAV
    try:
        # Move to CUDA only when GPU is attached
        try:
            if hasattr(AUDIO_MODEL, "to"):
                AUDIO_MODEL.to("cuda")
            if hasattr(TOKEN2WAV, "to"):
                TOKEN2WAV.to("cuda")
        except Exception:
            # Best-effort device move: if it fails the models stay on CPU.
            pass
        # Open the assistant turn with the TTS start sentinel; eot=False marks it unfinished.
        history.append({"role": "assistant", "content": [{"type": "text", "text": "<tts_start>"}], "eot": False})
        tokens, text, audio_tokens = AUDIO_MODEL(
            history,
            max_new_tokens=4096,
            temperature=0.7,
            repetition_penalty=1.05,
            do_sample=True,
        )
        # Vocode the audio tokens with the default voice prompt and cache the WAV to disk.
        audio_bytes = TOKEN2WAV(audio_tokens, PROMPT_WAV)
        audio_path = save_tmp_audio(audio_bytes, CACHE_DIR)
        chatbot.append({"role": "assistant", "content": {"path": audio_path}})
        # Record the generated tokens on the open assistant turn and close it.
        history[-1]["content"].append({"type": "token", "token": tokens})
        history[-1]["eot"] = True
    except Exception:
        print(traceback.format_exc())
        gr.Warning("Some error happened, please try again.")
    return chatbot, history
def build_demo():
    """Build and return the Gradio Blocks UI for the Step Audio 2 demo.

    Three actions are wired up, with Submit and Regenerate sharing one
    concurrency queue ("gpu_queue") so GPU calls are serialized:
      * Submit     — append the user turn, then run gpu_predict.
      * Clear      — reset chatbot/history to just the system prompt.
      * Regenerate — drop trailing assistant turns and re-run gpu_predict.
    """
    with gr.Blocks(delete_cache=(86400, 86400)) as demo:
        gr.Markdown("<center><font size=8>Step Audio 2 Demo</center>")
        with gr.Row():
            # NOTE(review): the prompt text and button/label strings below were
            # mojibake-corrupted in the original file (UTF-8 Chinese/emoji bytes
            # mis-decoded); restored from the official Step-Audio-2 demo text —
            # confirm exact wording and emoji against upstream.
            system_prompt = gr.Textbox(
                label="System Prompt",
                value=(
                    "你的名字叫小跃，是由阶跃星辰公司训练出来的语音大模型。\n"
                    "你情感细腻，观察能力强，擅长分析用户的内容，并作出善解人意的回复，"
                    "说话的过程中时刻注意用户的情感，富有同理心，提供多样的情绪价值。\n"
                    "今天是2025年8月29日，星期五\n"
                    "请用默认女声与用户交流。"
                ),
                lines=2,
            )
        chatbot = gr.Chatbot(elem_id="chatbot", min_height=800, type="messages")
        # Model-side conversation state, seeded with the initial system prompt.
        history = gr.State([{"role": "system", "content": system_prompt.value}])
        mic = gr.Audio(type="filepath", label="🎙️ Microphone input (optional)")
        text = gr.Textbox(placeholder="Enter message ...", label="💬 Text input")
        with gr.Row():
            clean_btn = gr.Button("🧹 Clear History (清除历史)")
            regen_btn = gr.Button("🔄 Regenerate (重试)")
            submit_btn = gr.Button("🚀 Submit")

        def on_submit(chatbot, history, mic, text):
            # Validate and append the user turn before spending GPU time.
            chatbot, history, error = add_message(chatbot, history, mic, text)
            if error:
                gr.Warning(error)
                return chatbot, history, None, None
            chatbot, history = gpu_predict(chatbot, history)
            # Trailing Nones clear the mic and text inputs after each turn.
            return chatbot, history, None, None

        submit_btn.click(
            fn=on_submit,
            inputs=[chatbot, history, mic, text],
            outputs=[chatbot, history, mic, text],
            concurrency_limit=4,
            concurrency_id="gpu_queue",
        )
        clean_btn.click(
            fn=reset_state,
            inputs=[system_prompt],
            outputs=[chatbot, history],
        )

        def regenerate(chatbot, history):
            # Strip trailing assistant turns so the model re-answers the last user turn.
            while chatbot and chatbot[-1]["role"] == "assistant":
                chatbot.pop()
            while history and history[-1]["role"] == "assistant":
                history.pop()
            return gpu_predict(chatbot, history)

        regen_btn.click(
            regenerate,
            [chatbot, history],
            [chatbot, history],
            concurrency_id="gpu_queue",
        )
    return demo
# Spaces executes this file directly; rely on Gradio's defaults for host/port.
if __name__ == "__main__":
    demo = build_demo()
    app = demo.queue()
    app.launch()